In [1]:
import pandas as pd
import ast
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df= pd.read_csv("//content/drive/MyDrive/Colab Notebooks/ML_team/movies_metadata.csv", low_memory=False)
print(df.head())

   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844  tt0113497         

In [4]:
# overview에서 Null 값을 가진 경우에는 값 제거
df = df.dropna(subset=['overview'])

In [5]:
# 각 장르를 독립적인 카테고리로 간주하고, one-hot encoding 수행
df_genres = df['genres'].apply(lambda x: ['genre_'+d['name'] for d in ast.literal_eval(x)])
df = df.join(pd.get_dummies(df_genres.apply(pd.Series).stack()).sum(level=0))

  df = df.join(pd.get_dummies(df_genres.apply(pd.Series).stack()).sum(level=0))
  df = df.join(pd.get_dummies(df_genres.apply(pd.Series).stack()).sum(level=0))


In [6]:
# 장르 칼럼 선택
genres_columns = df.columns[df.columns.str.contains('genre_')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44512 entries, 0 to 45465
Data columns (total 56 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   adult                                        44512 non-null  object 
 1   belongs_to_collection                        4435 non-null   object 
 2   budget                                       44512 non-null  object 
 3   genres                                       44512 non-null  object 
 4   homepage                                     7765 non-null   object 
 5   id                                           44512 non-null  object 
 6   imdb_id                                      44497 non-null  object 
 7   original_language                            44502 non-null  object 
 8   original_title                               44512 non-null  object 
 9   overview                                     44512 non-null  object 
 10

In [7]:
# release_date를 datetime 형식으로 변환합니다.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

In [8]:
# 2000년대 이상의 영화만 선택
recent_movies = df[df['release_date'].dt.year >= 2000].head(500)

In [9]:
# 사용자 입력 받기
# user_input = input("영화를 찾고 싶은 단어를 입력하세요: ")
user_input = "Park"

In [10]:
# TF-IDF 벡터화를 수행합니다.
transformer = TfidfVectorizer()
tfidf_matrix = transformer.fit_transform(recent_movies['overview'].tolist() + [user_input])
print(tfidf_matrix.shape)


(501, 6620)


In [11]:
#사용자 입력에 대한 TF-IDF 변환
user_input_tfidf = tfidf_matrix[-1]

In [12]:
# 영화 개요에 대한 TF-IDF 변환
tfidf_matrix = tfidf_matrix[:-1]

In [13]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape)

(500, 500)


In [14]:
# Tokenizer를 사용하여 한국어 문장을 단어 인덱스로 변환합니다.
max_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(recent_movies['overview'])
total_words = len(tokenizer.word_index) + 1

In [15]:
#영화 overviews를 시퀀스로 변환합니다.
sequences = tokenizer.texts_to_sequences(recent_movies['overview'])
X = pad_sequences(sequences, maxlen=max_length)

In [37]:
# CNN 모델 정의
embedding_dim = 50
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=embedding_dim, input_length=max_length))
model.add(Conv1D(128, 10, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(len(genres_columns), activation='sigmoid'))

In [38]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [39]:
# 학습, 검증 데이터 나누기
train_indices, val_indices = train_test_split(df.index, test_size=0.2, random_state=42)

In [40]:
# train_indices를 희소 행렬의 크기에 맞게 수정
train_indices = train_indices[train_indices < tfidf_matrix.shape[0]]

In [41]:
# 학습 데이터로 모델 학습
train_cosine_sim = cosine_similarity(tfidf_matrix[train_indices], tfidf_matrix[train_indices])

In [42]:
# val_indices를 희소 행렬의 크기에 맞게 수정
val_indices = val_indices[val_indices < tfidf_matrix.shape[0]]

In [43]:
# 검증 데이터로 예측
val_cosine_sim = cosine_similarity(tfidf_matrix[val_indices], tfidf_matrix[train_indices])

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, recent_movies[genres_columns], test_size=0.2, random_state=42)


In [45]:
y_train_df = pd.DataFrame(y_train)
y_test_df = pd.DataFrame(y_test)

y_train_filled = y_train_df.fillna(0).values
y_test_filled = y_test_df.fillna(0).values

y_train = y_train_filled
y_test = y_test_filled

In [46]:
#CNN 모델 학습
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79f61a105180>

In [47]:
from tensorflow.keras.losses import binary_crossentropy
import tensorflow as tf

y_pred = model.predict(X_test)

print(y_pred)
loss = binary_crossentropy(y_test, y_pred)
loss_result = tf.reduce_mean(loss).numpy()

print("Calculated loss: ", loss_result)


[[0.58053094 0.393728   0.03784676 ... 0.01191014 0.00906633 0.01003568]
 [0.5933087  0.3811358  0.03298009 ... 0.00963468 0.00725526 0.00822315]
 [0.6040073  0.37504482 0.03141655 ... 0.00862802 0.00624819 0.00774136]
 ...
 [0.57545525 0.39135757 0.04618835 ... 0.01565653 0.01191057 0.0136722 ]
 [0.58814067 0.3852801  0.06764209 ... 0.02633824 0.02425611 0.02256702]
 [0.5722622  0.39189968 0.04718969 ... 0.01635393 0.01256462 0.01411913]]
Calculated loss:  0.22782846


In [48]:
#TF-IDF 기반 추천 함수 정의
def recommend_movies(user_input):
    global cosine_sim_user_input
    # 사용자 입력을 토큰화하여 시퀀스로 변환
    user_input_sequence = tokenizer.texts_to_sequences([user_input])
    user_input_padded = pad_sequences(user_input_sequence, maxlen=max_length)

    # 사용자 입력에 대한 TF-IDF 변환
    user_input_tfidf = transformer.transform([user_input])
    # nan이 아닌 단어에 대해서만 추천 수행
    if not user_input_tfidf.nnz:
        print("입력한 단어로는 추천할 수 없습니다.")
        return
    cosine_sim_user_input = cosine_similarity(user_input_tfidf, tfidf_matrix).flatten()
    top_similar_movies_indices = cosine_sim_user_input.argsort()[-5:][::-1]

    #상위 5개의 유사한 영화 출력
    recommended_movies = recent_movies.iloc[top_similar_movies_indices, recent_movies.columns.get_loc('original_title')]
    print("입력한 단어와 유사한 영화 Top 5:")
    for i, movie_title in enumerate(recommended_movies):
        print(f"{i + 1}. {movie_title}")

In [49]:
#TF-IDF 기반 추천 실행
recommend_movies(user_input)

입력한 단어와 유사한 영화 Top 5:
1. Prince of Central Park
2. Black Knight
3. Joe Dirt
4. The Independent
5. Requiem for a Dream


In [50]:
#CNN 기반 추천
user_input_sequence = tokenizer.texts_to_sequences([user_input])
print("user_input_sequence:", user_input_sequence)
user_input_padded = pad_sequences(user_input_sequence, maxlen=max_length)
print("user_input_padded:", user_input_padded)
user_similarity = model.predict(user_input_padded).mean(axis=0)
print("user_similarity:", user_similarity)

user_input_sequence: [[1445]]
user_input_padded: [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0 1445]]
user_similarity: [0.54444885 0.391947   0.0884507  0.0441773  0.03690684 0.07735203
 0.44045424 0.2809626  0.12847957 0.46260566 0.13379312 0.11884168
 0.07433356 0.05665328 0.4746132  0.09686777 0.03364377 0.05886022
 0.17118892 0.04299654 0.02287718 0.01459227 0.3959989  0.09025907
 0.0304838  0.15688618 0.03186056 0.08392624 0.15672195 0.04242572
 0.03675268 0.03627064]


In [51]:
average_cosine_sim_user_input = np.array([np.mean(cosine_sim_user_input)])
print("average_cosine_sim_user_input shape:", average_cosine_sim_user_input.shape)

average_cosine_sim_user_input shape: (1,)


In [52]:
# 기존 코사인 유사도와 CNN 기반 유사도를 조합하여 최종 추천
final_similarity = 0.3 * user_similarity + 0.7 * average_cosine_sim_user_input
print("user_similarity shape:", user_similarity.shape)
print("average_cosine_sim_user_input shape:", average_cosine_sim_user_input.shape)

user_similarity shape: (32,)
average_cosine_sim_user_input shape: (1,)


In [53]:
user_similarity.shape

(32,)

In [54]:
cosine_sim_user_input.shape

(500,)

In [55]:
# 상위 5개의 유사한 영화 찾기
top_similar_movies_indices = final_similarity.argsort()[-5:][::-1]

In [56]:
top_similar_movies_indices

array([ 0, 14,  9,  6, 22])

In [57]:
# 상위 5개의 유사한 영화 출력
recommended_movies = recent_movies.iloc[top_similar_movies_indices]
print("입력한 단어와 유사한 영화 Top 5:")
for i, movie_title in enumerate(recommended_movies['original_title']):
    print(f"{i + 1}. {movie_title}")

입력한 단어와 유사한 영화 Top 5:
1. Due Amici
2. Isn't She Great
3. My Dog Skip
4. Romance
5. Boiler Room
