In [3]:
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
import os
import random
import numpy as np
import sys
sys.path.append("../../TextModel/")
from utils import *
from torch.utils.data import Dataset, DataLoader
from sklearn.cluster import DBSCAN
from sklearn.decomposition import KernelPCA
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import silhouette_score
import warnings
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from surprise import SVD, NMF, Dataset, Reader
warnings.filterwarnings("ignore")

# Relation 1: is_similar_storyline
- storyline에 대한 text embedding을 구한다.
- text embedding간의 cosine similarity를 구한다.
- 유사도가 threshold 이상이 되는 text pair만을 '유사하다'고 정의한다.

In [5]:
# Movies: keep only rows that have an overview, then renumber the rows from 0
# (reset_index() also keeps the previous row labels in an 'index' column).
movies = (
    pd.read_csv("../../Data/TMDB/tmdb_5000_movies.csv")
    .dropna(subset=['overview'])
    .reset_index()
)
# Pre-computed text embeddings loaded from disk.
# NOTE(review): presumably one row per row of `movies`, in the same order — confirm.
emb = np.load("./text_embedding/text_embedding.npy")

# Credits: rename the key column to 'id' and attach each movie's original_title.
# The inner join drops credits whose movie id is not present in `movies`.
tmp = movies[['id', 'original_title']]
credits = (
    pd.read_csv("../../Data/TMDB/tmdb_5000_credits.csv")
    .rename(columns={'movie_id': 'id'})
    .merge(tmp, on='id', how='inner')
)

In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Pairwise cosine similarity between all embedding rows.
cos_sim = cosine_similarity(emb)
result_dict = defaultdict(list)
threshold = 0.85

# For each row i, collect every later row j (j > i) whose similarity reaches the
# threshold. The mask is restricted to the strict upper triangle so each unordered
# pair appears once and self-pairs are excluded. np.nonzero walks the mask in
# row-major order, so keys and neighbour lists come out in ascending index order,
# exactly as a nested i/j loop would produce them, without millions of Python-level
# iterations.
pair_rows, pair_cols = np.nonzero(np.triu(cos_sim >= threshold, k=1))
for i, j, sim in zip(pair_rows.tolist(), pair_cols.tolist(), cos_sim[pair_rows, pair_cols]):
    result_dict[i].append((j, sim))

In [7]:
# Flatten result_dict into rows of
# [original title, similar movie's original title, similarity, display title].
result_list = []

for movie_idx, neighbours in result_dict.items():
    source_movie = movies.iloc[movie_idx]
    result_list.extend(
        [
            source_movie['original_title'],
            movies.iloc[sim_movie_idx]['original_title'],
            sim,
            source_movie['title'],
        ]
        for sim_movie_idx, sim in neighbours
    )

In [8]:
# Tabulate the collected pairs; the column order matches the layout of each row in result_list.
text_sim_df = pd.DataFrame(
    data=result_list,
    columns=['original_title', 'similar_movie_title', 'cosine_similarity', 'title'],
)

In [9]:
# Total number of (movie, similar movie) rows in result_list.
len(result_list)

88069

In [10]:
# Number of keys in result_dict, i.e. movies with at least one recorded similar movie.
len(result_dict)

2753

In [11]:
# Peek at the first row of the similarity table.
text_sim_df.head(1)

Unnamed: 0,original_title,similar_movie_title,cosine_similarity,title
0,Pirates of the Caribbean: At World's End,Penguins of Madagascar,0.861668,Pirates of the Caribbean: At World's End


# Relation 2: cluster_group_of
- storyline에 대한 text embedding을 구한다.
- text embedding들을 t-SNE로 3차원으로 축소한다.
- 차원 축소된 vector들을 k-means 알고리즘으로 클러스터링한다.

In [14]:
# Reduce the text embeddings to 3 dimensions with t-SNE.
# The seed is fixed because t-SNE is stochastic: without random_state every run
# produces a different layout, and therefore different k-means clusters downstream.
tsne = TSNE(n_components=3, random_state=0)
tsne_embeddings = tsne.fit_transform(emb)

In [None]:
from collections import Counter

n = 30

# Fit k-means on the reduced embeddings, then assign every point to a cluster.
kmeans = KMeans(n_clusters=n, random_state=0)
kmeans.fit(tsne_embeddings)
clusters = kmeans.predict(tsne_embeddings)

print(f"number of generarated clusters: {len(set(clusters))}")

# 3-D scatter of the reduced embeddings coloured by cluster id; saved as '<n>.png'.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    tsne_embeddings[:, 0],
    tsne_embeddings[:, 1],
    tsne_embeddings[:, 2],
    c=clusters,
    cmap='tab20',
)
ax.set_title(f'n_clusters={n} Clustering Result')
fig.savefig(f'./{n}.png')

# Cluster sizes, printed from smallest to largest (ties broken by cluster id).
cnt = Counter(clusters)
val = cnt.values()
print(f'n_clusters={n} Clustering Result:')
print(dict(sorted(cnt.items(), key=lambda x: (x[1], x[0]))))

# Balance statistics: largest / smallest / floor-mean cluster size, and their
# spread relative to the mean. The variable names are printed by the `=` f-string
# specifier below, so they are part of the output.
max_, min_ = max(val), min(val)
avg_ = sum(val) // len(val)
ratio = round(abs(max_ - min_) / avg_, 3)
s_score = silhouette_score(tsne_embeddings, clusters)
print(f"{max_=}   {min_=}   {avg_=}   {ratio=}   {s_score=}")
print("================================")
fig.show()

In [None]:
# Put the cluster labels next to each movie's titles and overview.
# pd.concat(axis=1) aligns the two frames on their row index.
clusters_df = pd.DataFrame({'cluster_id': clusters})
movie_text_columns = movies[['original_title', 'overview', 'title']]
cluster_data = pd.concat([movie_text_columns, clusters_df], axis=1)

In [19]:
# Load a saved clustering result from disk. This rebinds `cluster_data`, replacing
# any frame with that name built earlier in the session.
cluster_data = pd.read_csv("./cluster_result/cluster_data.csv")

# Relation 3: is_director
- tmdb_credits에서 json parsing을 통해 'Director' 획득

In [20]:
from collections import defaultdict
import json

movietitle_director_dict = defaultdict(list)
movietitle_director_dict_ = defaultdict(list)

# For every credits row, decode the JSON `crew` column, keep the names of crew
# members whose job is 'Director', and store that same list under both the
# original title and the display title.
for _, credit_row in credits.iterrows():
    directors = [
        member['name']
        for member in json.loads(credit_row.crew)
        if member['job'] == 'Director'
    ]
    movietitle_director_dict[credit_row.original_title] = directors
    movietitle_director_dict_[credit_row.title] = directors

In [21]:
# One row per (movie, director).
# The display title is looked up per original_title from `credits` instead of
# zipping the two dicts positionally: they are keyed by different columns, so as
# soon as two movies share one kind of title but not the other, the dicts fall out
# of step (every later original_title gets another movie's title) and zip()
# silently truncates to the shorter one.
title_by_original = (
    credits.drop_duplicates('original_title')
    .set_index('original_title')['title']
    .to_dict()
)
movietitle_director_df = pd.DataFrame(
    [
        (original_title, director, title_by_original[original_title])
        for original_title, directors in movietitle_director_dict.items()
        for director in directors
    ],
    columns=['original_title', 'director_name', 'title'],
)

# 영화 제목 전처리
- 영화 제목을 전처리한다.
- 이를 통해 MovieLens 데이터와 merge될 수 있도록 한다.

In [37]:
def tmdb_preprocess_title(df:pd.DataFrame):
    """Normalise TMDB movie titles so the table can be merged on ``title``.

    Two input layouts are handled:

    * Tables with an ``original_title`` column (a ``title`` column is also
      required): both are lower-cased, ``title`` is dropped and
      ``original_title`` is renamed to ``title``. A ``similar_movie_title``
      column, if present, is lower-cased and renamed to ``similar_title``.
    * Tables without ``original_title`` but with ``cosine_similarity``: the
      ``movie_title`` column is lower-cased, de-duplicated and renamed to
      ``title``.

    Tables matching neither layout are returned untouched.

    In the first layout duplicates are removed per relation row rather than per
    title: the key is ``title`` plus ``similar_movie_title`` and/or
    ``director_name`` when those columns exist. De-duplicating on ``title``
    alone would keep a single similar movie (or a single director) per movie and
    discard the rest of a one-to-many relation. Tables with neither extra column
    are still reduced to one row per title.

    Args:
        df: Table to normalise. It is modified in place.

    Returns:
        The same ``df`` object, after modification.
    """
    if 'original_title' in df.columns:
        df['title'] = df['title'].str.lower()
        df['original_title'] = df['original_title'].str.lower()

        # For this one movie the display title overrides original_title before
        # original_title becomes the merge key.
        df.loc[df['title'] == 'america is still the place', 'original_title'] = 'america is still the place'
        df.drop(columns='title', inplace=True)
        df.rename(columns={'original_title': 'title'}, inplace=True)

        # Build the de-duplication key from the columns that identify one relation row.
        dedup_keys = ['title']
        has_similar_title = 'similar_movie_title' in df.columns
        if has_similar_title:
            df['similar_movie_title'] = df['similar_movie_title'].str.lower()
            dedup_keys.append('similar_movie_title')
        if 'director_name' in df.columns:
            dedup_keys.append('director_name')

        # Movies that share a title collapse onto the same key; keep the first
        # occurrence of each key.
        df.drop_duplicates(dedup_keys, keep='first', inplace=True)

        if has_similar_title:
            df.rename(columns={'similar_movie_title': 'similar_title'}, inplace=True)
    elif 'cosine_similarity' in df.columns:
        # NOTE(review): the calls visible next to this definition all seem to pass
        # tables with 'original_title', so the full schema of this layout is unknown;
        # its per-movie_title de-duplication is kept as it was — confirm it is intended.
        df['movie_title'] = df['movie_title'].str.lower()
        df.drop_duplicates(['movie_title'], keep='first', inplace=True)
        df.rename(columns={'movie_title': 'title'}, inplace=True)
    return df

In [38]:
# Normalise the titles of all three relation tables, in the same order as before.
text_sim_df, cluster_data, movietitle_director_df = map(
    tmdb_preprocess_title,
    (text_sim_df, cluster_data, movietitle_director_df),
)

In [40]:
# Write each preprocessed relation table to its CSV file, without the row index.
for relation_df, csv_path in (
    (text_sim_df, "./preprocess_phase_0/similarity.csv"),
    (cluster_data, "./preprocess_phase_0/cluster.csv"),
    (movietitle_director_df, "./preprocess_phase_0/director.csv"),
):
    relation_df.to_csv(csv_path, index=False)

# NMF 기반 예측

In [None]:
ratings = pd.read_csv("../../Data/MovieLens/ratings.csv")
# Wrap the ratings table in a surprise dataset (ratings declared on a 1-5 scale).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

# Train the NMF model on every available rating (no hold-out split).
# The seed is fixed so the learned factors, and the similarity relations derived
# from them, are reproducible across runs.
model = NMF(n_factors=50, n_epochs=30, biased=False, random_state=0)
trainset = data.build_full_trainset()
model.fit(trainset)

# Extract one user's embedding.
# Rows of model.pu are indexed by surprise's inner ids, which are assigned when the
# trainset is built and are not the raw ids from ratings.csv, so the raw id has to
# be translated first. to_inner_uid raises ValueError for an unknown user.
user_id = 1
user_embedding = model.pu[trainset.to_inner_uid(user_id)]

# Relation 4: is_similar_user
- surprise 라이브러리의 NMF 기반 평점 예측
- user embedding 구함
- 유사도가 threshold 이상이 되는 user pair만을 '유사하다'고 정의한다.

In [None]:
user_emb = model.pu

# Pairwise cosine similarity between all user embedding rows.
cos_sim = cosine_similarity(user_emb)
result_dict = defaultdict(list)
threshold = 0.90

# Collect, for each row i, every later row j (j > i) whose similarity reaches the
# threshold. The strict upper triangle keeps each unordered pair once and skips
# self-pairs; np.nonzero returns the hits in row-major (ascending i, then j) order.
pair_rows, pair_cols = np.nonzero(np.triu(cos_sim >= threshold, k=1))
for i, j, sim in zip(pair_rows.tolist(), pair_cols.tolist(), cos_sim[pair_rows, pair_cols]):
    result_dict[i].append((j, sim))

# Row positions in model.pu are surprise inner ids, not the ids from the ratings
# file. Translate them back to raw user ids so the exported columns really contain
# user ids that can be joined with the original data.
result_list = [
    [trainset.to_raw_uid(idx), trainset.to_raw_uid(sim_idx), sim]
    for idx, value in result_dict.items()
    for sim_idx, sim in value
]

user_sim_df = pd.DataFrame(result_list, columns=['user_id', 'similar_user_id', 'cosine_similarity'])

# Relation 5: is_similar_item
- surprise 라이브러리의 NMF 기반 평점 예측
- item embedding 구함
- 유사도가 threshold 이상이 되는 item pair만을 '유사하다'고 정의한다.

In [None]:
item_emb = model.qi

# Pairwise cosine similarity between all item embedding rows.
cos_sim = cosine_similarity(item_emb)
result_dict = defaultdict(list)
threshold = 0.90

# Collect, for each row i, every later row j (j > i) whose similarity reaches the
# threshold. The strict upper triangle keeps each unordered pair once and skips
# self-pairs; np.nonzero returns the hits in row-major (ascending i, then j) order.
pair_rows, pair_cols = np.nonzero(np.triu(cos_sim >= threshold, k=1))
for i, j, sim in zip(pair_rows.tolist(), pair_cols.tolist(), cos_sim[pair_rows, pair_cols]):
    result_dict[i].append((j, sim))

# Row positions in model.qi are surprise inner ids, not the movie ids from the
# ratings file. Translate them back to raw item ids so the exported columns can be
# joined with the original data.
result_list = [
    [trainset.to_raw_iid(idx), trainset.to_raw_iid(sim_idx), sim]
    for idx, value in result_dict.items()
    for sim_idx, sim in value
]

item_sim_df = pd.DataFrame(result_list, columns=['item_id', 'similar_item_id', 'cosine_similarity'])

In [None]:
# Write the user and item similarity tables to CSV, without the row index.
for similarity_df, csv_path in (
    (user_sim_df, "./preprocess_phase_0/user_sim.csv"),
    (item_sim_df, "./preprocess_phase_0/item_sim.csv"),
):
    similarity_df.to_csv(csv_path, index=False)