In [2]:
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
import os
import random
import numpy as np
import sys
sys.path.append("../../../TextModel/")
from utils import *
from torch.utils.data import Dataset, DataLoader
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import KernelPCA
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings("ignore")

In [3]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
setSeeds()
batch_size = 16

In [4]:
def load_language_model(num_parameters:int=7, baseline:int=1):
    """
    baseline:
        BERT
    best:
        sentence-transformers/all-MiniLM-L6-v2
    second:
        sentence-transformers/paraphrase-MiniLM-L6-v2
    """
    if baseline:
        model = AutoModel.from_pretrained(f"bert-base-uncased")
    else:
        available_LlamaModel_parameters = [7, 13, 33, 65]
        assert num_parameters in available_LlamaModel_parameters, f"{num_parameters}B size model not exists"
        model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') # 제일 잘 됨
        
    
    return model.to(device)

def load_tokenizer(num_parameters:int=7, baseline:int=1):
    """
    baseline:
        BERT
    best:
        sentence-transformers/all-MiniLM-L6-v2
    second:
        sentence-transformers/paraphrase-MiniLM-L6-v2
    """
    if baseline:
        tokenizer = AutoTokenizer.from_pretrained(f"bert-base-uncased")
    else:
        available_LlamaModel_parameters = [7, 13, 33, 65]
        assert num_parameters in available_LlamaModel_parameters, f"{num_parameters}B size model not exists"
        tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') # 제일 잘 됨
    
    return tokenizer

def tokenize(text:str, tokenizer, max_length:int=120):
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    encoded_input = tokenizer.encode(
                text,
                add_special_tokens=True,
                return_tensors='pt', # 설정하면 (120) shape의 텐서로 저장함
                padding="max_length",
                max_length=max_length,
                truncation=True,
                )
    return encoded_input

In [6]:
language_model = load_language_model(baseline=0)
tokenizer = load_tokenizer(baseline=0)

Downloading (…)lve/main/config.json: 100%|██████████| 612/612 [00:00<00:00, 107kB/s]
Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:06<00:00, 13.7MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 95.3kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 651kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.49MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 178kB/s]


In [7]:
data = pd.read_csv("../../Data/TMDB/tmdb_5000_movies.csv")

In [9]:
data = data.dropna(subset=['overview'])

In [27]:
data['len_splited_overview'] = data.overview.apply(lambda x:len(x.split()))
sentence_lengths = list(data['len_splited_overview'])
margin_length = 10
max_seq_length = int(np.mean(sentence_lengths) + 2 * np.std(sentence_lengths)) + margin_length

In [28]:
data.loc[:, 'tokenized_overview'] = data.overview.apply(lambda x:tokenize(x, tokenizer, max_seq_length))

In [29]:
len(data.tokenized_overview)

4800

In [30]:
texts = torch.stack(list(data.tokenized_overview)).to(device)

In [32]:
texts = texts.squeeze(dim=1)

In [34]:
text_dataloader = DataLoader(texts, batch_size=batch_size)

In [36]:
emb = []

In [38]:
from tqdm import tqdm

for i, batch in tqdm(enumerate(text_dataloader)):
    t_emb = language_model(batch)
    t_emb = t_emb['last_hidden_state'][:, 0, :].cpu().detach().numpy()
    emb.append(t_emb)

300it [00:01, 197.86it/s]


In [41]:
emb = np.concatenate(emb, axis=0)

In [147]:
np.save('../text_embedding.npy', emb)

In [85]:
tsne = TSNE(n_components=3)
tsne_embeddings = tsne.fit_transform(emb)

In [None]:
n = 40

kmeans = KMeans(n_clusters=n, random_state=0)
kmeans.fit(tsne_embeddings)
clusters = kmeans.predict(tsne_embeddings)

print(f"number of generarated clusters: {len(set(clusters))}")

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1], tsne_embeddings[:, 2], c=clusters, cmap='tab20')
ax.set_title(f'n_clusters={n} Clustering Result')
fig.savefig(f'./tsne_cluster_results/{n}.png')

from collections import Counter
cnt = Counter(clusters)
val = cnt.values()
print(f'n_clusters={n} Clustering Result:')
print(dict(sorted(cnt.items(), key=lambda x:(x[1], x[0]))))
max_ = max(val)
min_ = min(val)
avg_ = sum(val) // len(val)
ratio = round(abs(max_ - min_) / avg_, 3)
s_score = silhouette_score(tsne_embeddings, clusters)
print(f"{max_=}   {min_=}   {avg_=}   {ratio=}   {s_score=}")
print("================================")

In [154]:
data = data.reset_index()

clusters_df = pd.DataFrame(clusters, columns=['cluster_id'])
cluster_data = pd.concat([data[['original_title', 'overview', 'title']], clusters_df], axis=1)

In [155]:
cluster_data.to_csv('../cluster_result/cluster_data.csv', index=False)

In [156]:
cluster_data[cluster_data.cluster_id == 10]

Unnamed: 0,original_title,overview,title,cluster_id
34,Monsters University,A look at the relationship between Mike and Su...,Monsters University,10
159,Spider-Man,After being bitten by a genetically altered sp...,Spider-Man,10
219,Charlie's Angels: Full Throttle,The Angels are charged with finding a pair of ...,Charlie's Angels: Full Throttle,10
381,The Nutcracker: The Untold Story,"Set in 1920's Vienna, this a tale of a little ...",The Nutcracker: The Untold Story,10
483,Timeline,A group of archaeological students become trap...,Timeline,10
...,...,...,...,...
4653,Butterfly,"Orson Welles, as judge Rauch, holds a lengthy ...",Butterfly,10
4712,Sound of My Voice,A journalist and his girlfriend get pulled in ...,Sound of My Voice,10
4721,Eraserhead,Henry Spencer tries to survive his industrial ...,Eraserhead,10
4764,"Run, Hide, Die",On the anniversary weekend of the death of a y...,"Run, Hide, Die",10


In [None]:
# for n in range(5, 400):
#     kmeans = KMeans(n_clusters=n, random_state=0)
#     kmeans.fit(tsne_embeddings)
#     clusters = kmeans.predict(tsne_embeddings)

#     print(f"number of generarated clusters: {len(set(clusters))}")

#     fig = plt.figure()
#     ax = fig.add_subplot(111, projection='3d')
#     ax.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1], tsne_embeddings[:, 2], c=clusters, cmap='tab20')
#     ax.set_title(f'n_clusters={n} Clustering Result')
#     fig.savefig(f'./tsne_cluster_results/{n}.png')

#     from collections import Counter
#     cnt = Counter(clusters)
#     val = cnt.values()
#     print(f'n_clusters={n} Clustering Result:')
#     print(dict(sorted(cnt.items(), key=lambda x:(x[1], x[0]))))
#     max_ = max(val)
#     min_ = min(val)
#     avg_ = sum(val) // len(val)
#     ratio = round(abs(max_ - min_) / avg_, 3)
#     s_score = silhouette_score(tsne_embeddings, clusters)
#     print(f"{max_=}   {min_=}   {avg_=}   {ratio=}   {s_score=}")
#     print("================================")
# fig.show()

In [139]:
# kpca = KernelPCA(n_components=3, kernel='rbf')
# embeddings_reduced = kpca.fit_transform(emb)

In [None]:
# for n in range(100, 400):
#     kmeans = KMeans(n_clusters=n, random_state=0)
#     kmeans.fit(embeddings_reduced)
#     clusters = kmeans.predict(embeddings_reduced)

#     print(f"number of generarated clusters: {len(set(clusters))}")

#     fig = plt.figure()
#     ax = fig.add_subplot(111, projection='3d')
#     ax.scatter(embeddings_reduced[:, 0], embeddings_reduced[:, 1], embeddings_reduced[:, 2], c=clusters, cmap='tab20')
#     ax.set_title(f'n_clusters={n} Clustering Result')
#     fig.savefig(f'./kpca_cluster_results/{n}.png')

#     from collections import Counter
#     cnt = Counter(clusters)
#     val = cnt.values()
#     print(f'n_clusters={n} Clustering Result:')
#     print(dict(sorted(cnt.items(), key=lambda x:(x[1], x[0]))))
#     max_ = max(val)
#     min_ = min(val)
#     avg_ = sum(val) // len(val)
#     ratio = round(abs(max_ - min_) / avg_, 3)
#     s_score = silhouette_score(tsne_embeddings, clusters)
#     print(f"{max_=}   {min_=}   {avg_=}   {ratio=}   {s_score=}")
#     print("================================")
# fig.show()