# Topic-modelling with UMAP and HDBSCAN

### Using Sentence-BERT (SBERT) to embed sentences

In [None]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import tqdm
from tqdm import trange
import random
import umap
import torch
import hdbscan
import hyperopt as hp
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from functools import partial
from hyperopt import space_eval
from sentence_transformers import SentenceTransformer

In [None]:
# Load the tab-separated comment file and discard every row that has a
# missing value.
data = pd.read_csv(
    'all-comments-with-time.txt',
    sep='\t',
    encoding='utf-8',
).dropna()
# The first column is the text that gets embedded later; the 'MovieTime'
# column is kept alongside it as a plain list.
texts = list(data.iloc[:, 0])
time = list(data['MovieTime'])

In [None]:
# Select a sentence-transformer model and embed every comment.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` must always be bound: it is used by both `.to()` and `.encode()`
# below, and leaving it unset on CPU-only machines raises a NameError.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
embeddings = model.encode(texts, show_progress_bar=True, device=device)
# Persist the cleaned DataFrame together with its embeddings so the encoding
# step does not have to be repeated.
with open("all-embeddings-time-based.pkl", "wb") as fOut:
    pickle.dump({'sentences': data, 'embeddings': embeddings}, fOut, protocol=4)

### Using UMAP to reduce dimensionality, then HDBSCAN to cluster the sentences

In [None]:
# Load the pickled sample and pull its 'Embeddings' column out as a plain
# Python list (one entry per row).
samples = pd.read_pickle('samples_of_embeddings.pkl')
embedding_column = samples['Embeddings']
embeds = list(embedding_column)

In [None]:
# Reduce the sampled embeddings to 5 dimensions with UMAP (15 neighbours,
# cosine metric); the result is the input to the clustering step.
reducer_5d = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine')
umap_embeddings = reducer_5d.fit_transform(embeds)

In [None]:
# Cluster the UMAP-reduced embeddings with HDBSCAN using the parameter set
# below.
hdbscan_params = {
    'min_cluster_size': 300,
    'min_samples': 500,
    'metric': 'manhattan',
    'cluster_selection_method': 'eom',
}
cluster = hdbscan.HDBSCAN(**hdbscan_params).fit(umap_embeddings)

In [None]:
# 2-D UMAP projection used only for plotting.
# It has to be computed from `embeds` -- the same vectors that were reduced
# and clustered above -- so that row i of the projection corresponds to
# `cluster.labels_[i]`. Projecting a different embedding set would either
# fail on the label assignment (length mismatch) or silently misalign
# points and labels.
umap_data = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeds)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

In [None]:

# Scatter plot of the 2-D projection. Points labelled -1 are drawn in grey;
# every other point is coloured by its cluster label.
# (Labels were computed with 'min_samples': 500, 'min_cluster_size': 300,
# 'metric': 'manhattan', 'cluster_selection_method': 'eom'.)
fig, ax = plt.subplots(figsize=(20, 10))
is_outlier = result.labels == -1
outliers = result[is_outlier]
clustered = result[~is_outlier]
clustered_lbs = clustered['labels']
plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
# Optional: write each cluster label next to its point.
# for i, txt in enumerate(list(clustered_lbs)):
#     plt.annotate(txt, (clustered.iloc[i].x, clustered.iloc[i].y))
plt.colorbar()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
import numpy as np
import logging
import warnings
import umap

In [None]:
# Randomised search for better HDBSCAN parameters on the UMAP-reduced
# embeddings.

# gen_min_span_tree=True is needed so that fitted models expose
# `relative_validity_`, which is what the search is scored on.
hdb = hdbscan.HDBSCAN(gen_min_span_tree=True).fit(umap_embeddings)
logging.captureWarnings(True)

# Parameter values to sample from.
param_dist = {'min_samples': [10, 30, 50, 60, 100],
              'min_cluster_size': [100, 200, 300, 400, 500, 600],
              'cluster_selection_method': ['eom', 'leaf'],
              'metric': ['euclidean', 'manhattan']
              }


def validity_scorer(estimator, X, y=None):
    """Score a fitted HDBSCAN model by its relative validity.

    A scorer built with ``make_scorer(validity_index)`` expects ground-truth
    labels and an estimator with ``predict``; this search has neither (it is
    fit without ``y`` and HDBSCAN is not a predictor), so such a scorer fails
    for every candidate and the "best" parameters end up arbitrary. A plain
    callable with the ``(estimator, X, y)`` scorer signature avoids that.

    Args:
        estimator: HDBSCAN instance already fitted by the search on the
            training part of the current split.
        X: Held-out part of the split. Unused: the score describes the
            clustering of the data the estimator was fitted on.
        y: Ignored; present only to satisfy the scorer signature.

    Returns:
        The estimator's ``relative_validity_`` (higher is better).
    """
    return estimator.relative_validity_


n_iter_search = 20
random_search = RandomizedSearchCV(
    hdb,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring=validity_scorer,
    random_state=np.random.RandomState(42),
)

random_search.fit(umap_embeddings)


print(f"Best Parameters {random_search.best_params_}")
print(f"DBCV score :{random_search.best_estimator_.relative_validity_}")
