In [34]:
from sentence_transformers import SentenceTransformer
import json
import pandas as pd
from sklearn.mixture import GaussianMixture as GMM
import os

# Set TOKENIZERS_PARALLELISM to 'false' for this process before any text is encoded.
os.environ.update({'TOKENIZERS_PARALLELISM': 'false'})

In [35]:
# Load the cleaned comments from disk.
# The encoding is given explicitly: without it, open() falls back to the
# platform's default encoding, which may not be UTF-8 and can mis-decode
# non-ASCII text in the JSON file.
with open('../data/cleaned_comments.json', encoding='utf-8') as comments_file:
    comments = json.load(comments_file)

In [36]:
# Put the loaded comments into a single-column DataFrame (column: 'comment').
df = pd.DataFrame(dict(comment=comments))

In [37]:
# Load the sentence-embedding model.
# The first run downloads the model to this machine; later runs load the
# local copy instead of downloading it again.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [38]:
# Convert every comment into an embedding vector.
# The whole list is handed to the model in one call so it can batch the work,
# instead of invoking the model separately for each comment.
comment_vectors = model.encode(df['comment'].tolist())
# Keep the result as a Series named 'comment' with one vector per row,
# aligned with df's index, so downstream cells can use it unchanged.
embeddings = pd.Series(list(comment_vectors), index=df.index, name='comment')

In [39]:
# Expand the Series of per-comment vectors into a wide table:
# one row per comment, one column per embedding dimension.
# The frame is built directly from the list of vectors; `.apply(pd.Series)`
# constructs a temporary Series for every row and is very slow on large inputs.
embeddings = pd.DataFrame(embeddings.tolist(), index=embeddings.index)

In [40]:
# Expose the embedding table as a DataFrame under the name used for clustering.
embeddings_df = pd.DataFrame(data=embeddings)

In [41]:
def _fit_gmm(features, n_components):
    """Fit a diagonal-covariance Gaussian mixture and label every sample.

    Parameters
    ----------
    features : table of embeddings passed to both ``fit`` and ``predict``.
    n_components : number of mixture components (clusters) to fit.

    Returns
    -------
    (mixture, labels) : the fitted model and the predicted component label
    for each row of ``features``.
    """
    # Hyper-parameters are kept in one place so all cluster counts share them.
    mixture = GMM(
        n_components=n_components,
        covariance_type='diag',
        max_iter=2000,
        random_state=0,  # fixed seed so repeated runs give the same clusters
    ).fit(features)
    return mixture, mixture.predict(features)


# Fit one mixture per cluster count and predict the labels for the data samples
gmm_10, labels_10 = _fit_gmm(embeddings_df, 10)
gmm_5, labels_5 = _fit_gmm(embeddings_df, 5)
gmm_2, labels_2 = _fit_gmm(embeddings_df, 2)

In [42]:
# For each cluster count: take a copy of the comments, attach the predicted
# cluster as a 'label' column, and order the rows by that label.
# `assign` returns a new frame, so the original `df` is left untouched.
df_10 = df.assign(label=labels_10).sort_values(by='label')
df_5 = df.assign(label=labels_5).sort_values(by='label')
df_2 = df.assign(label=labels_2).sort_values(by='label')

In [43]:
# Write one CSV per cluster count (row index omitted).
# The output folder is created first so the writes do not fail with an
# OSError when the folder does not exist yet.
os.makedirs('../clustered_comments', exist_ok=True)
df_2.to_csv('../clustered_comments/2_clustered_comments.csv', index=False)
df_5.to_csv('../clustered_comments/5_clustered_comments.csv', index=False)
df_10.to_csv('../clustered_comments/10_clustered_comments.csv', index=False)