In [1]:
import fasttext.util
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

  from .autonotebook import tqdm as notebook_tqdm


#### Load the embedding models

In [2]:
embedding_model = fasttext.load_model("cc.en.300.bin")
wordnet_lemmatizer = WordNetLemmatizer()
sentence_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

#### Load the aspects

In [None]:
sampled_aspects = pd.read_pickle('../data/product_review_aspects.pkl')
sampled_aspects['Aspect'] = sampled_aspects['Aspect'].str.capitalize()

In [5]:
sampled_aspects.columns

Index(['ReviewId', 'AspectId', 'Aspect', 'Sentiment', 'RepresentativeSentence',
       'ProductFamilyId'],
      dtype='object')

In [6]:
aspect_count = defaultdict(int)

In [7]:
skip_words = set(['of', 'it', 'to'])

def get_mean_embedding(phrase):
    phrase = phrase.lower()
    res = np.zeros(300)
    num_words = 0
    for word in filter(lambda w: not w in skip_words, phrase.split(" ")):
        res += embedding_model.get_word_vector(word)
        num_words += 1
    
    return res/num_words if num_words >= 1 else res

def get_mean_sentence_embedding(sentences):
    list_embeddings = sentence_embedding_model.encode(
        sentences,
        batch_size=192,
        device=0,
        show_progress_bar=False,
    )
    
    np_embeddings = np.array(list_embeddings)
    mean_embedding = np_embeddings.sum(axis=0)/(np.size(np_embeddings, 0))
    
    return list(mean_embedding)

In [8]:
sampled_aspects['aspect_case_converted'] = sampled_aspects['Aspect'].str.lower()

aspect_embeddings = sentence_embedding_model.encode(
    sampled_aspects['aspect_case_converted'].tolist(),
    batch_size=192,
    device=0,
    show_progress_bar=False,
)


In [9]:
sampled_aspects['aspect_embeddings'] = aspect_embeddings.tolist()

In [10]:
sampled_aspects['sentence_embeddings'] = sentence_embedding_model.encode(
    sampled_aspects['RepresentativeSentence'].tolist(),
    batch_size=192,
    device=0,
    show_progress_bar=False,
).tolist()

In [11]:
sampled_aspects['combined_embeddings'] = sampled_aspects['aspect_embeddings'] + sampled_aspects['sentence_embeddings']
sampled_aspects['combined_embeddings'].shape

(17331,)

In [12]:
from scipy import spatial
from sklearn import metrics

def do_dbscan_clustering(df, family_id, combine_embs=False):    
    dbscan_model = DBSCAN(
        eps=0.2, 
        min_samples=2, 
        metric='cosine'
    )
    output = None
    silhouette_score = 0
    ch_score = 0
    db_score = 0
    
    if df.shape[0] > 1:
    
        if combine_embs:
            labels = dbscan_model.fit_predict(np.array(df["combined_embeddings"].values.tolist()))
        else:
            labels = dbscan_model.fit_predict(np.array(df["aspect_embeddings"].values.tolist()))

        features = np.array(df["Aspect"].values.tolist())
        output = np.vstack((features, labels)).T
        
        df['labels'] = dbscan_model.labels_
        
        output_list = []
        aid = 0
        for feat, lab in zip(features, labels):
            # print(f"{feat}, {lab}")
            output_clusters[lab][feat] = 1 + output_clusters[lab].get(feat, 0)
            output_list.append([aid, feat, lab])
            aid += 1
            
        output_df = pd.DataFrame(output_list, columns=['aid', 'aspect', 'label'])
        # output_df = output_df[output_df['label'] > 0]
        
        # Group by 'category' and count the occurrences
        grouped = output_df.groupby('label').size().reset_index(name='count')

        # Sort by count in descending order
        sorted_groups = grouped.sort_values(by='count', ascending=False)

        # Select the top 10 groups
        top_10_groups = sorted_groups.head(min(10, len(output_df.label.unique())))
        
        output_df_filtered = output_df[output_df['label'].isin(top_10_groups.label.unique())]
        
        output_df_filtered["aspect_embeddings"] = sentence_embedding_model.encode(
            output_df_filtered['aspect'].tolist(),
            batch_size=192,
            device=0,
            show_progress_bar=False,
        ).tolist()
        
        if len(set(output_df_filtered['label'].values)) >= 2:
            if combine_embs:
                silhouette_score = metrics.silhouette_score(np.array(df["combined_embeddings"].values.tolist()), output_df_filtered['label'].values, metric='cosine')
                ch_score = metrics.calinski_harabasz_score(np.array(df["combined_embeddings"].values.tolist()), output_df_filtered['label'].values)
                db_score = metrics.davies_bouldin_score(np.array(df["combined_embeddings"].values.tolist()), output_df_filtered['label'].values)
            else:
                silhouette_score = metrics.silhouette_score(np.array(output_df_filtered["aspect_embeddings"].values.tolist()), output_df_filtered['label'].values, metric='cosine')
                ch_score = metrics.calinski_harabasz_score(np.array(output_df_filtered["aspect_embeddings"].values.tolist()), output_df_filtered['label'].values)
                db_score = metrics.davies_bouldin_score(np.array(output_df_filtered["aspect_embeddings"].values.tolist()), output_df_filtered['label'].values)
    return silhouette_score, ch_score, db_score, output

In [13]:
import math

output_clusters = defaultdict(dict)
silhouette_scores = []
ch_scores = []
db_scores = []
family_ids = sampled_aspects.ProductFamilyId.unique()
for family_id in family_ids:
    pf1 = sampled_aspects.loc[sampled_aspects["ProductFamilyId"] == family_id]
    for sentiment in ["Positive", "Negative"]:
        pf1_pos = pf1.loc[pf1["Sentiment"] == sentiment]
        sil_score, ch_score, db_score, output = do_dbscan_clustering(pf1_pos.copy(), family_id)
        if sil_score > 0:
            silhouette_scores.append(sil_score)
        if ch_score > 0:
            ch_scores.append(ch_score)
        if db_score > 0:
            db_scores.append(db_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df_filtered["aspect_embeddings"] = sentence_embedding_model.encode(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df_filtered["aspect_embeddings"] = sentence_embedding_model.encode(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df_filtered["aspect_embeddings"] = sentence_embe

In [14]:
print(f"{round(np.mean(silhouette_scores), 2)} \pm {round(np.std(silhouette_scores), 2)}[{round(np.min(silhouette_scores), 2)}, {round(np.max(silhouette_scores), 2)}]")
print(f"{round(np.mean(ch_scores), 2)} \pm {round(np.std(ch_scores), 2)}[{round(np.min(ch_scores), 2)}, {round(np.max(ch_scores), 2)}]")
print(f"{round(np.mean(db_scores), 2)} \pm {round(np.std(db_scores), 2)}[{round(np.min(db_scores), 2)}, {round(np.max(db_scores), 2)}]")

0.35 \pm 0.18[0.01, 0.84]
8.71 \pm 18.08[1.0, 234.36]
1.07 \pm 0.17[0.54, 1.46]
