In [1]:
# Load the defaultdict from the file
import pickle
import pandas as pd
from sklearn import metrics
import numpy as np

# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
# Later cells iterate `all_results` by key ("family") and treat each value as
# a sequence of 6- or 8-field feature records.
with open('results-topic-modeling-revieweaver.pkl', 'rb') as f:
    all_results = pickle.load(f)

In [2]:
# all_results.keys()

In [3]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Sentence-embedding model; the evaluation cell below calls its .encode() on
# the aspect strings to obtain the vectors the clustering metrics are computed on.
sentence_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [6]:
# Retained from the original cell so other cells see the same pandas setting.
# This cell itself no longer assigns into filtered slices (it uses .assign),
# so it does not rely on the warning being silenced.
pd.options.mode.chained_assignment = None

# --- Tunables -----------------------------------------------------------
TOP_N_GROUPS = 10        # keep only the N most populated feature groups
ENCODE_BATCH_SIZE = 192  # batch size forwarded to SentenceTransformer.encode
# NOTE(review): hard-coded device index; presumably CUDA device 0 — this will
# fail on a machine without that device. Confirm before running elsewhere.
ENCODE_DEVICE = 0

BASE_COLUMNS = ["feature_name", "count", "review_ids", "other_names", "feature_id", "quotes"]
EXTRA_COLUMNS = ["val", "embedding"]  # present only in 8-field records; unused here

# One entry per (family, sentiment) combination for which the metric could be
# computed AND was strictly positive (see the NOTE at the bottom of the loop).
silhouette_scores = []
ch_scores = []
db_scores = []


def _parse_other_name(entry):
    """Split an ``"<aspect>(<count>)"`` string into ``(aspect, count)``.

    The split happens at the LAST ``(`` so that an aspect whose own text
    contains parentheses is still parsed. The aspect is lower-cased.
    Assumes the count is the final parenthesised group — TODO confirm against
    the producer of ``other_names``.

    Raises:
        ValueError: if the text after the last ``(`` is not an integer.
    """
    aspect, _, count_text = entry.rpartition('(')
    return aspect.lower(), int(count_text.strip(')'))


def _expand_aspect_mentions(features):
    """Return one row per aspect mention for the given feature rows.

    Each entry of ``other_names`` is repeated ``count`` times so that the
    clustering metrics are weighted by how often an aspect was mentioned.
    Columns: ``aid`` (running id), ``feature``, ``aspect``, ``label``.
    """
    rows = []
    for feature, other_names, label in zip(
        features['feature_name'], features['other_names'], features['label']
    ):
        for entry in other_names:
            aspect, count = _parse_other_name(entry)
            rows.extend((feature, aspect, label) for _ in range(count))

    mentions = pd.DataFrame(rows, columns=['feature', 'aspect', 'label'])
    mentions.insert(0, 'aid', np.arange(len(mentions)))
    return mentions


for family, all_features in all_results.items():
    if len(all_features) == 0:
        continue

    # Records come in two layouts; the first record decides which one applies.
    if len(all_features[0]) == 8:
        distilled_features_family = pd.DataFrame(
            all_features, columns=BASE_COLUMNS + EXTRA_COLUMNS
        ).drop(columns=EXTRA_COLUMNS)
    else:
        distilled_features_family = pd.DataFrame(all_features, columns=BASE_COLUMNS)

    # Anything whose feature_id does not contain "Positive" is treated as Negative.
    distilled_features_family["sentiment"] = distilled_features_family["feature_id"].apply(
        lambda x: "Positive" if "Positive" in x else "Negative"
    )

    for sentiment in ["Positive", "Negative"]:
        # .assign returns a new frame, so nothing is written into a slice of
        # distilled_features_family. The integer label is the category code
        # of the feature name within this sentiment subset.
        features = (
            distilled_features_family[distilled_features_family["sentiment"] == sentiment]
            .assign(feature_name=lambda d: d['feature_name'].astype('category'))
            .assign(label=lambda d: d['feature_name'].cat.codes)
        )

        eval_aspect_df = _expand_aspect_mentions(features)

        # Keep only the mentions belonging to the TOP_N_GROUPS largest groups.
        group_sizes = eval_aspect_df.groupby('label').size().reset_index(name='count')
        top_labels = group_sizes.sort_values(by='count', ascending=False).head(TOP_N_GROUPS)['label']
        eval_aspect_df_filtered = eval_aspect_df[eval_aspect_df['label'].isin(top_labels)]

        # The metrics need at least 2 clusters and fewer clusters than samples.
        # Checking this BEFORE encoding avoids embedding data that is never scored.
        n_clusters = eval_aspect_df_filtered['label'].nunique()
        if not (2 <= n_clusters < eval_aspect_df_filtered.shape[0]):
            continue

        # Every mention is repeated `count` times, so encode each distinct
        # aspect string once and fan the vectors back out to the rows.
        aspect_codes, unique_aspects = pd.factorize(eval_aspect_df_filtered['aspect'])
        unique_embeddings = np.asarray(
            sentence_embedding_model.encode(
                unique_aspects.tolist(),
                batch_size=ENCODE_BATCH_SIZE,
                device=ENCODE_DEVICE,
                show_progress_bar=False,
            ),
            dtype=np.float64,  # the original round-tripped through Python floats
        )
        embeddings = unique_embeddings[aspect_codes]
        labels = eval_aspect_df_filtered['label'].to_numpy()

        sil_score = metrics.silhouette_score(embeddings, labels, metric='cosine')
        ch_score = metrics.calinski_harabasz_score(embeddings, labels)
        db_score = metrics.davies_bouldin_score(embeddings, labels)

        # NOTE(review): these filters are kept as in the original so the
        # reported numbers do not change, but they discard non-positive
        # scores. Silhouette scores can legitimately be <= 0, so dropping them
        # biases the reported mean upward — confirm this is intended.
        if sil_score > 0:
            silhouette_scores.append(sil_score)
        if ch_score > 0:
            ch_scores.append(ch_score)
        if db_score > 0:
            db_scores.append(db_score)

In [7]:
def _format_score_summary(scores):
    """Format ``scores`` as ``"mean +- std[min, max]"``, each rounded to 2 decimals.

    Returns a placeholder instead of raising when ``scores`` is empty
    (``np.min`` / ``np.max`` raise ``ValueError`` on an empty sequence).
    """
    if len(scores) == 0:
        return "n/a (no scores collected)"
    return (
        f"{round(np.mean(scores), 2)} +- {round(np.std(scores), 2)}"
        f"[{round(np.min(scores), 2)}, {round(np.max(scores), 2)}]"
    )


# Printed in the same order as before: silhouette, Calinski-Harabasz, Davies-Bouldin.
for collected_scores in (silhouette_scores, ch_scores, db_scores):
    print(_format_score_summary(collected_scores))

0.52 +- 0.16[0.05, 0.88]
13.14 +- 18.14[1.0, 206.2]
0.58 +- 0.25[0.0, 1.46]
