#### Load the packages

In [None]:
import fasttext.util
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sentence_transformers import SentenceTransformer

#### Load the embedding models

In [None]:
# Model identifiers are named up front so they are easy to swap.
FASTTEXT_MODEL_PATH = "cc.en.300.bin"
SENTENCE_MODEL_NAME = "all-MiniLM-L6-v2"

# fastText model queried for per-word vectors by get_mean_embedding.
embedding_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

# WordNet lemmatizer instance (not referenced by the cells shown here).
wordnet_lemmatizer = WordNetLemmatizer()

# Sentence-level encoder used for representative quotes and aspect names.
sentence_embedding_model = SentenceTransformer(SENTENCE_MODEL_NAME)

#### Load the aspects

In [None]:
# NOTE: unpickling executes arbitrary code stored in the file, so only load
# pickles that come from a trusted source.
# The 'Aspect' column is normalised with str.capitalize (first character
# upper-cased, the rest lower-cased) as part of loading.
sampled_aspects = pd.read_pickle('../data/product_review_aspects.pkl').assign(
    Aspect=lambda frame: frame['Aspect'].str.capitalize()
)

In [None]:
# Notebook display: list the column labels of the loaded aspects table.
sampled_aspects.columns

#### Similarity parameters

In [None]:
# Cap on how many of the most frequent aspects per (family, sentiment) are
# turned into graph nodes before clustering.
MAX_FEATURES = 200

# Minimum cosine similarity between two aspect-name embeddings for their
# clusters to be merged by find_top_features.
SIMILARITY = 0.5

# Minimum cosine similarity between the mean quote embeddings of two clusters;
# find_top_features requires this in addition to SIMILARITY.
SENTENCE_SIMILARITY = 0.4

In [None]:
# Function words that are left out of the phrase average.
skip_words = {'of', 'it', 'to'}

def get_mean_embedding(phrase):
    """Return the average fastText word vector of ``phrase``.

    The phrase is lower-cased and split on runs of whitespace, so repeated,
    leading or trailing spaces never produce empty tokens that would be
    embedded and counted in the average. Words listed in ``skip_words`` are
    ignored.

    Args:
        phrase: Text to embed.

    Returns:
        A NumPy array of length 300 (presumably the dimensionality of the
        loaded ``cc.en.300.bin`` model). It is all zeros when no word is left
        after filtering.
    """
    words = [word for word in phrase.lower().split() if word not in skip_words]

    res = np.zeros(300)
    for word in words:
        res += embedding_model.get_word_vector(word)

    # Guard the division: an empty or skip-word-only phrase stays a zero vector.
    return res / len(words) if words else res

def get_mean_sentence_embedding(sentences):
    """Encode ``sentences`` and return the element-wise mean of the vectors.

    Args:
        sentences: The sentences handed to ``sentence_embedding_model.encode``.

    Returns:
        A plain ``list`` holding the mean embedding, one entry per embedding
        dimension.
    """
    # NOTE(review): device=0 presumably selects the first CUDA device - confirm
    # this is intended on machines without a GPU.
    encode_options = {
        "batch_size": 192,
        "device": 0,
        "show_progress_bar": False,
    }
    stacked = np.array(sentence_embedding_model.encode(sentences, **encode_options))

    # Sum over the sentence axis, then divide by the number of sentences.
    column_totals = stacked.sum(axis=0)
    sentence_count = stacked.shape[0]

    return list(column_totals / sentence_count)

In [None]:
def remove_duplicates(features, threshold=0.40, similarity_fn=None):
    """Collapse features whose aspect embeddings are near-duplicates.

    Features are clustered with union-find: two clusters are merged when the
    similarity between the aspect embeddings of their current representatives
    reaches ``threshold``. The representative with the higher rank survives;
    on a tie, the second one does.

    Args:
        features: Sequence of records laid out as
            ``(name, rank, reviews, other_names, feature_id, quotes, index,
            aspect_embedding)``; only positions 1 and 7 drive the clustering.
        threshold: Minimum similarity for two clusters to be merged.
        similarity_fn: Optional ``f(emb1, emb2) -> float``. Defaults to the
            cosine similarity from scikit-learn.

    Returns:
        One 6-tuple ``(name, rank, reviews, other_names, feature_id, quotes)``
        per surviving cluster representative, in input order.
    """
    if similarity_fn is None:
        measure = lambda emb1, emb2: cosine_similarity([emb1], [emb2])[0][0]
    else:
        measure = similarity_fn

    count = len(features)
    parent = list(range(count))
    rank = [feature[1] for feature in features]
    aspect_embedding = [feature[7] for feature in features]

    def find(u):
        # Locate the representative, then point every node on the path at it.
        root = u
        while parent[root] != root:
            root = parent[root]
        while parent[u] != root:
            parent[u], u = root, parent[u]
        return root

    def union(u, v):
        p1 = find(u)
        p2 = find(v)
        if p1 == p2:
            return

        if measure(aspect_embedding[p1], aspect_embedding[p2]) >= threshold:
            if rank[p1] > rank[p2]:
                parent[p2] = p1
            else:
                parent[p1] = p2

    # Pairs are deliberately visited in the original order (every v for each
    # u): merges depend on the current representatives, so the visiting order
    # can influence the final clusters.
    for u in range(count - 1):
        for v in range(count):
            union(u, v)

    # Keep only true representatives. Checking membership in set(parent) is
    # not enough: a node that was merged away can still be listed as the
    # parent of another node and would leak into the output as a duplicate.
    return [
        tuple(feature[:6])
        for index, feature in enumerate(features)
        if find(index) == index
    ]

In [None]:
def find_top_features(graph, similarity_threshold=None,
                      sentence_similarity_threshold=None, similarity_fn=None):
    """Cluster similar aspects with union-find and return one entry per cluster.

    Two clusters are merged when both the aspect-name embeddings and the mean
    quote embeddings of their current representatives are similar enough. The
    representative with the higher rank absorbs the other; on equal rank the
    one whose name is shorter (or equally long, first argument) wins. The
    winner accumulates the loser's rank, review ids and alternative names.

    Args:
        graph: Sequence of nodes laid out as ``[name, rank, quote_embedding,
            review_ids, feature_id, quotes, aspect_embedding]``. The review-id
            collections are copied, so the caller's data is not modified.
        similarity_threshold: Minimum aspect-embedding similarity. Defaults to
            the module-level ``SIMILARITY``.
        sentence_similarity_threshold: Minimum quote-embedding similarity.
            Defaults to the module-level ``SENTENCE_SIMILARITY``.
        similarity_fn: Optional ``f(emb1, emb2) -> float``. Defaults to the
            cosine similarity from scikit-learn.

    Returns:
        A list of tuples ``(name, rank, review_ids, other_names, feature_id,
        quotes, node_index, aspect_embedding)``, one per cluster, ordered by
        number of distinct review ids (then rank), largest first.
    """
    if similarity_threshold is None:
        similarity_threshold = SIMILARITY
    if sentence_similarity_threshold is None:
        sentence_similarity_threshold = SENTENCE_SIMILARITY
    if similarity_fn is None:
        similarity_fn = lambda emb1, emb2: cosine_similarity([emb1], [emb2])[0][0]

    node_count = len(graph)
    parent = list(range(node_count))
    rank = [node[1] for node in graph]
    node_embedding = [node[2] for node in graph]
    # Copy the review ids so merging never mutates the sets owned by the caller.
    node_reviews = [set(node[3]) for node in graph]
    other_names = [{f"{node[0]}({node[1]})"} for node in graph]
    feature_ids = [node[4] for node in graph]
    quotes = [node[5] for node in graph]
    aspect_embedding = [node[6] for node in graph]

    def find(u):
        # Locate the representative, then point every node on the path at it.
        root = u
        while parent[root] != root:
            root = parent[root]
        while parent[u] != root:
            parent[u], u = root, parent[u]
        return root

    def absorb(winner, loser):
        # Fold the losing cluster's statistics into the winning representative.
        parent[loser] = winner
        rank[winner] += rank[loser]
        node_reviews[winner].update(node_reviews[loser])
        other_names[winner].update(other_names[loser])

    def union(u, v):
        p1 = find(u)
        p2 = find(v)
        if p1 == p2:
            return

        # The quote similarity is only evaluated when the aspect names
        # already match.
        is_similar = (
            similarity_fn(aspect_embedding[p1], aspect_embedding[p2]) >= similarity_threshold
            and similarity_fn(node_embedding[p1], node_embedding[p2]) >= sentence_similarity_threshold
        )
        if not is_similar:
            return

        if rank[p1] > rank[p2]:
            absorb(p1, p2)
        elif rank[p1] < rank[p2]:
            absorb(p2, p1)
        elif len(graph[p1][0].lower()) <= len(graph[p2][0].lower()):
            absorb(p1, p2)
        else:
            absorb(p2, p1)

    for u in range(node_count - 1):
        for v in range(u + 1, node_count):
            union(u, v)

    # Only true representatives form a topic. set(parent) may still contain
    # nodes that were merged away but remain the recorded parent of another
    # node; those carry stale ranks and must not be reported.
    roots = [index for index in range(node_count) if find(index) == index]

    results = [
        (graph[p][0], rank[p], node_reviews[p], other_names[p],
         feature_ids[p], quotes[p], p, aspect_embedding[p])
        for p in roots
    ]

    # Sorting on the review-id sets themselves compares by subset relation,
    # which is not a usable ordering; rank clusters by how many distinct
    # reviews they cover, falling back to the accumulated rank.
    results.sort(key=lambda item: (len(item[2]), item[1]), reverse=True)
    return results

In [None]:
# Keep a lower-cased copy of every aspect name in its own column; this is the
# text that gets encoded below.
sampled_aspects['aspect_case_converted'] = sampled_aspects['Aspect'].str.lower()

# Encode all aspect names in one batched call.
aspect_encode_options = dict(batch_size=192, device=0, show_progress_bar=False)
aspect_embeddings = sentence_embedding_model.encode(
    sampled_aspects['aspect_case_converted'].to_list(),
    **aspect_encode_options,
)

# Store each embedding as a plain Python list, one per row.
sampled_aspects['aspect_embeddings'] = [vector.tolist() for vector in aspect_embeddings]

In [None]:
# Notebook display: show the first entry of the 'aspect_embeddings' column.
sampled_aspects['aspect_embeddings'].head(1)

In [None]:
# Distinct product family ids, in order of first appearance.
family_ids = pd.unique(sampled_aspects["ProductFamilyId"])

In [None]:
def _build_feature_graph(aspect_rows, family_id, sentiment):
    """Aggregate the aspect rows of one family and sentiment into graph nodes.

    Each distinct aspect becomes one node laid out as
    ``[name, mention_count, mean_quote_embedding, review_ids, unique_id,
    quotes, aspect_embedding]``, which is the layout find_top_features reads.
    Only the ``MAX_FEATURES`` most frequently mentioned aspects are kept.
    """
    mention_counts = defaultdict(int)
    review_ids = defaultdict(set)
    quotes = defaultdict(list)
    aspect_embedding = {}
    unique_ids = {}

    rows = zip(
        aspect_rows["Aspect"],
        aspect_rows["AspectId"],
        aspect_rows["RepresentativeSentence"],
        aspect_rows["aspect_embeddings"],
    )
    for feature, sentence_id, quote, embedding in rows:
        if feature not in unique_ids:
            # Ids are numbered in order of first appearance within the group.
            unique_ids[feature] = f"{family_id}_{sentiment}_{len(unique_ids)}"

        mention_counts[feature] += 1
        # The part of AspectId before the first underscore is used as the
        # review id.
        review_ids[feature].add(sentence_id.split("_")[0])
        # A missing quote may be None or NaN; neither has .lower(), so any
        # non-string value is replaced by an empty quote.
        quotes[feature].append(quote.lower() if isinstance(quote, str) else "")
        # The embedding of the last row seen for an aspect is the one kept.
        aspect_embedding[feature] = embedding

    # Slice to exactly MAX_FEATURES aspects.
    most_mentioned = sorted(mention_counts, key=mention_counts.get, reverse=True)[:MAX_FEATURES]

    return [
        [
            name,
            mention_counts[name],
            get_mean_sentence_embedding(quotes[name]),
            review_ids[name],
            unique_ids[name],
            quotes[name],
            aspect_embedding[name],
        ]
        for name in most_mentioned
    ]


# Maps each product family id to its deduplicated list of top features, each a
# 6-tuple (name, rank, review_ids, other_names, feature_id, quotes).
all_results = {}

for family_id in family_ids:
    family_rows = sampled_aspects.loc[sampled_aspects["ProductFamilyId"] == family_id]

    features_by_sentiment = {}
    for sentiment in ("Positive", "Negative"):
        sentiment_rows = family_rows.loc[family_rows["Sentiment"] == sentiment]
        graph = _build_feature_graph(sentiment_rows, family_id, sentiment)
        features_by_sentiment[sentiment] = find_top_features(graph)

    pos_features = features_by_sentiment["Positive"]
    neg_features = features_by_sentiment["Negative"]
    pos_neg_features = pos_features + neg_features

    if neg_features:
        # Cross-sentiment deduplication is only needed when both kinds exist.
        final_features = remove_duplicates(pos_neg_features)
    else:
        # Trim to the same six fields remove_duplicates returns, so every
        # family's entries have one shape regardless of which branch ran.
        final_features = [tuple(feature[:6]) for feature in pos_neg_features]

    all_results[family_id] = final_features
    

In [None]:
# Store the results
# import pickle

# with open('../data/results-topic-modeling-revieweaver.pkl', 'wb') as f:
#     pickle.dump(all_results, f)