# import packages

In [1]:
import pandas as pd 
pd.set_option('display.max_colwidth', None)
import numpy as np
from bertopic import BERTopic
from umap import UMAP
import hdbscan
from sentence_transformers import SentenceTransformer
from sklearn.metrics import pairwise_distances
import math
from math import log
from scipy.spatial.distance import cdist

2023-05-30 14:26:18.098861: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


# Coherence

In [2]:
def Coherence_score_topic(embedding, topic):
    
    #group embeddings based on topics (exclude embeddings from topic -1, which have been classified as outliers)
    emb_dict = {}
    for i, t in enumerate(topic):
        if t != -1:
            if t not in emb_dict:
                emb_dict[t] = [embedding[i]]
            else:
                emb_dict[t].append(embedding[i])
    
    # the number of topics
    topic_number = len(emb_dict)
    
    #calculate coherence score for each cluster
    coherence_array = []
    
    for i in range(topic_number):
        cluster_emb = emb_dict[i]
        
        #get the number of embeddings within one cluster
        num_emb = np.shape(cluster_emb)[0]
        
        #normalize embeddings
        E = []
        for emb in cluster_emb[0:num_emb]:
            normalized_emb = emb/emb.sum()
            E.append(normalized_emb)
        
        result = 0.0
        if len(E) > 0:
            E = np.array(E)
            # Perform cosine similarity between E rows
            distances = np.sum(1 - pairwise_distances(E, metric='cosine') - np.diag(np.ones(len(E))))
            result = distances / (num_emb*(num_emb-1))
        else:
            result = -1
        
        coherence_array.append(result)
    
    avg_coherence = np.mean(coherence_array)

    return avg_coherence


# K-L Divergency

In [3]:
def KL_Divergence_topic(embedding, topic):
    
    #calculate the average embedding for each cluster
    avg_emb_dict = {}
    for i, t in enumerate(topic):
        if t != -1:
            if t not in avg_emb_dict:
                avg_emb_dict[t] = [embedding[i]]
            else:
                avg_emb_dict[t].append(embedding[i])
    for t in avg_emb_dict:
        avg_emb_dict[t] = np.mean(avg_emb_dict[t], axis=0)
        
    topic_number = len(avg_emb_dict)
    array = []
    for i in range(topic_number):
        array.append(avg_emb_dict[i])

    #normalize embeddings as probability
    prob_embedding = []
    for emb in array[0:topic_number]:
        normalized_emb = np.exp(emb) / np.sum(np.exp(emb), axis=0)
        prob_embedding.append(normalized_emb)
    
    result = 0
    for emb_1 in prob_embedding[0:topic_number]:
        for emb_2 in prob_embedding[0:topic_number]:
            div_results = sum(emb_1[i]*log(emb_1[i]/emb_2[i]) for i in range(len(emb_1)))
            result += div_results
    
    result = result / 2*topic_number*(topic_number-1)
    return result

# Load reviews

In [4]:
CS_Reviews_Final = pd.read_csv('Reviews_complexity.csv').copy().drop(['Unnamed: 0'], axis=1)
CS_Reviews_Final

Unnamed: 0,title,status,pros,cons,review,rating,processed_review,wo_stop,lemmatized,total_word_num,raw_word_count,num_unique_word,entropy
0,Dream job,Software Engineer(Current Employee),,,"This is one of the best place to work in the world. I couldn't ask for more. The culture is amazing, the benefits are unbeatable, well that's Google, I'm blessed to be working here",5.0,this is one of the best place to work in the world i couldnt ask for more the culture is amazing the benefits are unbeatable well thats it im blessed to be working here,one best place work world couldnt ask culture amazing benefits unbeatable well thats im blessed working,one good place work world could not ask culture amazing benefit unbeatable well that s I m bless work,19,34,18,0.849380
1,"Worked as a contractor software engineer, work time flexibility was great but worker empowerment was low for contractors.",Software Engineer Contractor(Former Employee),,,"Worked as a contractor software engineer, work time flexibility was great but worker empowerment was low for contractors. Contractors are treated as second-class citizens within Google.",5.0,worked as a contractor software engineer work time flexibility was great but worker empowerment was low for contractors contractors are treated as secondclass citizens within it,worked contractor software engineer work time flexibility great worker empowerment low contractors contractors treated secondclass citizens within,work contractor software engineer work time flexibility great worker empowerment low contractor contractor treat secondclass citizen within,17,26,14,0.679704
2,Productive and fun workplace,Software Engineer(Former Employee),,,It was a fun and interesting experience. Lots of handson and i truly learn a lot. It was an invaluable experience working with my team and i have grown,5.0,it was a fun and interesting experience lots of handson and i truly learn a lot it was an invaluable experience working with my team and i have grown,fun interesting experience lots handson truly learn lot invaluable experience working team grown,fun interesting experience lot handson truly learn lot invaluable experience work team grow,13,29,11,0.589490
3,Great company to work for!,Quality Assurance Specialist(Current Employee),,,One of the best companies to work for overall. Very stable company and provides opportunities for career advancement. The culture is great too where they help you to succeed in your role.,5.0,one of the best companies to work for overall very stable company and provides opportunities for career advancement the culture is great too where they help you to succeed in your role,one best companies work overall stable company provides opportunities career advancement culture great help succeed role,one good company work overall stable company provide opportunity career advancement culture great help succeed role,16,32,15,0.818592
4,Best Workplace,Software Engineer(Current Employee),,,"Google is a great work place. They pay well and respect their employees. Team change is flexible, your manager can stop your moving if your tenure in the current team is more than a year. You can make impact doing what interests you. This makes Google the best place to work",5.0,it is a great work place they pay well and respect their employees team change is flexible your manager can stop your moving if your tenure in the current team is more than a year you can make impact doing what interests you this makes it the best place to work,great work place pay well respect employees team change flexible manager stop moving tenure current team year make impact interests makes best place work,great work place pay well respect employee team change flexible manager stop move tenure current team year make impact interest make good place work,24,51,20,1.214855
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7056,Ibm,Delivery Architect(Current Employee),,,"I’ve been with IBM for many years of my career in some facet. During every step of it IBM has done layoffs and resource actions and hour reductions. We all knew that come March it was coming. However, it has started coming more and more. It is an inconsistent and unstable place to ensure you have constant work.",1.0,ive been with it for many years of my career in some facet during every step of it it has done layoffs and resource actions and hour reductions we all knew that come march it was coming however it has started coming more and more it is an inconsistent and unstable place to ensure you have constant work,ive many years career facet every step done layoffs resource actions hour reductions knew come march coming however started coming inconsistent unstable place ensure constant work,I ve many year career facet every step do layoff resource action hour reduction know come march come however start come inconsistent unstable place ensure constant work,27,58,25,0.602547
7057,5 stars,Sr. Technical Program Manager(Current Employee),,,"What is the best part of working at the company?The cultureWhat is the most stressful part about working at the company?None identifiedWhat is the work environment and culture like at the company?Warm, inviting, welcoming",5.0,what is the best part of working at the companythe culturewhat is the most stressful part about working at the companynone identifiedwhat is the work environment and culture like at the companywarm inviting welcoming,best part working companythe culturewhat stressful part working companynone identifiedwhat work environment culture like companywarm inviting welcoming,good part work companythe culturewhat stressful part work companynone identifiedwhat work environment culture like companywarm invite welcoming,17,34,14,0.922879
7058,Great company,Software Engineer(Current Employee),,,LinkedIn's food is great. There is a InDay every month. There are a lot to learn. The culture is transparent. The food is so good. LinkedIn's office is so beautiful,5.0,its food is great there is a inday every month there are a lot to learn the culture is transparent the food is so good its office is so beautiful,food great inday every month lot learn culture transparent food good office beautiful,food great inday every month lot learn culture transparent food good office beautiful,13,30,12,0.429123
7059,"Great WLB, pay and technical challenges",Software Engineer(Current Employee),,,"Great pay, tons of time off.Food is top notch, probably among the best in call big tech companies.Management cares about your career growth and gives you ample opportunity to take ownership.",5.0,great pay tons of time offfood is top notch probably among the best in call big tech companiesmanagement cares about your career growth and gives you ample opportunity to take ownership,great pay tons time offfood top notch probably among best call big tech companiesmanagement cares career growth gives ample opportunity take ownership,great pay ton time offfood top notch probably among good call big tech companiesmanagement care career growth give ample opportunity take ownership,22,31,22,0.484396


# Proposed Pipeline

## Stage 1:  Split Dataset based on Entropy

In [5]:
CS_Reviews_Final.describe()

Unnamed: 0,rating,total_word_num,raw_word_count,num_unique_word,entropy
count,7061.0,7061.0,7061.0,7061.0,7061.0
mean,3.889676,24.493415,44.885144,21.755134,0.71007
std,1.076866,19.734688,37.260023,15.779669,0.444561
min,1.0,4.0,5.0,4.0,0.002277
25%,3.0,15.0,26.0,14.0,0.419882
50%,4.0,18.0,33.0,17.0,0.628106
75%,5.0,25.0,48.0,23.0,0.886104
max,5.0,165.0,322.0,130.0,4.531864


In [6]:
entropy_25_low_reviews = CS_Reviews_Final.loc[CS_Reviews_Final['entropy'] <= 0.419882]
entropy_25_high_reviews = CS_Reviews_Final.loc[CS_Reviews_Final['entropy'] > 0.419882]

## Stage 2: Single Topic Modeling for Entropy_25_low

In [7]:
# min-max normalization function
def min_max_norm(column):
    return (column - column.min()) / (column.max() - column.min())

entropy_docs = entropy_25_low_reviews.processed_review.to_list()
entropy_umap_model = UMAP(n_neighbors=100, n_components=16, min_dist=0.0, metric='cosine', random_state=42)
entropy_hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=25, min_samples=1, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(entropy_docs)

# Define the range of number of topics to try
NUMBER_TOPIC = 21
num_topics_range = range(1, NUMBER_TOPIC)

# Initialize empty lists to store coherence and diversity scores
num_topic_list = list(range(1, NUMBER_TOPIC))
coherence_scores = []
diversity_scores = []

# Loop through the range of number of topics
for num_topics in num_topics_range:
    
    # Fit BERTopic model with current number of topics
    model = BERTopic(umap_model=entropy_umap_model, hdbscan_model = entropy_hdbscan_model, nr_topics=num_topics)
    topics, _ = model.fit_transform(entropy_docs)
    
    # Calculate coherence score for each cluster
    c_score = Coherence_score_topic(embeddings, topics)
    coherence_scores.append(c_score)
    
    # Calculate diversity score between clusters
    d_score = KL_Divergence_topic(embeddings, topics)
    diversity_scores.append(d_score)
    
    print(f"Number of topics: {num_topics}, Coherence score: {c_score:.4f}, Diversity score: {d_score:.4f}")


metric_df = pd.DataFrame({'Number': num_topic_list, 
                          'Coherence': coherence_scores, 
                          'Diversity': diversity_scores})

# apply min-max normalization to the "Coherence" and "Diversity" columns
metric_df["adjusted_Coherence"] = min_max_norm(metric_df["Coherence"])
metric_df["log_Diversity"] = np.log(metric_df['Diversity'] + 1)
metric_df["adjusted_Diversity"] = min_max_norm(metric_df["log_Diversity"])

#calculate mean value of two adjusted metrics
metric_df['mean'] = metric_df[['adjusted_Coherence', 'adjusted_Diversity']].mean(axis=1)

metric_df

Number of topics: 1, Coherence score: 0.0069, Diversity score: 0.0000
Number of topics: 2, Coherence score: 0.0992, Diversity score: 0.0004
Number of topics: 3, Coherence score: 0.1161, Diversity score: 0.0061
Number of topics: 4, Coherence score: 0.1160, Diversity score: 0.0234
Number of topics: 5, Coherence score: 0.0966, Diversity score: 0.0661
Number of topics: 6, Coherence score: 0.1294, Diversity score: 0.1592
Number of topics: 7, Coherence score: 0.1260, Diversity score: 0.3075
Number of topics: 8, Coherence score: 0.1391, Diversity score: 0.5523
Number of topics: 9, Coherence score: 0.1452, Diversity score: 0.9729
Number of topics: 10, Coherence score: 0.1582, Diversity score: 1.5870
Number of topics: 11, Coherence score: 0.1603, Diversity score: 2.3891
Number of topics: 12, Coherence score: 0.1737, Diversity score: 3.5462
Number of topics: 13, Coherence score: 0.1846, Diversity score: 4.9697
Number of topics: 14, Coherence score: 0.1964, Diversity score: 6.9640
Number of topic

Unnamed: 0,Number,Coherence,Diversity,adjusted_Coherence,log_Diversity,adjusted_Diversity,mean
0,1,0.006929,0.0,0.0,0.0,0.0,0.0
1,2,0.09924,0.00041,0.487145,0.00041,0.000198,0.243671
2,3,0.116123,0.006051,0.576236,0.006033,0.002908,0.289572
3,4,0.115994,0.023438,0.57556,0.023168,0.011166,0.293363
4,5,0.096587,0.066075,0.473143,0.063984,0.030837,0.25199
5,6,0.129434,0.159223,0.646483,0.14775,0.071207,0.358845
6,7,0.126041,0.307468,0.628578,0.268092,0.129206,0.378892
7,8,0.139129,0.552303,0.697648,0.43974,0.21193,0.454789
8,9,0.145151,0.972894,0.729424,0.679501,0.327482,0.528453
9,10,0.158231,1.586955,0.798448,0.950481,0.45808,0.628264


## Stage 3: assign multiple topics to high entropy data

In [8]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

entropy_low_docs = entropy_25_low_reviews.processed_review.to_list()
entropy_low_embeddings = sentence_model.encode(entropy_low_docs, show_progress_bar=True)

entropy_high_docs = entropy_25_high_reviews.processed_review.to_list()
entropy_high_embeddings = sentence_model.encode(entropy_high_docs, show_progress_bar=True)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=56.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=166.0, style=ProgressStyle(description_widt…




In [9]:
low_reduced_emb = UMAP(n_neighbors=100, n_components=16, min_dist=0.0, metric='cosine', random_state=42).fit_transform(entropy_low_embeddings)
high_reduced_emb = UMAP(n_neighbors=100, n_components=16, min_dist=0.0, metric='cosine', random_state=42).fit_transform(entropy_high_embeddings)

In [10]:
def exemplars(cluster_id, condensed_tree):
    raw_tree = condensed_tree._raw_tree
    # Just the cluster elements of the tree, excluding singleton points
    cluster_tree = raw_tree[raw_tree['child_size'] > 1]
    # Get the leaf cluster nodes under the cluster we are considering
    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)
    # Now collect up the last remaining points of each leaf cluster (the heart of the leaf)
    result = np.array([])
    for leaf in leaves:
        max_lambda = raw_tree['lambda_val'][raw_tree['parent'] == leaf].max()
        points = raw_tree['child'][(raw_tree['parent'] == leaf) &
                                   (raw_tree['lambda_val'] == max_lambda)]
        result = np.hstack((result, points))
    return result.astype(np.int)

def min_dist_to_exemplar(point, cluster_exemplars, data_low, data_high):
    dists = cdist([data_high[point]], data_low[cluster_exemplars.astype(np.int32)])
    return dists.min()

def dist_vector(point, exemplar_dict, data_low, data_high):
    result = {}
    for cluster in exemplar_dict:
        result[cluster] = min_dist_to_exemplar(point, exemplar_dict[cluster], data_low, data_high)
    return np.array(list(result.values()))

def dist_membership_vector(point, exemplar_dict, data_low, data_high, softmax=False):
    if softmax:
        result = np.exp(1./dist_vector(point, exemplar_dict, data_low, data_high))
        result[~np.isfinite(result)] = np.finfo(np.double).max
    else:
        result = 1./dist_vector(point, exemplar_dict, data_low, data_high)
        result[~np.isfinite(result)] = np.finfo(np.double).max
    result /= result.sum()
    return result

In [12]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=25, min_samples=1, metric='euclidean', cluster_selection_method='eom', prediction_data=True).fit(low_reduced_emb)
tree = clusterer.condensed_tree_
exemplar_dict = {c:exemplars(c,tree) for c in tree._select_clusters()}

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return result.astype(np.int)


In [13]:
high_membership = []
for x in range(high_reduced_emb.shape[0]):
    membership_vector = dist_membership_vector(x, exemplar_dict, low_reduced_emb, high_reduced_emb)
    high_membership.append(membership_vector)

high_membership_df = pd.DataFrame(high_membership)
high_membership_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.072363,0.071174,0.071211,0.071242,0.071425,0.071162,0.071394,0.071749,0.070107,0.070925,0.071047,0.072396,0.072059,0.071746
1,0.072334,0.071190,0.071167,0.071253,0.071396,0.071151,0.071381,0.071720,0.070114,0.070944,0.071074,0.072392,0.072093,0.071791
2,0.072309,0.071147,0.071155,0.071249,0.071366,0.071129,0.071360,0.071752,0.070132,0.070943,0.071080,0.072436,0.072131,0.071811
3,0.072382,0.071184,0.071224,0.071233,0.071446,0.071160,0.071411,0.071764,0.070091,0.070915,0.071028,0.072388,0.072036,0.071738
4,0.072364,0.071192,0.071195,0.071255,0.071438,0.071180,0.071398,0.071729,0.070084,0.070941,0.071056,0.072377,0.072049,0.071741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5291,0.072372,0.071192,0.071200,0.071238,0.071424,0.071151,0.071401,0.071744,0.070094,0.070921,0.071043,0.072393,0.072064,0.071765
5292,0.072358,0.071157,0.071196,0.071199,0.071365,0.071094,0.071416,0.071783,0.070133,0.070905,0.071032,0.072461,0.072111,0.071788
5293,0.072366,0.071178,0.071211,0.071242,0.071429,0.071164,0.071398,0.071748,0.070103,0.070926,0.071045,0.072393,0.072055,0.071742
5294,0.072381,0.071200,0.071193,0.071244,0.071443,0.071164,0.071412,0.071744,0.070065,0.070931,0.071040,0.072384,0.072047,0.071753


In [14]:
thresholds = high_membership_df.quantile(0.25)

for i, threshold in enumerate(thresholds):
    new_col_name = f'topic_{i}'
    high_membership_df[new_col_name] = np.where(high_membership_df.iloc[:, i] <= threshold, f'topic_{i}', 0)

high_topics_review_w_prob = pd.concat([entropy_25_high_reviews.reset_index(drop=True), high_membership_df], axis=1)
high_topics_review_w_prob.to_csv("high_entropy_multi_q_25.csv")
high_membership_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13
0,0.072363,0.071174,0.071211,0.071242,0.071425,0.071162,0.071394,0.071749,0.070107,0.070925,...,0,0,0,0,0,0,0,0,0,topic_13
1,0.072334,0.071190,0.071167,0.071253,0.071396,0.071151,0.071381,0.071720,0.070114,0.070944,...,0,0,topic_6,topic_7,0,0,0,0,0,0
2,0.072309,0.071147,0.071155,0.071249,0.071366,0.071129,0.071360,0.071752,0.070132,0.070943,...,topic_4,0,topic_6,0,0,0,0,0,0,0
3,0.072382,0.071184,0.071224,0.071233,0.071446,0.071160,0.071411,0.071764,0.070091,0.070915,...,0,0,0,0,0,topic_9,topic_10,topic_11,topic_12,topic_13
4,0.072364,0.071192,0.071195,0.071255,0.071438,0.071180,0.071398,0.071729,0.070084,0.070941,...,0,0,0,topic_7,0,0,0,topic_11,topic_12,topic_13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5291,0.072372,0.071192,0.071200,0.071238,0.071424,0.071151,0.071401,0.071744,0.070094,0.070921,...,0,0,0,0,0,0,0,0,0,0
5292,0.072358,0.071157,0.071196,0.071199,0.071365,0.071094,0.071416,0.071783,0.070133,0.070905,...,topic_4,topic_5,0,0,0,topic_9,0,0,0,0
5293,0.072366,0.071178,0.071211,0.071242,0.071429,0.071164,0.071398,0.071748,0.070103,0.070926,...,0,0,0,0,0,0,0,0,0,topic_13
5294,0.072381,0.071200,0.071193,0.071244,0.071443,0.071164,0.071412,0.071744,0.070065,0.070931,...,0,0,0,0,topic_8,0,0,topic_11,topic_12,0


In [15]:
thresholds = high_membership_df.quantile(0.5)

for i, threshold in enumerate(thresholds):
    new_col_name = f'topic_{i}'
    high_membership_df[new_col_name] = np.where(high_membership_df.iloc[:, i] <= threshold, f'topic_{i}', 0)

high_topics_review_w_prob = pd.concat([entropy_25_high_reviews.reset_index(drop=True), high_membership_df], axis=1)
high_topics_review_w_prob.to_csv("high_entropy_multi_q_5.csv")
high_membership_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13
0,0.072363,0.071174,0.071211,0.071242,0.071425,0.071162,0.071394,0.071749,0.070107,0.070925,...,0,0,topic_6,topic_7,0,topic_9,topic_10,topic_11,topic_12,topic_13
1,0.072334,0.071190,0.071167,0.071253,0.071396,0.071151,0.071381,0.071720,0.070114,0.070944,...,topic_4,0,topic_6,topic_7,0,0,0,topic_11,0,0
2,0.072309,0.071147,0.071155,0.071249,0.071366,0.071129,0.071360,0.071752,0.070132,0.070943,...,topic_4,topic_5,topic_6,topic_7,0,0,0,0,0,0
3,0.072382,0.071184,0.071224,0.071233,0.071446,0.071160,0.071411,0.071764,0.070091,0.070915,...,0,0,0,0,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13
4,0.072364,0.071192,0.071195,0.071255,0.071438,0.071180,0.071398,0.071729,0.070084,0.070941,...,0,0,topic_6,topic_7,topic_8,0,0,topic_11,topic_12,topic_13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5291,0.072372,0.071192,0.071200,0.071238,0.071424,0.071151,0.071401,0.071744,0.070094,0.070921,...,0,0,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,0
5292,0.072358,0.071157,0.071196,0.071199,0.071365,0.071094,0.071416,0.071783,0.070133,0.070905,...,topic_4,topic_5,0,0,0,topic_9,topic_10,0,0,0
5293,0.072366,0.071178,0.071211,0.071242,0.071429,0.071164,0.071398,0.071748,0.070103,0.070926,...,0,0,topic_6,topic_7,0,topic_9,topic_10,topic_11,topic_12,topic_13
5294,0.072381,0.071200,0.071193,0.071244,0.071443,0.071164,0.071412,0.071744,0.070065,0.070931,...,0,0,0,topic_7,topic_8,0,topic_10,topic_11,topic_12,topic_13
