Check that PyTorch and the CUDA drivers can detect the GPU

In [1]:
import torch
# Check whether the GPU is recognized and show version info
gpu_found = torch.cuda.is_available()
print(gpu_found)
if gpu_found:
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU found")
print(torch.__version__)
print(torch.version.cuda)
print("cuDNN Version:", torch.backends.cudnn.version())
# Start the session with an empty CUDA cache
torch.cuda.empty_cache()

Import libraries

In [2]:
import json
import os
import sqlite3
from concurrent.futures import ThreadPoolExecutor

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from scipy.spatial.distance import euclidean
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from umap import UMAP

Connect to the db to extract the content

In [4]:
# Read the `id_str` and `norm_tweets` columns of table `emb_tweet_v2` into memory.
conn2 = sqlite3.connect(r"C:\Users\20232788\Desktop\DBL-1\tweets.db")
try:
    cursor2 = conn2.cursor()
    cursor2.execute("SELECT id_str, norm_tweets FROM emb_tweet_v2")

    # Fetch all the results
    conv_json = cursor2.fetchall()
finally:
    # Release the database file even when the query fails; the rows are
    # already held in `conv_json`, so the connection is not needed afterwards.
    conn2.close()

In [5]:
# Wrap the fetched (id_str, norm_tweets) rows in a DataFrame; the text column is renamed to 'text'.
tweets_df = pd.DataFrame(conv_json, columns=['id_str', 'text'])

In [5]:
# NOTE(review): `split` is not referenced by any other cell visible in this notebook.
split = tweets_df.shape[0] // 2
# Keep only the rows from position 3,500,000 onwards. The embedding cell below
# hard-codes the same offset in its output file names, so keep the two in sync.
df_split_final = tweets_df[3500000:]

Sentence Transformer Embeddings

In [11]:
#TEST FOR GPU ACTIVATION
# Important, you need to shield your code with if __name__. Otherwise, CUDA runs into issues when spawning new processes.
if __name__ == "__main__":
    # Load the sentence-transformer directly onto the GPU
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda')

    # Encode the first 500 tweets one by one and keep the resulting vectors
    sample_texts = tweets_df['text'][:500]
    batch_embeddings = [
        model.encode([tweet], device='cuda')[0]
        for tweet in tqdm(sample_texts, desc="Encoding Tweets")
    ]

In [23]:
#Embedding process (runtime depends heavily on the capacity of your computer)
if __name__ == "__main__":

    model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda')

    save_dir = '/mnt/c/Users/20232788/Desktop/DBL-1/sentence_transformer_embeddings'
    os.makedirs(save_dir, exist_ok=True)

    # Number of tweets written to each .npz file
    chunk_size = 10000
    # Position in tweets_df at which df_split_final starts. File names carry
    # 1-based row positions relative to tweets_df, so this must match the slice
    # start used when df_split_final was built.
    row_offset = 3500000

    # Maps tweet id -> embedding vector for the chunk currently being processed
    embeddings_dict = {}

    # Number of rows of df_split_final encoded so far
    counter = 0

    total_rows = df_split_final.shape[0]

    for chunk_start in tqdm(range(0, total_rows, chunk_size), desc="Encoding Tweets"):
        chunk = df_split_final.iloc[chunk_start:chunk_start + chunk_size]

        # Encode the whole chunk in one call so the model can batch the texts
        # on the GPU instead of being invoked once per tweet.
        vectors = model.encode(chunk['text'].tolist(), device='cuda', show_progress_bar=False)

        embeddings_dict = dict(zip(chunk['id_str'], vectors))
        counter = chunk_start + len(chunk)

        # Every file (including the final, shorter one) is named with the same
        # offset so that the ranges in the directory are consistent.
        first_row = row_offset + chunk_start + 1
        last_row = row_offset + counter
        np.savez(os.path.join(save_dir, f'embeddings_{first_row}-{last_row}.npz'), **embeddings_dict)

        # Only a trailing partial chunk is left in embeddings_dict after the
        # loop; it has already been written to disk above.
        if len(chunk) == chunk_size:
            embeddings_dict.clear()


In [25]:
# Total number of tweets loaded from the database.
len(tweets_df)

In [24]:
# Number of embeddings still held in memory after the embedding cell finished.
len(embeddings_dict)

In [26]:
# Manually persist the embeddings still held in `embeddings_dict`.
# The range in the file name is written out by hand; the start was previously
# '441001', which does not fit a range ending at 4413045 with 10,000-row chunks.
np.savez(os.path.join(save_dir, 'embeddings_4410001-4413045.npz'), **embeddings_dict)

Test load one file and convert it to df

In [6]:
# Open a single chunk archive to check that the saved files can be read back.
npz_dir = '/mnt/c/Users/20232788/Desktop/DBL-1/sentence_transformer_embeddings'
filename = 'embeddings_10001-20000.npz'
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays; only use it on files you created yourself.
file_embeddings_dict = np.load(os.path.join(npz_dir, filename), allow_pickle=True)


In [7]:
# Each array name stored in the archive is a tweet id; read every array out of the file.
keys = file_embeddings_dict.files
embeddings = [file_embeddings_dict[key] for key in keys]

# One row per tweet: the id and its embedding vector (stored as an object column).
emb_1 = pd.DataFrame({'id_str': keys, 'embedding': embeddings})

In [8]:
# Column dtypes and non-null counts of the test frame.
emb_1.info()

Function from directory to list of npz formatted dictionaries

In [6]:
npz_dir = r'C:\Users\20232788\Desktop\DBL-1\sentence_transformer_embeddings'
def load_embeddings(filename):
    """Open one embeddings archive located in ``npz_dir``.

    Args:
        filename (str): Name of a file inside ``npz_dir``.

    Returns:
        The object returned by ``np.load`` for a name ending in '.npz';
        an empty dict for any other name.
    """
    # Anything that is not an .npz archive yields an empty placeholder
    if not filename.endswith('.npz'):
        return {}
    archive_path = os.path.join(npz_dir, filename)
    return np.load(archive_path, allow_pickle=True)
# Only chunk archives are loaded. Other directory entries would come back as
# empty placeholders, and 'combined_embeddings.npz' (written into this same
# directory by a later cell) would duplicate every embedding on a re-run.
filenames = sorted(
    name for name in os.listdir(npz_dir)
    if name.endswith('.npz') and name != 'combined_embeddings.npz'
)

# Use a ThreadPoolExecutor to parallelize the loading of the embeddings
with ThreadPoolExecutor() as executor:
    npz_files = list(tqdm(executor.map(load_embeddings, filenames), total=len(filenames), desc="Loading embeddings"))

In [14]:
# Show the loaded archives (one list entry per file name that was mapped).
npz_files

From npz formatted dictionaries to one single dict

In [7]:
# Merge all chunk archives into one dict: tweet id -> embedding vector
data_dict = {}

for npz_file in tqdm(npz_files, desc="Processing files", unit="file"):
    # load_embeddings returns a plain dict for non-.npz entries; those have no
    # `.files` attribute and contain nothing to merge.
    if not hasattr(npz_file, "files"):
        continue

    for key in npz_file.files:
        data_dict[key] = npz_file[key]

# Persist the merged mapping next to the chunk files
np.savez(os.path.join(npz_dir, 'combined_embeddings.npz'), **data_dict)

In [1]:
# Add the embeddings to the original df and store it locally for easier later usage.
# Rows whose id_str has no entry in data_dict end up with NaN in 'embedding'.
tweets_df['embedding'] = tweets_df['id_str'].map(data_dict)
tweets_df.to_pickle(r'C:\Users\20232788\Desktop\DBL-1\complete_sentences_emb\tweets_df.pkl')

In [6]:
# Report whether any cell of the frame is null
has_nulls = tweets_df.isnull().values.any()
print("DataFrame contains null values." if has_nulls else "DataFrame does not contain null values.")

<span style="font-size:2.5em;">TOPIC MODELLING</span>

In [10]:
# Reload the frame (ids, text, embeddings) written by the embedding stage.
# NOTE(review): pickle files execute code on load; only read pickles you created yourself.
tweets_df = pd.read_pickle(r'C:\Users\20232788\Desktop\DBL-1\complete_sentences_emb\tweets_df.pkl')

In [4]:
# Preview the first rows of the reloaded frame.
tweets_df.head()

In [11]:
# Split the df into two halves for easier computation (4.4M rows do not fit in RAM at once)
half = len(tweets_df) // 2
tweets_df_fh, tweets_df_sh = tweets_df.iloc[:half], tweets_df.iloc[half:]

In [12]:
# Pull the 'embedding' column of each half out as a NumPy array (one element per tweet).
emb_array_sh = tweets_df_sh['embedding'].to_numpy()
emb_array_fh = tweets_df_fh['embedding'].to_numpy()

In [7]:
# Shape of the first-half embedding array.
emb_array_fh.shape

In [13]:
# Convert the per-tweet embedding vectors into one dense 2-D matrix per half
sub_array_length = 384  # expected length of every embedding vector

# np.stack raises if any row is missing or has a different length, instead of
# letting a malformed array reach the topic model.
emb_array_sh = np.stack(emb_array_sh)
emb_array_fh = np.stack(emb_array_fh)

for half_name, half_matrix in (("second", emb_array_sh), ("first", emb_array_fh)):
    if half_matrix.ndim != 2 or half_matrix.shape[1] != sub_array_length:
        raise ValueError(
            f"{half_name}-half embeddings have shape {half_matrix.shape}, "
            f"expected (n, {sub_array_length})"
        )

print([emb_array_sh.shape, emb_array_fh.shape])

In [24]:
# Drop the name of the full frame; the two halves and the embedding matrices are used from here on.
del tweets_df

Set up the models (the topic computation is repeated twice, once for each half of the data)

In [8]:
# Dimensionality reduction passed to BERTopic below; random_state is fixed to 42.
umap_model = UMAP(n_neighbors=30, n_components=5, min_dist=0.0, metric='cosine', random_state=42, low_memory=True)

In [10]:
# Bag-of-words over unigrams and bigrams, English stop words removed, terms must occur in at least 2 documents.
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [11]:
# Class-based TF-IDF weighting passed to BERTopic below.
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

In [4]:
# Load the sentence-transformer on the GPU. It is handed to BERTopic as the
# embedding model and reused further down to embed topic keyword lists.
if __name__ == "__main__":
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda')

In [13]:
# Assemble BERTopic from the components configured in the cells above.
topic_model_umap = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    min_topic_size=100
)

# Fit on the second-half tweets, reusing their precomputed embeddings.
# NOTE(review): this cell only covers tweets_df_sh; the first-half model is
# presumably produced by re-running it with tweets_df_fh / emb_array_fh.
topics_d, probs_d = topic_model_umap.fit_transform(tweets_df_sh['text'], emb_array_sh)

In [14]:
# Save the topic model for later use.
# `save_embedding_model` takes a string pointer to the embedding model for the
# "pytorch"/"safetensors" serializations; a model object is not recorded.
# NOTE(review): with serialization="pytorch" BERTopic writes a directory at this
# path, so the ".pkl" suffix is only a name and is kept because later cells load it.
topic_model_umap.save(r"C:\Users\20232788\Desktop\DBL-1\topic_models_bertopic\topic_model_umap_2.pkl", serialization="pytorch", save_ctfidf=True, save_embedding_model="sentence-transformers/all-MiniLM-L6-v2")

load topic models and clean topics

In [4]:
# Load the saved models for the first half (topic_model_umap.pkl) and the second
# half (topic_model_umap_2.pkl), re-attaching the in-memory sentence-transformer.
topic_model_umap_fh = BERTopic.load(r"C:\Users\20232788\Desktop\DBL-1\topic_models_bertopic\topic_model_umap.pkl", embedding_model=model)
topic_model_umap_sh = BERTopic.load(r"C:\Users\20232788\Desktop\DBL-1\topic_models_bertopic\topic_model_umap_2.pkl", embedding_model=model)

In [2]:
#topic_model_umap_sh.visualize_topics()

In [42]:
# Overview table of the second-half topics.
topic_model_umap_sh.get_topic_info()

Merge topics by distance. Explanation of the process:
Get the topics and their word lists and store them in a dictionary.
Each topic also contains a probability for every word (not needed at the moment, so they are dropped); the words of each topic are then flattened into a single entry.
Compute vector embeddings for all of the entries, calculate the pairwise (Euclidean) distances, and compute their mean and standard deviation.
Set a threshold variable, create adjacency lists of the similar topics, simplify the structure of the lists, and merge the topics!
Repeat the process until the desired number of topics is reached.

In [61]:
# NOTE(review): within this notebook `topic_model_umap_fh_cl` is only assigned
# by a BERTopic.load cell further down; run that cell first.
# Collect the (word, weight) list of every real topic. `topics_` holds one entry
# per document, so its length is the number of tweets, not the number of topics;
# iterate over the model's topic ids instead. The outlier topic -1 is left out.
topic_dicts_fh = {
    topic_id: words
    for topic_id, words in sorted(topic_model_umap_fh_cl.get_topics().items())
    if topic_id >= 0
}
num_topics = len(topic_dicts_fh)

# Keep only the words of each topic, dropping their weights
rep_dict_fh = {k: [item[0] for item in v] for k, v in topic_dicts_fh.items() if isinstance(v, list)}

In [62]:
# NOTE(review): within this notebook `topic_model_umap_sh_cl` is only assigned
# by a BERTopic.load cell further down; run that cell first.
# Collect the (word, weight) list of every real topic. `topics_` holds one entry
# per document, so its length is the number of tweets, not the number of topics;
# iterate over the model's topic ids instead. The outlier topic -1 is left out.
topic_dicts_sh = {
    topic_id: words
    for topic_id, words in sorted(topic_model_umap_sh_cl.get_topics().items())
    if topic_id >= 0
}
num_topics = len(topic_dicts_sh)

# Keep only the words of each topic, dropping their weights
rep_dict_sh = {k: [item[0] for item in v] for k, v in topic_dicts_sh.items() if isinstance(v, list)}

In [63]:
# Embed each first-half topic: its words are joined into one sentence and encoded
key_vectors_fh = {
    topic_id: model.encode(" ".join(keywords), device='cuda')
    for topic_id, keywords in rep_dict_fh.items()
}

# Euclidean distance for every ordered pair of distinct topics,
# so both (a, b) and (b, a) are present
distances_fh = {
    (id_a, id_b): euclidean(vec_a, vec_b)
    for id_a, vec_a in key_vectors_fh.items()
    for id_b, vec_b in key_vectors_fh.items()
    if id_a != id_b
}

In [64]:
# Embed each second-half topic: its words are joined into one sentence and encoded
key_vectors_sh = {
    topic_id: model.encode(" ".join(keywords), device='cuda')
    for topic_id, keywords in rep_dict_sh.items()
}

# Euclidean distance for every ordered pair of distinct topics,
# so both (a, b) and (b, a) are present
distances_sh = {
    (id_a, id_b): euclidean(vec_a, vec_b)
    for id_a, vec_a in key_vectors_sh.items()
    for id_b, vec_b in key_vectors_sh.items()
    if id_a != id_b
}

In [65]:
# Summary statistics of the pairwise topic distances, per half
dist_values_fh = list(distances_fh.values())
dist_values_sh = list(distances_sh.values())

mean_distance_fh, std_distance_fh = np.mean(dist_values_fh), np.std(dist_values_fh)
mean_distance_sh, std_distance_sh = np.mean(dist_values_sh), np.std(dist_values_sh)

print(mean_distance_fh, std_distance_fh)
print(mean_distance_sh, std_distance_sh)

In [66]:
# Topic pairs closer than (mean - 1.5 * std) are treated as similar below; 1.5 is a hard-coded factor.
threshold_fh = mean_distance_fh - 1.5 * std_distance_fh
threshold_sh = mean_distance_sh - 1.5 * std_distance_sh

In [67]:
# Collect the below-threshold topic pairs of the first half into runs ("groups").
# The result depends on the iteration order of distances_fh.
groups_fh = []

# Iterate over each (topic_a, topic_b) -> distance entry
for key, value in distances_fh.items():
    # Only pairs closer than the threshold are considered
    if value < threshold_fh:
        # Start a new group when there is no group yet, or when the pair does not chain
        # onto the most recently added pair (its first id is not that pair's second id
        # and its second id is not that pair's first id)
        if not groups_fh or not (key[0] == groups_fh[-1][-1][1] or key[1] == groups_fh[-1][-1][0]):
            # Start a new group with the key
            groups_fh.append([key])
        else:
            # Otherwise, add the key to the last group
            groups_fh[-1].append(key)

In [68]:
# Initialize a set to keep track of the second numbers that have been used
used_second_numbers = set()

# Drop pairs whose second topic id has already been claimed by an earlier pair.
# Each group is rebuilt and written back in place: removing items with pop()
# while looping over the original index range skips elements and runs past the
# end of the shortened list.
for group in groups_fh:
    kept_pairs = []
    for pair in group:
        second = pair[1]
        if second in used_second_numbers:
            continue
        # Only second ids below 10 are recorded as used
        if second < 10:
            used_second_numbers.add(second)
        kept_pairs.append(pair)
    group[:] = kept_pairs

In [69]:
# Collect the below-threshold topic pairs of the second half into runs ("groups").
# The result depends on the iteration order of distances_sh.
groups_sh = []

# Iterate over each (topic_a, topic_b) -> distance entry
for key, value in distances_sh.items():
    # Only pairs closer than the threshold are considered
    if value < threshold_sh:
        # Start a new group when there is no group yet, or when the pair does not chain
        # onto the most recently added pair (its first id is not that pair's second id
        # and its second id is not that pair's first id)
        if not groups_sh or not (key[0] == groups_sh[-1][-1][1] or key[1] == groups_sh[-1][-1][0]):
            # Start a new group with the key
            groups_sh.append([key])
        else:
            # Otherwise, add the key to the last group
            groups_sh[-1].append(key)

In [70]:
# Initialize a set to keep track of the second numbers that have been used
used_second_numbers = set()

# Drop pairs whose second topic id has already been claimed by an earlier pair.
# Each group is rebuilt and written back in place: removing items with pop()
# while looping over the original index range skips elements and runs past the
# end of the shortened list.
for group in groups_sh:
    kept_pairs = []
    for pair in group:
        second = pair[1]
        if second in used_second_numbers:
            continue
        # Only second ids below 10 are recorded as used
        if second < 10:
            used_second_numbers.add(second)
        kept_pairs.append(pair)
    group[:] = kept_pairs

In [71]:
def replace_numbers(lst):
    """Redirect pairs onto the id their left element was previously paired with.

    Walks every (left, right) pair of every sub-list in order. When ``left`` has
    not been seen as the right element of an earlier unmodified pair, the
    mapping ``right -> left`` is recorded. Otherwise the pair is rewritten to
    ``(recorded id for left, right)``; no mapping is recorded for a rewritten
    pair.

    Args:
        lst (list): List of lists of 2-element pairs; modified in place.

    Returns:
        list: The same ``lst`` object.
    """
    parent_of = {}
    for pairs in lst:
        for pos, (src, dst) in enumerate(pairs):
            if src not in parent_of:
                parent_of[dst] = src
                continue
            pairs[pos] = (parent_of[src], dst)
    return lst

In [72]:
# Rewrite chained pairs in place, then keep only the first pair of every
# non-empty group, converted from a tuple to a two-element list
replace_numbers(groups_fh)
groups_fh = [list(group[0]) for group in groups_fh if group]

In [73]:
# Rewrite chained pairs in place, then keep only the first pair of every
# non-empty group, converted from a tuple to a two-element list
replace_numbers(groups_sh)
groups_sh = [list(group[0]) for group in groups_sh if group]

In [74]:
# First topic id of every merge pair, per half
numbers_fh = [pair[0] for pair in groups_fh]
numbers_sh = [pair[0] for pair in groups_sh]

# How many distinct first ids there are in each half
un_num_fh = len({*numbers_fh})
un_num_sh = len({*numbers_sh})

print(f'unique num first half {un_num_fh}')
print(f'unique num second half {un_num_sh}')

In [75]:
# Apply the automatically derived merge pairs to each half's model.
# NOTE(review): the same topic id can appear in more than one pair of groups_*; confirm merge_topics accepts that.
topic_model_umap_fh_cl.merge_topics(tweets_df_fh['text'], groups_fh)
topic_model_umap_sh_cl.merge_topics(tweets_df_sh['text'], groups_sh)

In [58]:
# Topic overview of the second-half model.
topic_model_umap_sh_cl.get_topic_info()

In [59]:
# Topic overview of the first-half model.
topic_model_umap_fh_cl.get_topic_info()

Manually merge the last remaining topics if some are still similar

In [87]:
# Hard-coded merges: topics 5 and 6 in the first half, topics 2 and 6 in the second half.
# NOTE(review): these ids presumably refer to the numbering after the automatic merge; check get_topic_info() before re-running.
topic_model_umap_fh_cl.merge_topics(tweets_df_fh['text'], [5, 6])
topic_model_umap_sh_cl.merge_topics(tweets_df_sh['text'], [2, 6])

In [82]:
# Topic overview of the first-half model.
topic_model_umap_fh_cl.get_topic_info()

In [88]:
# Topic overview of the second-half model.
topic_model_umap_sh_cl.get_topic_info()

In [90]:
# Attach readable labels to topic ids -1 through 5 of each model.
# The same two mappings are repeated further down when the 'topic' column is built; keep them in sync.
topic_model_umap_fh_cl.set_topic_labels({-1: "Customer Services", 0: "Flight Cancellation/Refounds", 1:'Inclusivity/Racism', 2:'Destination', 3:'Food', 4:'Online Booking', 5:'Onboard Services'})
topic_model_umap_sh_cl.set_topic_labels({-1: "Customer Services", 0: "Flight Cancellation/Refounds", 1:'Easyjet Franchise', 2:'Food', 3:'Onboard Services', 4:'Oguna/Lufthansa scandal', 5:'Health Issues'})

In [91]:
# Bar chart of the first-half topics, using the custom labels.
topic_model_umap_fh_cl.visualize_barchart(custom_labels=True)

In [92]:
# Bar chart of the second-half topics, using the custom labels.
topic_model_umap_sh_cl.visualize_barchart(custom_labels=True)

Save the models and set custom labels in the dfs

In [93]:
# Save both cleaned models.
# `save_embedding_model` takes a string pointer to the embedding model for the
# "pytorch"/"safetensors" serializations; a model object is not recorded.
# NOTE(review): with serialization="pytorch" BERTopic writes a directory at each
# path, so the ".pkl" suffix is only a name and is kept because later cells load it.
topic_model_umap_fh_cl.save(r"C:\Users\20232788\Desktop\DBL-1\topic_models_bertopic\topic_model_umap_fh_cl.pkl", serialization="pytorch", save_ctfidf=True, save_embedding_model="sentence-transformers/all-MiniLM-L6-v2")
topic_model_umap_sh_cl.save(r"C:\Users\20232788\Desktop\DBL-1\topic_models_bertopic\topic_model_umap_sh_cl.pkl", serialization="pytorch", save_ctfidf=True, save_embedding_model="sentence-transformers/all-MiniLM-L6-v2")

In [99]:
# Label every first-half tweet with the name of its topic.
topic_names = {-1: "Customer Services", 0: "Flight Cancellation/Refounds", 1:'Inclusivity/Racism', 2:'Destination', 3:'Food', 4:'Online Booking', 5:'Onboard Services'}
# tweets_df_fh is an iloc slice of tweets_df, so build a new frame with `assign`
# instead of setting a column on the slice (avoids SettingWithCopyWarning).
# Topic ids missing from `topic_names` become NaN.
tweets_df_fh = tweets_df_fh.assign(
    topic=pd.Series(topic_model_umap_fh_cl.topics_, index=tweets_df_fh.index).map(topic_names)
)

In [100]:
# Label every second-half tweet with the name of its topic.
topic_names = {-1: "Customer Services", 0: "Flight Cancellation/Refounds", 1:'Easyjet Franchise', 2:'Food', 3:'Onboard Services', 4:'Oguna/Lufthansa scandal', 5:'Health Issues'}
# tweets_df_sh is an iloc slice of tweets_df, so build a new frame with `assign`
# instead of setting a column on the slice (avoids SettingWithCopyWarning).
# Topic ids missing from `topic_names` become NaN.
tweets_df_sh = tweets_df_sh.assign(
    topic=pd.Series(topic_model_umap_sh_cl.topics_, index=tweets_df_sh.index).map(topic_names)
)

In [2]:
# Merge the two halves back into one frame (first half followed by second half).
tweets_df = pd.concat([tweets_df_fh, tweets_df_sh])

In [109]:
# Preview the recombined frame.
tweets_df.head()

Store both dfs locally (one with the embeddings, the other without)

In [3]:
# Lightweight copy without the embedding column.
tweets_df2 = tweets_df[['id_str', 'text', 'topic']]

In [4]:
# Overwrites the pickle written by the embedding stage (same path), now including the 'topic' column.
tweets_df.to_pickle(r'C:\Users\20232788\Desktop\DBL-1\complete_sentences_emb\tweets_df.pkl')

In [5]:
# Store the version without embeddings.
tweets_df2.to_pickle(r'C:\Users\20232788\Desktop\DBL-1\complete_sentences_emb\tweets_df2.pkl')

In [5]:
# Reload the cleaned models from disk, re-attaching the in-memory sentence-transformer.
# Cells earlier in the notebook already reference these two names.
topic_model_umap_fh_cl = BERTopic.load(r"C:\Users\20232788\Desktop\DBL-1\topic_models_bertopic\topic_model_umap_fh_cl.pkl", embedding_model=model)
topic_model_umap_sh_cl = BERTopic.load(r"C:\Users\20232788\Desktop\DBL-1\topic_models_bertopic\topic_model_umap_sh_cl.pkl", embedding_model=model)

In [6]:
from scipy.special import rel_entr

def kl_divergence(p, q):
    """Calculate the Kullback-Leibler Divergence between two distributions.

    Both inputs are rescaled to sum to 1 before the divergence is computed, so
    non-negative weight vectors (such as raw topic word weights) can be passed
    directly. Inputs that already sum to 1 give the same result as before.
    A vector whose sum is not positive is used unchanged.

    Args:
        p (np.array): The first distribution or weight vector (topic distribution).
        q (np.array): The second distribution or weight vector (uniform distribution).

    Returns:
        float: The Kullback-Leibler Divergence D(p || q).
    """
    def _normalise(weights):
        # Rescale to a probability distribution when there is mass to rescale
        total = weights.sum()
        return weights / total if total > 0 else weights

    p = _normalise(np.asarray(p, dtype=float))
    q = _normalise(np.asarray(q, dtype=float))
    return np.sum(rel_entr(p, q))

def get_uniform_distribution(vocab_size):
    """Build a uniform distribution over ``vocab_size`` entries.

    Args:
        vocab_size (int): Number of entries in the distribution.

    Returns:
        np.array: Array of length ``vocab_size`` in which every entry is ``1/vocab_size``.
    """
    probability = 1 / vocab_size
    return np.full(shape=vocab_size, fill_value=probability)

def get_topic_distribution(topic_model, topic_id):
    """Collect the word weights of one topic.

    Args:
        topic_model: Object exposing ``get_topic(topic_id)``, which yields
            (word, weight) pairs for an existing topic.
        topic_id (int): The topic id.

    Returns:
        np.array: The weights in the order returned by ``get_topic`` (no
        normalisation is applied here), or None when ``get_topic`` returns a
        falsy value.
    """
    word_weights = topic_model.get_topic(topic_id)
    if not word_weights:
        return None
    return np.array([pair[1] for pair in word_weights])

def calculate_kld_for_all_topics(topic_model):
    """Calculate KLD scores for all topics in the topic model.

    Each topic's word weights are compared against a uniform distribution of
    the same length. Negative topic ids (the outlier topic -1) are skipped.
    The topic ids are taken from ``get_topics()``, so the function does not
    require topic 0 to exist or all topics to have the same number of words.

    Args:
        topic_model: The topic model object.

    Returns:
        dict: A dictionary with topic ids as keys and KLD scores as values.
    """
    kld_scores = {}
    for topic_id in sorted(topic_model.get_topics()):
        if topic_id < 0:
            continue
        topic_distribution = get_topic_distribution(topic_model, topic_id)
        if topic_distribution is None:
            continue
        # Size the reference distribution to this topic's own word list
        uniform_distribution = get_uniform_distribution(len(topic_distribution))
        kld_scores[topic_id] = kl_divergence(topic_distribution, uniform_distribution)

    return kld_scores

In [22]:
def plot_kld_scores(kld_scores):
    """Draw a bar chart with one bar per entry of ``kld_scores``.

    Args:
        kld_scores (dict): A dictionary with topic ids as keys and KLD scores as values.
    """
    topic_ids = list(kld_scores)
    score_values = [kld_scores[topic_id] for topic_id in topic_ids]
    # One colour per bar, spread evenly over the 'cool' colormap
    bar_colors = plt.cm.cool(np.linspace(0, 1, len(score_values)))

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(topic_ids, score_values, color=bar_colors)
    ax.set_xlabel('Topic ID')
    ax.set_ylabel('KLD Score')
    ax.set_title('KLD Scores for All Topics')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    ax.set_xticks(topic_ids)
    fig.tight_layout()
    plt.show()

In [21]:
# KLD score per topic of the first-half model, then plotted.
kld_scores_topic_model_umap_fh_cl = calculate_kld_for_all_topics(topic_model_umap_fh_cl)
plot_kld_scores(kld_scores_topic_model_umap_fh_cl)

In [23]:
# KLD score per topic of the second-half model, then plotted.
kld_scores_topic_model_umap_sh_cl = calculate_kld_for_all_topics(topic_model_umap_sh_cl)
plot_kld_scores(kld_scores_topic_model_umap_sh_cl)

In [14]:
from sklearn.decomposition import PCA
# Project the embeddings to 2 dimensions for plotting.
# The same PCA object is fitted separately on each half, so the two projections do not share axes.
pca = PCA(n_components=2)
reduced_embeddings_fh = pca.fit_transform(emb_array_fh)
reduced_embeddings_sh = pca.fit_transform(emb_array_sh)

In [16]:
# Plot the first-half documents using the 2-D PCA coordinates and export the figure as a standalone HTML file.
fig = topic_model_umap_fh_cl.visualize_documents(tweets_df_fh['text'], reduced_embeddings=reduced_embeddings_fh)
fig.write_html(r"C:\Users\20232788\Desktop\DBL-1\topics_image_fh.html")