In [None]:
# Fetch the NLTK 'stopwords' corpus; it is read later via nltk.corpus.stopwords.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import os
import sys
import h5py
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.evaluation.measures import CoherenceNPMI, TopicDiversity
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

  from collections import Iterable


# Loading Data

In [None]:
# Filesystem locations shared by the cells below.
# Location of the precomputed embedding HDF5 file.
embeddings_path = "/data/processed/embeddings"

# CTM artefacts: intermediate data, per-configuration topic tables, saved models.
data_saving_path = "/data/processed/CTM"
topics_saving_path = f"{data_saving_path}/topics"
model_saving_path = "/models/ctm_models"

In [None]:
# Load the raw tweets table from parquet; its 'text' column feeds the preprocessing step.
world_anti_tweets = pd.read_parquet("/data/raw/world_anti_tweets_and_ids.parquet")
# Bare last expression: rendered as the cell output for a quick visual check.
world_anti_tweets

In [None]:
# Size the HDF5 chunk cache from the footprint of one 64 x 1024 float32 block
# plus 10% headroom, with a floor of 1 MiB (1024 * 1024 bytes).
# NOTE(review): 64 x 1024 presumably mirrors the batch/chunk layout of the
# embedding file -- confirm against how that file was written.
# The footprint is computed arithmetically (no throwaway random array) and
# truncated to an int, because the value is handed to h5py as a byte count
# (rdcc_nbytes) and must stay integral even if it ever exceeds the 1 MiB floor.
embedding_cache_size = int(64 * 1024 * np.dtype(np.float32).itemsize * 1.1)
cache_size = max(embedding_cache_size, 1024 * 1024)

In [None]:
# Open the embedding store read-only, using the chunk-cache size computed above.
h5file_path = f"{embeddings_path}/world_anti_embeddings.hdf5"
embedding_h5file = h5py.File(h5file_path, mode="r", rdcc_nbytes=cache_size)

# Handle to the 'embeddings' dataset inside the open file.
embeddings = embedding_h5file["embeddings"]

# The file is left open on purpose while `embeddings` is in use; close it
# manually once the notebook is finished with it:
# embedding_h5file.close()

## Preprocessing and Data Saving

In [None]:
# Run whitespace preprocessing with English stop words over the raw tweet texts.
# NOTE(review): the third positional argument (10000) is presumably the
# vocabulary size -- confirm against the WhiteSpacePreprocessingStopwords signature.
stop_words = nltk.corpus.stopwords.words('english')
raw_texts = world_anti_tweets['text'].values.tolist()

sp = WhiteSpacePreprocessingStopwords(raw_texts, stop_words, 10000)
(
    preprocessed_documents,
    unpreprocessed_documents,
    vocab,
    retained_indices,
) = sp.preprocess()



In [None]:
# Persist the preprocessed documents as a single-column ('docs') parquet file.
preprocessed_docs_frame = pd.DataFrame({'docs': np.array(preprocessed_documents)})
preprocessed_docs_frame.to_parquet(
    f"{data_saving_path}/preprocessed_documents.parquet",
    index=False,
)

In [None]:
# Bundle the retained indices and the unpreprocessed texts returned by the
# preprocessing step into one frame and write it to parquet.
# Note: `unpreprocessed_documents` is rebound to that DataFrame here.
retained_indices = np.array(retained_indices)
unpreprocessed_documents = pd.DataFrame(
    {
        'retained_indices': retained_indices,
        'text': np.array(unpreprocessed_documents),
    }
)

sp_results_path = f"{data_saving_path}/sp_preprocess_results.parquet"
unpreprocessed_documents.to_parquet(sp_results_path, index=False)

In [None]:
# Build the training dataset from the unpreprocessed texts, the preprocessed
# (bag-of-words) documents and the embeddings dataset passed as custom embeddings.
# No contextualized model name is given (first argument is None).
qt = TopicModelDataPreparation(None, retained_indices, show_warning=True)

contextual_texts = unpreprocessed_documents['text'].values.tolist()
training_dataset = qt.fit(
    text_for_contextual=contextual_texts,
    text_for_bow=preprocessed_documents,
    custom_embeddings=embeddings,
)



In [None]:
# Detach the embeddings before pickling so they are not part of the dump;
# they are re-attached in the following cell.
training_dataset.X_contextual = None

# Write both artefacts next to the other intermediate CTM data.
for filename, artefact in (('training_dataset.pkl', training_dataset), ('qt.pkl', qt)):
    with open(f'{data_saving_path}/{filename}', 'wb') as f:
        pickle.dump(artefact, f)

In [None]:
# Re-attach the embeddings that were set to None before the dataset was pickled.
training_dataset.X_contextual = embeddings

# Data Loading

In [None]:
# Restore the pickled artefacts written by the "Data Saving" section.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this notebook.
with open(f'{data_saving_path}/qt.pkl', 'rb') as f:
    qt = pickle.load(f)

with open(f'{data_saving_path}/training_dataset.pkl', 'rb') as f:
    training_dataset = pickle.load(f)

# The dataset was pickled with X_contextual set to None; attach the embeddings again.
training_dataset.X_contextual = embeddings

In [None]:
# Reload the saved preprocessing results and pull out both columns as arrays.
sp_preprocess_results = pd.read_parquet(f"{data_saving_path}/sp_preprocess_results.parquet")
retained_indices, unpreprocessed_documents = (
    sp_preprocess_results[column].values for column in ('retained_indices', 'text')
)

# The frame itself is no longer needed once the arrays are extracted.
del sp_preprocess_results

# Hyperparameter Tuning

In [None]:
# Search space: number of topics (5..10), symmetric two-layer hidden sizes, dropout.
param_grid = dict(
    k_numbers=list(range(5, 11)),
    hidden_dimensions=[(width, width) for width in (200, 500, 700)],
    dropout=[0.2, 0.5, 0.8],
)

# Materialise every combination so the same ordered list can be reused later.
grid_search = list(ParameterGrid(param_grid))

## Training

In [None]:
# Train one ZeroShotTM per grid configuration, skipping any configuration
# whose model path already exists on disk.
for grid in tqdm(grid_search):
    # Shared suffix identifying this configuration in both output file names.
    run_tag = f"{grid['k_numbers']}_{grid['hidden_dimensions'][0]}_{grid['dropout']}"
    model_name = f"model_{run_tag}.pth"
    model_path = f"{model_saving_path}/{model_name}"

    if os.path.exists(model_path):
        continue

    ctm = ZeroShotTM(
        bow_size=len(qt.vocab),
        contextual_size=embeddings.shape[1],
        n_components=grid['k_numbers'],
        num_epochs=2,
        hidden_sizes=grid['hidden_dimensions'],
        dropout=grid['dropout'],
        shuffle=False,
    )
    ctm.fit(training_dataset, n_samples=None)

    # Topic word lists (50 requested), transposed so each topic is one column.
    columns = [f"topic_{i}" for i in range(1, grid['k_numbers'] + 1)]
    topic_list = pd.DataFrame(np.array(ctm.get_topic_lists(50)).T, columns=columns)
    topic_list.to_csv(f"{topics_saving_path}/topics_{run_tag}", index=False)

    # Clear the model's reference to the training data before saving it.
    ctm.train_data = None
    ctm.save(model_path)

100%|██████████| 27/27 [00:00<00:00, 5163.28it/s]


# Coherence and Topic Diversity

In [None]:
# Reload the preprocessed documents and whitespace-tokenise each one.
preprocessed_documents = pd.read_parquet(f"{data_saving_path}/preprocessed_documents.parquet")
texts = [document.split() for document in preprocessed_documents['docs'].values]

# Only the token lists are needed from here on.
del preprocessed_documents

In [None]:
# Score the saved topics of each evaluated grid configuration with NPMI
# coherence and topic diversity.
# The topic file name must match what the training loop writes:
# topics_<k_numbers>_<hidden_dimensions[0]>_<dropout>, with no extension.
# param_grid has no 'beta' entry, so it must not appear in the name.
# NOTE(review): the [7:] slice skips the first seven configurations, so the
# result lists are offset by 7 relative to grid_search -- confirm this is intended.
coherence, topic_diversity = [], []
for grid in tqdm(grid_search[7:]):
    topics_file = (
        f"{topics_saving_path}/topics_{grid['k_numbers']}"
        f"_{grid['hidden_dimensions'][0]}_{grid['dropout']}"
    )
    # On disk each topic is one column; transpose to one row of words per topic.
    topic_list = pd.read_csv(topics_file).values.T

    npmi = CoherenceNPMI(texts=texts, topics=topic_list)
    coherence.append(npmi.score())

    td = TopicDiversity(topic_list)
    topic_diversity.append(td.score(topk=50))

100%|██████████| 20/20 [1:42:23<00:00, 307.18s/it]


In [None]:
# Persist both score lists together, in the order [coherence, topic_diversity].
scores = [coherence, topic_diversity]
with open(f"{topics_saving_path}/coherence_topic_diversity.pkl", 'wb') as f:
    pickle.dump(scores, f)