In [None]:
# Fetch the NLTK stopwords corpus into the local nltk_data directory.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import os
import sys
import h5py
import torch
import pickle
import numpy as np
import pandas as pd
import pyLDAvis as vis
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
from contextualized_topic_models.models.ctm import ZeroShotTM

  from collections import Iterable


# Loading Data

In [None]:
# Filesystem locations used by the rest of the notebook.

# Inputs
embeddings_path = "/data/processed/embeddings"    # HDF5 embedding store
data_saving_path = "/data/processed/CTM"          # pickles, preprocessing results, top tweets
model_saving_path = "/models/ctm_models"          # saved model files

# Outputs
probs_saving_path = "/data/processed/CTM/probs"           # per-document topic probabilities
plot_saving_path = "/data/processed/CTM/ldavis_figures"   # pyLDAvis HTML pages

In [None]:
# Raw tweet table. NOTE(review): not referenced again in the cells visible
# here — confirm it is still needed before paying the load cost.
world_anti_tweets = pd.read_parquet("/data/raw/world_anti_tweets_and_ids.parquet")

In [None]:
# Size the HDF5 chunk cache to hold a 1024 x 1024 float32 block plus 10%
# headroom, never going below 1 MiB.
#
# The size is computed arithmetically instead of allocating a throwaway random
# array, and it is kept as an int: the value is passed to h5py as
# `rdcc_nbytes`, which is a byte count, so a float is not a valid input.
_embedding_block_nbytes = 1024 * 1024 * np.dtype(np.float32).itemsize
embedding_cache_size = int(_embedding_block_nbytes * 1.1)
cache_size = max(embedding_cache_size, 1024 * 1024)

In [None]:
# Open the embedding store read-only, passing the chunk-cache size computed
# earlier as rdcc_nbytes.
h5file_path = f"{embeddings_path}/world_anti_embeddings.hdf5"
embedding_h5file = h5py.File(h5file_path, mode="r", rdcc_nbytes=cache_size)

# `embeddings` is read from the open file and used by later cells, so the file
# is not closed here. Call embedding_h5file.close() once it is no longer needed.
embeddings = embedding_h5file["embeddings"]

In [None]:
# Sanity check on the loaded dataset's dimensions; the second entry is what the
# model constructor later receives as contextual_size.
embeddings.shape

(9760275, 1024)

# Hyperparameter-Tuning

In [None]:
# Search space: each entry of `grid_search` is one combination of topic count,
# hidden layer sizes and dropout rate.
param_grid = {
    "k_numbers": [5, 6, 7, 8, 9, 10],
    "hidden_dimensions": [(200, 200), (500, 500), (700, 700)],
    "dropout": [0.2, 0.5, 0.8],
}

# Materialise the grid as a list of parameter dicts.
grid_search = [*ParameterGrid(param_grid)]

## Data Loading

In [None]:
# NOTE: pickle.load can execute arbitrary code from the file being read; only
# load pickles that were produced by this project.
with open(f'{data_saving_path}/training_dataset.pkl', 'rb') as dataset_file:
    training_dataset = pickle.load(dataset_file)

# Point the dataset's contextual input at the HDF5-backed embeddings.
training_dataset.X_contextual = embeddings

with open(f'{data_saving_path}/qt.pkl', 'rb') as qt_file:
    qt = pickle.load(qt_file)

In [None]:
# Read the preprocessing results, keep only the two columns needed as arrays,
# then drop the DataFrame name.
sp_preprocess_results = pd.read_parquet(
    f"{data_saving_path}/sp_preprocess_results.parquet"
)

unpreprocessed_documents = sp_preprocess_results["text"].values
retained_indices = sp_preprocess_results["retained_indices"].values

del sp_preprocess_results

In [None]:
# For every configuration in the grid: load the saved model, infer the
# document-topic distributions, save them, dump the 100 highest-scoring tweets
# per topic, and write a pyLDAvis HTML page.

# Files are written directly into these two directories below, so make sure
# they exist before the loop starts.
os.makedirs(probs_saving_path, exist_ok=True)
os.makedirs(plot_saving_path, exist_ok=True)

for grid in tqdm(grid_search):

    # One tag identifies a configuration in every artefact name.
    run_tag = f"{grid['k_numbers']}_{grid['hidden_dimensions'][0]}_{grid['dropout']}"
    model_name = f"model_{run_tag}.pth"
    model_path = model_saving_path + "/" + model_name
    probs_path = f"{probs_saving_path}/probs_{run_tag}.parquet"

    # Skip configurations that were already exported. The check has to look at
    # the same file that is written further down (.parquet); checking for a
    # .csv meant the guard never matched and every run redid all the work.
    # NOTE: the probabilities file is written before the top tweets and the
    # plot, so a run interrupted after that point is also skipped here.
    if os.path.exists(probs_path):
        continue

    if not os.path.exists(model_path):
        print(f"{model_name} doesn't exists!")
        continue

    # Rebuild the model with this configuration's architecture, then restore
    # the saved state from disk.
    ctm = ZeroShotTM(bow_size=len(qt.vocab), contextual_size=embeddings.shape[1],
                     n_components=grid['k_numbers'], num_epochs=2,
                     hidden_sizes=grid['hidden_dimensions'], dropout=grid['dropout'], shuffle=False)
    ctm.load(model_path)
    ctm.train_data = training_dataset

    # Run inference on the CPU.
    ctm.USE_CUDA = False
    ctm.device = torch.device('cpu')
    ctm.batch_size = 1024

    # One row per document, one column per topic.
    topics_predictions = ctm.get_thetas(training_dataset)
    columns = [f"topic_{i+1}" for i in range(grid['k_numbers'])]
    topics_predictions = pd.DataFrame(topics_predictions, columns=columns)
    topics_predictions.to_parquet(probs_path, index=False)

    # Row positions of the 100 highest-probability documents for each topic.
    tops = [
        np.argsort(topics_predictions[topic].values)[::-1][:100]
        for topic in topics_predictions.columns
    ]

    top_tweets_saving_path = f"{data_saving_path}/top_tweets/{run_tag}"
    os.makedirs(top_tweets_saving_path, exist_ok=True)

    # assumes rows of `unpreprocessed_documents` line up with rows of the
    # training dataset — TODO confirm
    for idx, row_ids in enumerate(tops):
        top_100_tweets = pd.DataFrame({'text': unpreprocessed_documents[row_ids]})
        top_100_tweets.to_csv(f"{top_tweets_saving_path}/topic_{idx+1}.csv", index=False)

    lda_vis_data = ctm.get_ldavis_data_format(qt.vocab, training_dataset, 20, topics_predictions.values)
    ctm_pd = vis.prepare(sort_topics=False, **lda_vis_data)
    vis.save_html(ctm_pd, f"{plot_saving_path}/lda_vis_figure_{run_tag}.html")