Prototype finished.

# Topic Prototyping

Prototype code for topic modeling of posts. The objective is to find the best topic model for this data by visually inspecting the most promising models.

In [None]:
import os, re, string, pickle, random

import numpy as np
import pandas as pd

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from joblib import dump, load

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Seed passed as `random_state` to every LDA model defined below.
random_state = 23

## Loading

In [None]:
# Directory holding the CSV exports (absolute, machine-specific path).
data_path = "/mnt/DATA/NRW2019 Dropbox/data 4good/CSVData"
# Root directory under which the fitted vectorizer and LDA models are dumped.
model_path = "../models"
# Root directory for generated figures; topic visualizations go into a subfolder.
figures_path = "../reports/figures/"

In [None]:
# Location of the posts CSV inside the data directory.
posts_fpath = os.path.join(data_path, "PolPosts.csv")

# Load the posts and report the (rows, columns) shape of the raw table.
raw_posts = pd.read_csv(posts_fpath)
print("Posts", raw_posts.shape)

## Preprocessing

Dataset cleansing

In [None]:
# Keep only the two columns used downstream, selected *by name* and in a
# fixed order. Dropping every other column and then overwriting `columns`
# positionally would silently swap the labels if the CSV stored `textID`
# before `text`. A missing column raises KeyError here.
raw_posts = raw_posts[["text", "textID"]]

# Independent copy, so the cleaning below never touches `raw_posts`.
corpus = raw_posts.copy()

# Remove empty texts.
# Assign the result back instead of calling `replace(..., inplace=True)` on
# `corpus.text`: that chained in-place call is deprecated and does not
# update `corpus` when pandas Copy-on-Write is active.
corpus["text"] = corpus["text"].replace("", np.nan)
corpus = corpus.dropna(subset=["text"])
# `str.isspace()` is False for "", which is why empty strings are mapped to
# NaN and dropped above before whitespace-only texts are filtered out here.
corpus = corpus[~corpus["text"].str.isspace()]

# Remove duplicated texts, keeping the first occurrence of each.
corpus = corpus.drop_duplicates(subset=["text"], keep="first")

# Displayed by the notebook: (rows, columns) after cleaning.
corpus.shape

## Exploration

NaN Stats

In [None]:
# Number of missing values per column of the cleaned corpus.
corpus.isna().sum()

Duplicated texts (after cleaning)

In [None]:
# Count rows whose text repeats an earlier row (True) versus first occurrences (False).
corpus.text.duplicated().value_counts()

Duplicated textIDs

In [None]:
# Count rows whose textID repeats an earlier row (True) versus first occurrences (False).
corpus.textID.duplicated().value_counts()

## Topic Modeling

TFIDF + LDA

In [None]:
# Independent copy of the cleaned corpus, used as input for vectorizing.
data = corpus.copy()

### Vectorizer

In [None]:
from spacy_tokenize import topic_tokenizer

# TF-IDF features over the post texts, tokenized with the project's
# `topic_tokenizer` and capped at 40,000 vocabulary entries.
tfidf = TfidfVectorizer(lowercase=True, tokenizer=topic_tokenizer, max_features=40000) # total tokens 43656
tfidf_v = tfidf.fit_transform(data.text)

# Persist the fitted vectorizer and the document-term matrix.
# Create the target directory first: `dump` does not create missing parent
# directories, and failing here would throw away the fit above.
vectorizer_path = os.path.join(model_path, "topic_vectorizer")
os.makedirs(vectorizer_path, exist_ok=True)
dump(tfidf, os.path.join(vectorizer_path, "tfidf.joblib"))
dump(tfidf_v, os.path.join(vectorizer_path, "tfidf_v.joblib"))

In [None]:
#len(tfidf.vocabulary_)

### Model

In [None]:
# Candidate LDA models, keyed "lda_<number of topics>". They differ only in
# the number of components; all share the same seed and verbosity.
models = {
    "lda_" + str(n_topics): LatentDirichletAllocation(
        n_components=n_topics,
        n_jobs=1,
        random_state=random_state,
        verbose=1,
    )
    for n_topics in (20, 25, 30, 40, 50, 80, 100)
}

Fit models

In [None]:
lda_path = os.path.join(model_path, "topic_lda")
# Make sure the output directory exists *before* the fits start; otherwise
# a missing directory makes the first `dump` fail only after that model has
# been fitted.
os.makedirs(lda_path, exist_ok=True)

# Fit every candidate on the TF-IDF matrix and persist it right away, so
# models fitted before an interruption are already saved.
for model, estimator in models.items():
    print("Fitting", model)
    estimator.fit(tfidf_v)
    print("Dump", model, "\n")
    dump(estimator, os.path.join(lda_path, model + ".joblib"))

## Evaluation

Scores

In [None]:
def print_scores(model, name):
    """Print the log-likelihood and perplexity scores of a topic model.

    Both scores are evaluated on the module-level matrix ``tfidf_v``.

    Args:
        model: Estimator exposing ``score(X)`` and ``perplexity(X)``.
        name: Label printed in the header line to identify the model.
    """
    print("Scores for", name)
    # Each score is computed immediately before it is printed, so any
    # output produced by the estimator itself stays next to its score.
    log_likelihood = model.score(tfidf_v)
    print("Log Likelihood:", log_likelihood)
    perplexity = model.perplexity(tfidf_v)
    print("Perplexity:", perplexity, "\n")
    
    
# Report the scores of every candidate, in the order they were declared.
for model, estimator in models.items():
    print_scores(estimator, model)

Visualization

In [None]:
viz_path = os.path.join(figures_path, "topics")
# Create the output directory up front; otherwise a missing directory makes
# `save_html` fail only after the expensive `prepare` step has finished.
os.makedirs(viz_path, exist_ok=True)

# Render one interactive pyLDAvis page per candidate model.
for model in models:
    if model == "lda_100": # skip due to performance constraints (out of memory)
        continue
    # Presumably resets `p` so the previous visualization can be freed before
    # the next one is built -- kept from the original.
    p = None
    p = pyLDAvis.sklearn.prepare(models[model], tfidf_v, tfidf, mds="mmds")#"tsne")
    pyLDAvis.save_html(p, os.path.join(viz_path, "topics_" + model + ".html"))
    print("topics_" + model + " persisted")

# Conclusion

Visual inspection and interpretation of the models leads to the three most promising candidates.  
These are:
* 25 topics - best dense representation.
* 30 topics - best separation.
* 40 topics - best interpretability.

Because the topics of the 40-component model are the most interpretable, and even the dichotomy between left- and right-wing topics can be separated (along the PC1 axis of the t-SNE plot), this model has been chosen.