Prototype WIP

# Topic Prototyping

Prototype code for Topic Modeling of Posts. Objective is to find the best topic model for this data by visually inspect most promising models.

In [1]:
import os, re, string, pickle, random

import numpy as np
import pandas as pd

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from joblib import dump, load

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
random_state = 23

## Loading

In [17]:
data_path = "/mnt/DATA/NRW2019 Dropbox/data 4good/CSVData"
model_path = "../models"
figures_path = "../reports/figures/"

In [4]:
posts_fpath = os.path.join(data_path, "PolPosts.csv")
#comments_fpath = os.path.join(data_path, "UserComments.csv")

raw_posts = pd.read_csv(posts_fpath)
#raw_comments = pd.read_csv(comments_fpath)
print("Posts", raw_posts.shape)
#print("Comments", raw_comments.shape)

Posts (27204, 23)


## Preprocessing

Dataset cleansing

In [5]:
# Remove unnecessary cols and rename them uniformly
posts_cols = raw_posts.columns.to_list()
posts_cols.remove("text")
posts_cols.remove("textID") # keep these cols
raw_posts.drop(posts_cols, axis=1, inplace=True)
raw_posts.columns = ["text", "textID"]

corpus = raw_posts.copy()

# Remove empty texts
corpus.text.replace("", np.nan, inplace=True)
corpus.dropna(subset=["text"], inplace=True)
corpus = corpus[~corpus.text.str.isspace()]

# Remove duplicated texts
corpus.drop_duplicates(subset=["text"], keep="first", inplace=True)

corpus.shape

(22534, 2)

## Exploration

NaN Stats

In [6]:
corpus.isna().sum()

text      0
textID    0
dtype: int64

Duplicated texts (after cleaning)

In [7]:
corpus.text.duplicated().value_counts()

False    22534
Name: text, dtype: int64

Duplicated textIDs

In [8]:
corpus.textID.duplicated().value_counts()

False    22534
Name: textID, dtype: int64

## Topic Modeling

TFIDF + LDA

In [9]:
data = corpus.copy()

### Vectorizer

In [10]:
from spacy_tokenize import topic_tokenizer

tfidf = TfidfVectorizer(lowercase=True, tokenizer=topic_tokenizer, max_features=40000) # total tokens 43656
tfidf_v = tfidf.fit_transform(data.text)

dump(tfidf, os.path.join(model_path, "topic_vectorizer", "tfidf.joblib"))
dump(tfidf_v, os.path.join(model_path, "topic_vectorizer", "tfidf_v.joblib"))

['../models/topic_vectorizer/tfidf_v.joblib']

In [11]:
#len(tfidf.vocabulary_)

### Model

In [12]:
models = {
    "lda_40": LatentDirichletAllocation(n_components=40, n_jobs=1, random_state=random_state, verbose=1),
    "lda_50": LatentDirichletAllocation(n_components=50, n_jobs=1, random_state=random_state, verbose=1),
    "lda_80": LatentDirichletAllocation(n_components=80, n_jobs=1, random_state=random_state, verbose=1),
    "lda_100": LatentDirichletAllocation(n_components=100, n_jobs=1, random_state=random_state, verbose=1)
}

Fit models

In [16]:
lda_path = os.path.join(model_path, "topic_lda")

for model in models:
    print("Fitting", model)
    models[model].fit(tfidf_v)
    print("Dump", model, "\n")
    dump(models[model], os.path.join(lda_path, model + ".joblib"))

Fitting lda_40
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Dump lda_40 

Fitting lda_50
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Dump lda_50 

Fitting lda_80
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Dump lda_80 

Fitting lda_100
iteration: 1 of max_iter

In [None]:
# lda_result = model.fit_transform(tfidf_v)

# data["topic"] = lda_result.argmax(axis=1)

# persist_fpath = os.path.join(data_path, "topics", "topics.csv")
# data.to_csv(persist_fpath, index=False)

## Evaluation

Scores

In [18]:
def print_scores(model, name):
    """ Prints Log-Likelihoodk and Perplexity scors of model. """
    print("Scores for", name)
    print("Log Likelihood:", model.score(tfidf_v))
    print("Perplexity:", model.perplexity(tfidf_v), "\n")
    
    
for model in models:
    print_scores(models[model], model)

Scores for lda_40
Log Likelihood: -921764.8500796544
Perplexity: 1500134.9756725011 

Scores for lda_50
Log Likelihood: -956078.2427735862
Perplexity: 2547073.356606655 

Scores for lda_80
Log Likelihood: -1044411.9398234173
Perplexity: 9951912.118990524 

Scores for lda_100
Log Likelihood: -1121066.3056666045
Perplexity: 32472536.777021836 



In [20]:
# viz_path = os.path.join(figures_path, "topics")

# for model in models:
#     p = None
#     p = pyLDAvis.sklearn.prepare(models[model], tfidf_v, tfidf, mds="tsne")
#     pyLDAvis.save_html(p, os.path.join(viz_path, "topics_" + model + ".html"))
#     print("topics_" + model + " persisted")

p = pyLDAvis.sklearn.prepare(models["lda_100"], tfidf_v, tfidf, mds="tsne")
pyLDAvis.save_html(p, os.path.join(viz_path, "topics_" + model + ".html"))



TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGKILL(-9)}

# Conclusion

