Prototype WIP

# Topic Prototyping

Prototype code for Topic Modeling of Posts and Comments.

In [1]:
import os, re, string, pickle, random

import numpy as np
import pandas as pd

import spacy

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Single seed shared by every stochastic step so that re-runs are reproducible.
random_state = 23
random.seed(random_state)
# Also seed NumPy's legacy global RNG: NumPy-based libraries draw from it
# whenever no explicit random_state is handed to them.
np.random.seed(random_state)

## Loading

In [3]:
# Directory holding the CSV exports. Set the TOPIC_DATA_PATH environment
# variable to run the notebook on another machine; the original location is
# kept as the default so existing setups keep working unchanged.
data_path = os.environ.get(
    "TOPIC_DATA_PATH", "/mnt/DATA/NRW2019 Dropbox/data 4good/CSVData"
)
posts_fpath = os.path.join(data_path, "PolPosts.csv")
comments_fpath = os.path.join(data_path, "UserComments.csv")

# Load both sources completely; the column selection happens in the
# preprocessing step.
raw_posts = pd.read_csv(posts_fpath)
raw_comments = pd.read_csv(comments_fpath)
print("Posts", raw_posts.shape)
print("Comments", raw_comments.shape)

Posts (27204, 23)
Comments (1138467, 21)


## Preprocessing

Dataset cleansing

In [9]:
# Keep only the "text" column of each source. Rebinding the names (instead of
# dropping columns in place) makes the cell safe to re-run and avoids mutating
# a frame that an earlier cell already displayed.
raw_posts = raw_posts[["text"]]
raw_comments = raw_comments[["text"]]

# Merge Posts and Comments into one corpus with a fresh, continuous index
corpus = pd.concat([raw_posts, raw_comments], axis=0, ignore_index=True)

# Remove empty texts: empty strings are turned into NaN first so that a single
# dropna covers both cases. The result is assigned back explicitly because a
# chained `corpus.text.replace(..., inplace=True)` only modifies a temporary
# object when pandas Copy-on-Write is active.
corpus = corpus.assign(text=corpus["text"].replace("", np.nan))
corpus = corpus.dropna(subset=["text"])
# Remove texts that consist of whitespace only
corpus = corpus[~corpus["text"].str.isspace()]

# Remove duplicated texts, keeping the first occurrence
corpus = corpus.drop_duplicates(subset=["text"], keep="first")

corpus.shape

(835285, 1)

## Exploration

NaN Stats

In [13]:
# Number of missing values per column of the cleaned corpus
corpus.isnull().sum()

text    0
dtype: int64

Duplicated texts (after cleaning)

In [14]:
# How many texts repeat an earlier one (True) versus are first occurrences (False)
corpus["text"].duplicated().value_counts()

False    835285
Name: text, dtype: int64

## Topic Modeling

TF-IDF + LDA

In [15]:
# Work on an independent copy so that columns added during modeling
# do not end up on the cleaned corpus.
data = corpus.copy(deep=True)

### Vectorizer

In [18]:
from spacy_tokenize import topic_tokenizer

# TF-IDF document-term matrix over all texts; tokenization is delegated to the
# project's own tokenizer.
tfidf = TfidfVectorizer(
    tokenizer=topic_tokenizer,
    lowercase=True,
)
tfidf_v = tfidf.fit_transform(data["text"])

### Model

In [24]:
# Unfitted base estimator; the number of topics is chosen by the parameter
# tuning step.
lda = LatentDirichletAllocation(
    random_state=random_state,
    n_jobs=-1,
)

KeyboardInterrupt: 

Parameter Tuning

In [None]:
# Candidate numbers of topics to compare
lda_params = dict(n_components=[10, 70, 100])

# Cross-validated search over the candidates, using all available cores
gs = GridSearchCV(estimator=lda, param_grid=lda_params, n_jobs=-1)
gs.fit(tfidf_v)

Use the best model found by the grid search to assign each text its dominant topic

In [None]:
# Estimator with the best-scoring parameters found by the grid search
model = gs.best_estimator_

# Topic weights of every text under that model
lda_result = model.transform(tfidf_v)

# Label each text with the index of its highest-weighted topic
data["topic"] = np.argmax(lda_result, axis=1)

## Evaluation

In [20]:
# Log-likelihood: the higher the better. `best_score_` is an attribute of the
# grid search object, not of the estimator stored in `model`
# (`gs.best_estimator_`), so it has to be read from `gs`.
print("Log Likelihood: ", gs.best_score_)
# Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", model.perplexity(tfidf_v))

Log Likelihood:  -21287161.63626368
Perplexity:  84618.95892239454


In [22]:
# Number of texts assigned to each topic, largest topic first
data["topic"].value_counts()

5    152126
8    113748
6    113696
1     99850
2     90528
0     67439
7     56231
4     49923
9     47955
3     43789
Name: topic, dtype: int64

In [23]:
# Texts whose dominant topic is topic 0
data.loc[data["topic"] == 0, "text"]

9          Spannende Tage in Graz! Ich habe heute die @di...
10         Spannende Tage in Graz! Ich habe die @diegraze...
48         Orbán als Bewacher des christlichen Abendlande...
104                             👇🏼👇🏼 https://t.co/BwNTu3Fqnn
131        Das Soundfile existiert tatsächlich: https://t...
                                 ...                        
1165605                                   Chris Gruber \n✔👏🙋
1165611    Sabshon Bu wieso Paranoid? Es ist bekannt das ...
1165617    Kerstin Thyrian ganz lieben Dank für die Antwort!
1165662    @Anonymer_User @zeynemarslan Zeynebi Secin ve ...
1165663    @Anonymer_User @zeynemarslan Zeynep Arslana oy...
Name: text, Length: 67439, dtype: object

# Conclusion

