In [1]:
%load_ext autoreload
%autoreload 1

import sys
import os
from dotenv import load_dotenv
load_dotenv()
sys.path.append('../../src')

from data.fetch_data import get_submission_docs_for_subreddit
from data.clean_data import process_text

%aimport data.fetch_data
%aimport data.clean_data

In [2]:
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, TfidfVectorizer
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize, MWETokenizer # multi-word expression
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from gensim import corpora, models, similarities, matutils

import pickle

In [3]:
pd.set_option('display.max_columns', 50)

In [8]:
data_raw = get_submission_docs_for_subreddit('democrats')

Percent of comments by most prolific user: 0.02629016553067186


# Pre-Processing

## Cleaning

In [9]:
data_clean = data_raw.copy()
data_clean.text = data_clean.text.map(process_text)

In [10]:
with open('../../data/interim/rDemocrats_data_clean.pickle', 'wb') as write_file:
    pickle.dump(data_clean, write_file)

## Tokenize

In [11]:
additional_stop_words = [
    'like', 'dont', 'im', 'say', 'did', 'said', 'thats', 'don', 'hes', 'does', 'thing', 'gt', 'sure', 'doesnt',
    'saying', 'youre', 'isnt', 'doing', 'got', 'didnt', 'yeah', 'just', 'yes',
    'right', 'think', 'going', 'want', 'know', 'good',
    'need', 'time', 'point', 'make', 'way', 'really',
    'id', 'ar', 's', 't', 've', 'm', 'shes', 
    'c', 'd', 'v', 'actually', 'look', 'maybe', 'though', 'bad', 'came', 'mods', 'things', 'lot', 'let', 'lol', 'tell', 'pretty', 'literally'
    'theyre', 'people',
    '‘', '’', '“'
]
multi_words = [
    ('health','insurance'),
    ('fox', 'news'),
    ('bernie', 'sanders'),
    ('hillary', 'clinton'),
    ('barack', 'obama'),
    ('donald', 'trump'),
    ('joe', 'biden'),
    ('joseph', 'biden'),
    ('mass', 'shooting'),
    ('mass', 'shootings'),
    ('assault', 'weapon'),
    ('assault', 'weapons'),
    ('assault', 'weapons', 'ban'),
    ('sergeant', 'at', 'arms'),
    ('stop', 'and', 'frisk'),
    ('medicare', 'for', 'all'),
    ('public', 'option'),
    ('beat', 'trump'),
    ('articles', 'of', 'impeachment'),
    ('new', 'york'),
    ('hold', 'in', 'contempt'),
    ('quid', 'pro', 'quo')
]

In [12]:
# could do stemming, lemmatization, parts of speech, compound term extraction / named entity extraction, IF-IDF
# lots of emoji's' now with nltk tokenizer

# Topic Modeling

In [13]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", topic_names[ix], "'")
        print(", ".join([feature_names[i]
                         for i in topic.argsort()[:-no_top_words - 1:-1]]))

## LSA

### Tokenizing

In [14]:
mwe_tokenizer = MWETokenizer(multi_words)

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

cv = CountVectorizer(
    stop_words=stop_words,
    tokenizer=lambda x: mwe_tokenizer.tokenize(word_tokenize(x)),
    max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_raw.index = data_clean.index

In [15]:
data_dtm_raw.iloc[:5, -171:-150]

Unnamed: 0,£,­,¯ツ¯,¿,¿philanthropist,élection,͜ʖ,͡ʘ,американец,говорить,как,–,—,—and,—donald,—gt,—gtevery,—the,—trump,—you,―
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# remove russian words, emojis, other weird stuff
data_dtm = data_dtm_raw.iloc[:, :-170]

### Dim Reduction

In [17]:
topic_names_lsa = ['Election', 'Impeachment', 'Unclear - tax policy',
                   'Unclear - maybe Health Care', 'Gun Violence/Assault Weapons ban']

lsa = TruncatedSVD(5)
doc_topic_lsa = lsa.fit_transform(data_dtm)
print('Explained Variance Ratio: ', lsa.explained_variance_ratio_)
display_topics(
    lsa,
    cv.get_feature_names()[:-171],
    30,
    topic_names=topic_names_lsa
)

Explained Variance Ratio:  [0.08225951 0.04998326 0.03530142 0.03349842 0.02441368]

Topic: ' Election '
vote, bernie, biden, president, party, republicans, election, republican, democrats, candidate, better, warren, years, senate, support, house, sanders, money, democratic, country, voting, power, shit, voters, person, political, win, hillary, white, state

Topic: ' Impeachment '
power, congress, contempt, house, detain, court, senate, person, arrest, supreme, hold, law, impeachment, sergeant_at_arms, authority, republicans, vote, case, inherent, president, majority, dc, jail, impeached, persons, session, constitution, democrats, pursuant, longer

Topic: ' Unclear - tax policy '
bernie, biden, warren, sanders, congress, contempt, detain, candidate, power, arrest, supporters, court, vote, candidates, house, supreme, win, polls, person, sergeant_at_arms, voters, hillary, hold, authority, primary, inherent, pete, campaign, session, castro

Topic: ' Unclear - maybe Health Care '
healthcar

In [18]:
Vt = pd.DataFrame(doc_topic_lsa.round(5),
                  #              index = example,
                  columns=topic_names_lsa)
Vt

Unnamed: 0,Election,Impeachment,Unclear - tax policy,Unclear - maybe Health Care,Gun Violence/Assault Weapons ban
0,5.22163,0.80219,-0.72862,-1.69740,0.79301
1,6.22623,-0.30035,-3.15770,0.99486,-0.46923
2,11.30386,-4.24771,4.54649,-0.44931,1.82468
3,59.92904,-1.48703,-6.81543,-23.82862,9.03518
4,12.58006,-6.09968,-1.32912,10.36532,0.29093
...,...,...,...,...,...
599,14.90650,4.25741,-1.51642,-4.42267,-1.13338
600,12.09701,4.63515,-2.50924,-2.45817,-1.09026
601,6.02908,2.01382,-1.60848,-2.55960,-0.86775
602,6.77289,-0.53095,-0.18075,-0.98894,0.24169


## NMF

### Tokenizing

In [19]:
mwe_tokenizer = MWETokenizer(multi_words)

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

cv = CountVectorizer(
    stop_words=stop_words,
    tokenizer=lambda x: mwe_tokenizer.tokenize(word_tokenize(x)),
    #     max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_raw.index = data_clean.index

In [20]:
data_dtm_raw.iloc[:5, -172:-150]

Unnamed: 0,zum,£,­,¯ツ¯,¿,¿philanthropist,élection,͜ʖ,͡ʘ,американец,говорить,как,–,—,—and,—donald,—gt,—gtevery,—the,—trump,—you,―
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# remove russian words, emojis, other weird stuff
data_dtm = data_dtm_raw.iloc[:, :-171]

### Dim Reduction

In [22]:
nmf_topic_labels = [
    'frustration at 2016 election',
    'impeachment hearings',
    'election (candidates)',
    'healthcare',
    'gun control',
    'election (high level general terms)',
    'yang',
    'impeachment',
    'right wing media',
    'debate',
    'bipartisanship????',
    'midwest elections?????',
    'identity',
    'Bloomberg',
    'biden/ukraine',
    'economy; trump vs. Obama credit',
    'election (states)',
    'monetary policy',
]

In [23]:
nmf_model = NMF(18, random_state=42)
doc_topic_nmf = nmf_model.fit_transform(data_dtm)

display_topics(
    nmf_model,
    cv.get_feature_names()[:-171],
    20,
    topic_names=nmf_topic_labels
)


Topic: ' frustration at 2016 election '
trump, election, win, voted, hillary, won, vote, fuck, shit, trumps, voters, president, campaign, best, man, candidate, money, clinton, world, stupid

Topic: ' impeachment hearings '
congress, power, contempt, detain, house, court, person, arrest, supreme, senate, hold, sergeant_at_arms, authority, vote, law, inherent, case, dc, majority, persons

Topic: ' election (candidates) '
warren, sanders, bernie, biden, candidate, years, candidates, progressive, great, vote, far, goes, president, campaign, best, better, midwest, support, policy, literally

Topic: ' healthcare '
cost, healthcare, billion, pay, taxes, percent, spending, costs, health, hospitals, care, insurance, tax, private, medicare, million, medical, services, paid, trillion

Topic: ' gun control '
gun, guns, ban, weapons, laws, used, rifles, democrats, away, amendment, rights, number, control, common, use, stop, beto, firearms, government, owners

Topic: ' election (high level general 

In [24]:
data_topic = pd.DataFrame(doc_topic_nmf.round(5),
                          #              index = example,
                          columns=nmf_topic_labels)

In [25]:
data_topic.mean().sort_values(ascending=False)

frustration at 2016 election           0.230862
impeachment                            0.222490
election (states)                      0.205085
right wing media                       0.198162
bipartisanship????                     0.178529
election (high level general terms)    0.166465
debate                                 0.135731
biden/ukraine                          0.116340
identity                               0.107509
midwest elections?????                 0.101219
Bloomberg                              0.101086
gun control                            0.096992
yang                                   0.085711
economy; trump vs. Obama credit        0.077292
healthcare                             0.076765
monetary policy                        0.066882
election (candidates)                  0.065139
impeachment hearings                   0.055271
dtype: float64

### With Lemmatization

In [43]:
stemmer = WordNetLemmatizer()

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.lemmatize(w) for w in analyzer(doc)])

In [44]:
mwe_tokenizer = MWETokenizer(multi_words)

def complete_tokenizer(x):
    return mwe_tokenizer.tokenize(word_tokenize(x))

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

cv = StemmedCountVectorizer(
    stop_words=stop_words,
    tokenizer=complete_tokenizer,
    #     max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_raw.index = data_clean.index

In [45]:
data_dtm_raw.iloc[:5, -172:-150]

Unnamed: 0,zum,£,­,¯ツ¯,¿,¿philanthropist,élection,͜ʖ,͡ʘ,американец,говорить,как,–,—,—and,—donald,—gt,—gtevery,—the,—trump,—you,―
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [46]:
# remove russian words, emojis, other weird stuff
data_dtm = data_dtm_raw.iloc[:, :-171]

In [47]:
nmf_topic_labels = [
    '2016_election_frustration',
    'impeachment_proceedings',
    'healthcare',
    'primary_candidates',
    'gun_control',
    'election_general_terms',
    'right_wing_media',
    'impeachment',
    'yang_ubi',
    'primary_debates',
    'bloomberg',
    'econ_trump_vs_obama',
    'race_identity',
    'tax_return_ukraine_biden',
    'election_midwest_swing',
    'monetary_policy',
    'rep_dem_comparison',
    'miltary_and_immigration'
]

In [48]:
nmf_model = NMF(18, random_state=42, alpha=0)
doc_topic_nmf = nmf_model.fit_transform(data_dtm)

display_topics(
    nmf_model,
    cv.get_feature_names(),
    20,
    topic_names=nmf_topic_labels
)


Topic: ' 2016_election_frustration '
trump, voted, win, fuck, supporter, election, hillary, vote, shit, lie, republican, war, believe, president, world, money, stupid, won, year, better

Topic: ' impeachment_proceedings '
power, congress, contempt, person, detain, court, house, arrest, supreme, senate, law, hold, sergeant_at_arms, authority, vote, jail, inherent, case, majority, dc

Topic: ' healthcare '
cost, tax, billion, healthcare, hospital, pay, rate, percent, spending, health, insurance, service, care, increase, medicare, million, medical, saving, private, paid

Topic: ' primary_candidates '
bernie, warren, sander, candidate, biden, supporter, vote, support, primary, progressive, poll, year, win, democratic, delegate, voter, hillary, campaign, literally, obama

Topic: ' gun_control '
gun, ban, law, weapon, rifle, firearm, democrat, death, used, mean, amendment, away, number, owner, right, control, common, use, stop, beto

Topic: ' election_general_terms '
vote, voting, voter, st

**Winning Topic Model**

In [49]:
import pickle

with open('../../data/interim/rDemocrats_nmf.pickle', 'wb') as write_file:
    pickle.dump(nmf_model, write_file)
    
with open('../../data/interim/rDemocrats_CV.pickle', 'wb') as write_file:
    pickle.dump(cv, write_file)
    
with open('../../data/interim/rDemocrats_doc_topic.pickle', 'wb') as write_file:
    pickle.dump(pd.DataFrame(doc_topic_nmf, columns=nmf_topic_labels), write_file)

### With TF-IDF

In [33]:
stemmer = WordNetLemmatizer()

class StemmedCountVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.lemmatize(w) for w in analyzer(doc)])

In [34]:
mwe_tokenizer = MWETokenizer(multi_words)

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

cv = StemmedCountVectorizer(
    stop_words=stop_words,
    tokenizer=lambda x: mwe_tokenizer.tokenize(word_tokenize(x)),
    #     max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_raw.index = data_clean.index

In [35]:
nmf_model = NMF(14, random_state=42, alpha=0)
doc_topic_nmf = nmf_model.fit_transform(data_dtm)

display_topics(
    nmf_model,
    cv.get_feature_names()[:-171],
    20,
#     topic_names=nmf_topic_labels
)


Topic  0
trump, republican, voted, hillary, election, win, supporter, president, fuck, vote, shit, lie, year, democrat, biden, won, money, stupid, matter, war

Topic  1
power, congress, contempt, person, detain, court, house, arrest, supreme, senate, law, hold, sergeant_at_arms, authority, vote, jail, inherent, case, majority, dc

Topic  2
cost, tax, billion, healthcare, hospital, pay, rate, percent, spending, health, insurance, service, care, increase, medicare, million, medical, private, saving, paid

Topic  3
bernie, warren, sander, candidate, biden, supporter, support, vote, primary, hillary, progressive, year, poll, win, democratic, voter, clinton, delegate, literally, obama

Topic  4
gun, ban, law, weapon, rifle, firearm, democrat, death, used, mean, amendment, away, number, owner, right, use, control, stop, common, beto

Topic  5
vote, voting, voter, election, state, party, candidate, black, voted, win, right, country, responsibility, democratic, ballot, government, reason, wor

## LDA

### Tokenizing

In [36]:
data_clean = data_raw.copy()
data_clean.text = data_clean.text.map(process_text)

In [37]:
stemmer = WordNetLemmatizer()

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.lemmatize(w) for w in analyzer(doc)])

In [38]:
mwe_tokenizer = MWETokenizer(multi_words)

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

cv = StemmedCountVectorizer(
    stop_words=stop_words,
    tokenizer=lambda x: mwe_tokenizer.tokenize(word_tokenize(x)),
    #     max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_raw.index = data_clean.index

In [39]:
data_dtm_raw.iloc[:5, -172:-122]

Unnamed: 0,zum,£,­,¯ツ¯,¿,¿philanthropist,élection,͜ʖ,͡ʘ,американец,говорить,как,–,—,—and,—donald,—gt,—gtevery,—the,—trump,—you,―,”,„,•,…,‪can,⁉️,€,≠,☺,✊,❄️,❌❌❌,❤,❤️,❤️❤️,⢀⣠⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠀⠀⠀⠀⣠⣤⣶⣶,⣿⣿⣿⣿⣿⡏⠉⠛⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⣿,⣿⣿⣿⣿⣿⣿⠀⠀⠀⠈⠛⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿⠛⠉⠁⠀⣿,⣿⣿⣿⣿⣿⣿⣧⡀⠀⠀⠀⠀⠙⠿⠿⠿⠻⠿⠿⠟⠿⠛⠉⠀⠀⠀⠀⠀⣸⣿,⣿⣿⣿⣿⣿⣿⣿⣷⣄⠀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣴⣿⣿,⣿⣿⣿⣿⣿⣿⣿⣿⠃⠀⠀⠈⠉⠀⠀⠤⠄⠀⠀⠀⠉⠁⠀⠀⠀⠀⢿⣿⣿⣿,⣿⣿⣿⣿⣿⣿⣿⣿⡀⠉⠀⠀⠀⠀⠀⢄⠀⢀⠀⠀⠀⠀⠉⠉⠁⠀⠀⣿⣿⣿,⣿⣿⣿⣿⣿⣿⣿⣿⡟⠀⠀⢰⣹⡆⠀⠀⠀⠀⠀⠀⣭⣷⠀⠀⠀⠸⣿⣿⣿⣿,⣿⣿⣿⣿⣿⣿⣿⣿⢾⣿⣷⠀⠀⠀⠀⡠⠤⢄⠀⠀⠀⠠⣿⣿⣷⠀⢸⣿⣿⣿,⣿⣿⣿⣿⣿⣿⣿⣿⣧⠀⠀⠀⠀⠀⠀⠀⠈⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢹⣿⣿,⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢸⣿⣿,⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠠⣴⣿⣿⣿⣿,⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠀⠀⠀⢰⣿⣿⣿⣿
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
# remove russian words, emojis, other weird stuff
data_dtm = data_dtm_raw.iloc[:, :-171]
data_cv = data_cv[:, :-171]

### Dim Reduction

In [41]:
doc_word = data_cv.transpose()
# haven't removed the russian and stuff
corpus = matutils.Sparse2Corpus(doc_word)
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [42]:
lda = models.LdaModel(corpus=corpus, num_topics=18, id2word=id2word, passes=100)

KeyboardInterrupt: 

In [None]:
lda.print_topics(num_words=20)