In [7]:
%load_ext autoreload
%autoreload 1

import sys
import os
from dotenv import load_dotenv
load_dotenv()
sys.path.append('../src')

from data.fetch_data import get_submission_docs_for_subreddit
from data.clean_data import process_text

%aimport data.fetch_data
%aimport data.clean_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, TfidfVectorizer
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize, MWETokenizer # multi-word expression
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from gensim import corpora, models, similarities, matutils

import pickle

In [3]:
pd.set_option('display.max_columns', 50)

In [15]:
data_raw = get_submission_docs_for_subreddit('democrats')

Percent of comments by most prolific user: 0.02629016553067186


# Pre-Processing

## Cleaning

In [19]:
data_clean = data_raw.copy()
data_clean.text = data_clean.text.map(process_text)

In [20]:
with open('rDemocrats_data_clean.pickle', 'wb') as write_file:
    pickle.dump(data_clean, write_file)

## Tokenize

In [11]:
additional_stop_words = [
    'like', 'dont', 'im', 'say', 'did', 'said', 'thats', 'don', 'hes', 'does', 'thing', 'gt', 'sure', 'doesnt',
    'saying', 'youre', 'isnt', 'doing', 'got', 'didnt', 'yeah', 'just', 'yes',
    'right', 'think', 'going', 'want', 'know', 'good',
    'need', 'time', 'point', 'make', 'way', 'really',
    'id', 'ar', 's', 't', 've', 'm', 'shes', 
    'c', 'd', 'v', 'actually', 'look', 'maybe', 'though', 'bad', 'came', 'mods', 'things', 'lot', 'let', 'lol', 'tell', 'pretty', 'literally'
    'theyre', 'people',
    '‘', '’', '“'
]
multi_words = [
    ('health','insurance'),
    ('fox', 'news'),
    ('bernie', 'sanders'),
    ('hillary', 'clinton'),
    ('barack', 'obama'),
    ('donald', 'trump'),
    ('joe', 'biden'),
    ('joseph', 'biden'),
    ('mass', 'shooting'),
    ('mass', 'shootings'),
    ('assault', 'weapon'),
    ('assault', 'weapons'),
    ('assault', 'weapons', 'ban'),
    ('sergeant', 'at', 'arms'),
    ('stop', 'and', 'frisk'),
    ('medicare', 'for', 'all'),
    ('public', 'option'),
    ('beat', 'trump'),
    ('articles', 'of', 'impeachment'),
    ('new', 'york'),
    ('hold', 'in', 'contempt'),
    ('quid', 'pro', 'quo')
]

In [None]:
# could do stemming, lemmatization, parts of speech, compound term extraction / named entity extraction, IF-IDF
# lots of emoji's' now with nltk tokenizer

# EDA

In [None]:
word_counts = pd.DataFrame(np.sum(data_dtm.transpose(), axis=1), columns=['word_count'], index=data_dtm.transpose().index)

In [None]:
word_counts.word_count.sort_values(ascending=False).iloc[0:20]

In [None]:
word_counts.sample(50)

In [None]:
# lots of omitted spaces
# fair number of spelling errors
# problem is, TextBlob can't find omitted spaces and corrects things like "pelosi" to "pelvis"
# omit for now

# Topic Modeling

In [18]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", topic_names[ix], "'")
        print(", ".join([feature_names[i]
                         for i in topic.argsort()[:-no_top_words - 1:-1]]))

## LSA

### Tokenizing

In [None]:
mwe_tokenizer = MWETokenizer(multi_words)

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

cv = CountVectorizer(
    stop_words=stop_words,
    tokenizer=lambda x: mwe_tokenizer.tokenize(word_tokenize(x)),
    max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_raw.index = data_clean.index

In [None]:
data_dtm_raw.iloc[:5, -171:-150]

In [None]:
# remove russian words, emojis, other weird stuff
data_dtm = data_dtm_raw.iloc[:, :-170]

### Dim Reduction

In [None]:
topic_names_lsa = ['Election', 'Impeachment', 'Unclear - tax policy',
                   'Unclear - maybe Health Care', 'Gun Violence/Assault Weapons ban']

lsa = TruncatedSVD(5)
doc_topic_lsa = lsa.fit_transform(data_dtm)
print('Explained Variance Ratio: ', lsa.explained_variance_ratio_)
display_topics(
    lsa,
    cv.get_feature_names()[:-171],
    30,
    topic_names=topic_names_lsa
)

In [None]:
Vt = pd.DataFrame(doc_topic_lsa.round(5),
                  #              index = example,
                  columns=topic_names_lsa)
Vt

## NMF

### Tokenizing

In [10]:
mwe_tokenizer = MWETokenizer(multi_words)

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

cv = CountVectorizer(
    stop_words=stop_words,
    tokenizer=lambda x: mwe_tokenizer.tokenize(word_tokenize(x)),
    #     max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_raw.index = data_clean.index

In [11]:
data_dtm_raw.iloc[:5, -172:-150]

Unnamed: 0,zum,£,­,¯ツ¯,¿,¿philanthropist,élection,͜ʖ,͡ʘ,американец,говорить,как,–,—,—and,—donald,—gt,—gtevery,—the,—trump,—you,―
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# remove russian words, emojis, other weird stuff
data_dtm = data_dtm_raw.iloc[:, :-171]

### Dim Reduction

In [None]:
nmf_topic_labels = [
    'frustration at 2016 election',
    'impeachment hearings',
    'election (candidates)',
    'healthcare',
    'gun control',
    'election (high level general terms)',
    'yang',
    'impeachment',
    'right wing media',
    'debate',
    'bipartisanship????',
    'midwest elections?????',
    'identity',
    'Bloomberg',
    'biden/ukraine',
    'economy; trump vs. Obama credit',
    'election (states)',
    'monetary policy',
]

In [None]:
nmf_model = NMF(18, random_state=42)
doc_topic_nmf = nmf_model.fit_transform(data_dtm)

display_topics(
    nmf_model,
    cv.get_feature_names()[:-171],
    20,
    topic_names=nmf_topic_labels
)

In [None]:
data_topic = pd.DataFrame(doc_topic_nmf.round(5),
                          #              index = example,
                          columns=nmf_topic_labels)

In [None]:
data_topic.mean().sort_values(ascending=False)

### With Lemmatization

In [9]:
stemmer = WordNetLemmatizer()

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.lemmatize(w) for w in analyzer(doc)])

In [12]:
mwe_tokenizer = MWETokenizer(multi_words)

def complete_tokenizer(x):
    return mwe_tokenizer.tokenize(word_tokenize(x))

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

cv = StemmedCountVectorizer(
    stop_words=stop_words,
    tokenizer=complete_tokenizer,
    #     max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_raw.index = data_clean.index

In [20]:
data_dtm_raw.iloc[:5, -172:-150]

Unnamed: 0,zum,£,­,¯ツ¯,¿,¿philanthropist,élection,͜ʖ,͡ʘ,американец,говорить,как,–,—,—and,—donald,—gt,—gtevery,—the,—trump,—you,―
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
# remove russian words, emojis, other weird stuff
data_dtm = data_dtm_raw.iloc[:, :-171]

In [15]:
nmf_topic_labels = [
    '2016_election_frustration',
    'impeachment_proceedings',
    'healthcare',
    'primary_candidates',
    'gun_control',
    'election_general_terms',
    'right_wing_media',
    'impeachment',
    'yang_ubi',
    'primary_debates',
    'bloomberg',
    'econ_trump_vs_obama',
    'race_identity',
    'tax_return_ukraine_biden',
    'election_midwest_swing',
    'monetary_policy',
    'rep_dem_comparison',
    'miltary_and_immigration'
]

In [23]:
nmf_model = NMF(18, random_state=42, alpha=0)
doc_topic_nmf = nmf_model.fit_transform(data_dtm)

display_topics(
    nmf_model,
    cv.get_feature_names(),
    20,
    topic_names=nmf_topic_labels
)


Topic: ' 2016_election_frustration '
trump, voted, win, fuck, supporter, election, hillary, vote, shit, lie, republican, war, believe, president, world, money, stupid, won, year, better

Topic: ' impeachment_proceedings '
power, congress, contempt, person, detain, court, house, arrest, supreme, senate, law, hold, sergeant_at_arms, authority, vote, jail, inherent, case, majority, dc

Topic: ' healthcare '
cost, tax, billion, healthcare, hospital, pay, rate, percent, spending, health, insurance, service, care, increase, medicare, million, medical, saving, private, paid

Topic: ' primary_candidates '
bernie, warren, sander, candidate, biden, supporter, vote, support, primary, progressive, poll, year, win, democratic, delegate, voter, hillary, campaign, literally, obama

Topic: ' gun_control '
gun, ban, law, weapon, rifle, firearm, democrat, death, used, mean, amendment, away, number, owner, right, control, common, use, stop, beto

Topic: ' election_general_terms '
vote, voting, voter, st

**Winning Topic Model**

In [25]:
import pickle

with open('rDemocrats_nmf.pickle', 'wb') as write_file:
    pickle.dump(nmf_model, write_file)
    
with open('rDemocrats_CV.pickle', 'wb') as write_file:
    pickle.dump(cv, write_file)
    
with open('rDemocrats_doc_topic.pickle', 'wb') as write_file:
    pickle.dump(pd.DataFrame(doc_topic_nmf, columns=nmf_topic_labels), write_file)

### With TF-IDF

In [None]:
stemmer = WordNetLemmatizer()

class StemmedCountVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.lemmatize(w) for w in analyzer(doc)])

In [None]:
mwe_tokenizer = MWETokenizer(multi_words)

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

cv = StemmedCountVectorizer(
    stop_words=stop_words,
    tokenizer=lambda x: mwe_tokenizer.tokenize(word_tokenize(x)),
    #     max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_raw.index = data_clean.index

In [None]:
nmf_model = NMF(14, random_state=42, alpha=0)
doc_topic_nmf = nmf_model.fit_transform(data_dtm)

display_topics(
    nmf_model,
    cv.get_feature_names()[:-171],
    20,
#     topic_names=nmf_topic_labels
)

## LDA

### Tokenizing

In [None]:
data_clean = data.copy()
data_clean.text = data.text.map(clean_text)

In [None]:
stemmer = WordNetLemmatizer()

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.lemmatize(w) for w in analyzer(doc)])

In [None]:
mwe_tokenizer = MWETokenizer(multi_words)

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

cv = StemmedCountVectorizer(
    stop_words=stop_words,
    tokenizer=lambda x: mwe_tokenizer.tokenize(word_tokenize(x)),
    #     max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_raw.index = data_clean.index

In [None]:
data_dtm_raw.iloc[:5, -172:-122]

In [None]:
# remove russian words, emojis, other weird stuff
data_dtm = data_dtm_raw.iloc[:, :-171]
data_cv = data_cv[:, :-171]

### Dim Reduction

In [None]:
doc_word = data_cv.transpose()
# haven't removed the russian and stuff
corpus = matutils.Sparse2Corpus(doc_word)
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [None]:
lda = models.LdaModel(corpus=corpus, num_topics=18, id2word=id2word, passes=100)

In [None]:
lda.print_topics(num_words=20)