In [7]:
# Enable IPython's autoreload extension in mode 1: only modules registered
# with %aimport are reloaded before a cell runs.
%load_ext autoreload
%autoreload 1

import sys
import os
from dotenv import load_dotenv
# Read variables from a local .env file into the environment
# (presumably credentials used by data.fetch_data -- TODO confirm).
load_dotenv()
# Make the project's src directory importable so `data.fetch_data` resolves.
sys.path.append('../src')

from data.fetch_data import get_submission_docs_for_subreddit

# Register the fetch module with autoreload so edits to it are picked up
# without restarting the kernel.
%aimport data.fetch_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [216]:
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize, MWETokenizer # multi-word expression

In [247]:
# Show up to 50 columns when a wide frame (e.g. the document-term matrix) is displayed.
pd.options.display.max_columns = 50

In [217]:
# Subreddit whose submissions feed the rest of the notebook.
subreddit_name = 'democrats'
data = get_submission_docs_for_subreddit(subreddit_name)

Percent of comments by most prolific user: 0.02629016553067186


# Pre-Processing

## Cleaning

In [318]:
def clean_text(text):
    '''Make text lowercase, remove punctuation, remove words containing numbers, remove links.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text. Removed tokens leave their surrounding whitespace
        behind, so the result may contain runs of consecutive spaces.
    '''
    text = text.lower()
    # Only the ASCII characters in string.punctuation are stripped, and they are
    # deleted rather than replaced by a space. A URL therefore collapses into a
    # single token, which the "http"/"www" substitution below drops whole.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Raw strings keep the regex escapes (\w, \d) from being parsed as
    # (invalid) Python string escapes.
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'\w*http\w*|\w*www\w*', '', text)
    return text

In [319]:
# Build the cleaned frame as a copy of the raw one with only the `text`
# column replaced, leaving `data` itself untouched.
data_clean = data.assign(text=data['text'].map(clean_text))

## Tokenize

In [338]:
# Extra filler words to drop on top of scikit-learn's English stop-word list.
# Contractions appear without apostrophes because punctuation is stripped
# before tokenizing.
additional_stop_words = (
    'like dont im say did said thats don hes does thing gt sure doesnt '
    'saying youre isnt doing got didnt yeah just yes '
    'right think going want know good '
    'need time point make way really '
    'id ar s t ve m shes'
).split()

# Phrases that should survive tokenization as a single token; each phrase is
# turned into a tuple of its words, the form MWETokenizer is given below.
multi_words = [
    tuple(phrase.split())
    for phrase in (
        'health insurance',
        'fox news',
        'bernie sanders',
        'hillary clinton',
        'barack obama',
        'donald trump',
        'joe biden',
        'joseph biden',
        'mass shooting',
        'mass shootings',
        'assault weapon',
        'assault weapons',
        'assault weapons ban',
        'sergeant at arms',
        'stop and frisk',
        'medicare for all',
        'public option',
        'beat trump',
    )
]

In [367]:
# Joins the multi-word expressions listed in `multi_words` into single tokens
# (e.g. "sergeant_at_arms") after ordinary word tokenization.
mwe_tokenizer = MWETokenizer(multi_words)

stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)


def tokenize_with_mwe(text):
    '''Tokenize `text` with NLTK, then merge known multi-word expressions.'''
    return mwe_tokenizer.tokenize(word_tokenize(text))


cv = CountVectorizer(
    # scikit-learn >= 1.2 validates this parameter and rejects a frozenset,
    # so hand it a plain list (sorted only to make the order deterministic).
    stop_words=sorted(stop_words),
    tokenizer=tokenize_with_mwe,
    # Ignore terms that occur in more than 75% of the documents.
    max_df=0.75
)
data_cv = cv.fit_transform(data_clean.text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on releases that predate get_feature_names_out().
if hasattr(cv, 'get_feature_names_out'):
    vocab_terms = list(cv.get_feature_names_out())
else:
    vocab_terms = cv.get_feature_names()

# Dense document-term matrix: one row per document (same index as
# data_clean), one column per vocabulary term.
data_dtm_raw = pd.DataFrame(data_cv.toarray(), columns=vocab_terms, index=data_clean.index)

In [371]:
# Peek at the first five documents around the point where the sorted
# vocabulary switches from ordinary words to non-ASCII tokens.
data_dtm_raw.head(5).iloc[:, -171:-150]

Unnamed: 0,zum,£,­,¯ツ¯,¿,¿philanthropist,élection,͜ʖ,͡ʘ,американец,говорить,как,​theres,–,–jackson–kingday,–nebraskaact,—,—and,—donald,—gt,—the
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [372]:
# remove russian words, emojis, other weird stuff
# The vocabulary columns are sorted by code point, so every token that starts
# with a non-ASCII character sits in one run at the end of the frame. Count
# the ASCII-leading columns instead of hard-coding how many to drop, so the
# cut stays correct when the fetched data (and hence the vocabulary) changes.
n_ascii_leading = sum(
    all(ord(ch) < 128 for ch in str(col)[:1])
    for col in data_dtm_raw.columns
)
data_dtm = data_dtm_raw.iloc[:, :n_ascii_leading]

In [331]:
# could do stemming, lemmatization, parts of speech, compound term extraction / named entity extraction, TF-IDF
# lots of emojis now with the nltk tokenizer

# EDA

In [202]:
# Total occurrences of each term across all documents, as a one-column frame
# indexed by term.
word_counts = data_dtm.sum(axis=0).to_frame(name='word_count')

In [None]:
# The 20 most frequent terms in the corpus.
word_counts['word_count'].sort_values(ascending=False).head(20)

In [None]:
# Random sample of 50 terms to eyeball the quality of the vocabulary.
word_counts.sample(n=50)

In [205]:
# lots of omitted spaces
# fair number of spelling errors
    # problem is, TextBlob can't find omitted spaces and corrects things like "pelosi" to "pelvis"
    # omit for now


# Topic Modeling

In [325]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    '''Print the highest-weighted terms of every component of a fitted model.

    Parameters
    ----------
    model : object
        Fitted decomposition exposing `components_`, iterated one row per topic.
    feature_names : sequence of str
        Term for each column of `components_`, in column order.
    no_top_words : int
        Number of terms to print per topic.
    topic_names : sequence of str, optional
        Labels for the topics. Topics without a label (a missing or empty
        entry, or a list shorter than the number of components) are printed
        with their index instead.
    '''
    for ix, topic in enumerate(model.components_):
        # Guard the lookup so a short label list falls back to the index
        # rather than raising IndexError.
        label = topic_names[ix] if topic_names and ix < len(topic_names) else None
        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")
        # argsort is ascending; walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_indices]))

## LSA

In [376]:
# Fixed seed so repeated runs of the randomized SVD produce the same
# components; the topic labels below are written by hand per component.
lsa = TruncatedSVD(n_components=5, random_state=0)
doc_topic = lsa.fit_transform(data_dtm)
print('Explained Variance Ratio: ', lsa.explained_variance_ratio_)
display_topics(
    lsa,
    # Take the term names from the matrix that was actually fitted, so they
    # line up with the component columns (slicing the vectorizer vocabulary
    # separately was off by one against data_dtm).
    list(data_dtm.columns),
    30,
    topic_names=['Election', 'Impeachment', 'Unclear - tax policy', 'Unclear - maybe Health Care', 'Gun Violence/Assault Weapons ban']
)

Explained Variance Ratio:  [0.08639577 0.04972536 0.03549208 0.03375841 0.02473922]

Topic: ' Election '
vote, bernie, biden, president, party, republicans, election, democrats, republican, candidate, years, warren, better, actually, senate, support, sanders, things, house, money, country, lot, democratic, voting, power, bad, shit, voters, person, win

Topic: ' Impeachment '
power, congress, house, contempt, detain, senate, court, person, arrest, supreme, impeachment, hold, law, republicans, sergeant_at_arms, vote, authority, president, case, inherent, majority, impeached, democrats, dc, constitution, jail, trial, persons, session, pelosi

Topic: ' Unclear - policy mix '
gun, tax, pay, cost, taxes, billion, government, healthcare, care, money, costs, health, spending, guns, insurance, percent, private, public, republicans, hospitals, million, use, ban, services, year, impeachment, medical, paid, increase, income

Topic: ' Unclear - maybe Health Care '
congress, healthcare, billion, ber

In [None]:
# Document-by-component weights from the decomposition above: one row per
# document (same index as data_dtm) and one column per fitted component.
# The labels are derived from doc_topic itself so they always match its width.
Vt = pd.DataFrame(doc_topic.round(5),
             index = data_dtm.index,
             columns = ["component_{}".format(i + 1) for i in range(doc_topic.shape[1])])
Vt

## NMF

In [None]:
# Fit a non-negative matrix factorization (the count matrix is non-negative)
# under its own names so the LSA results above are not overwritten.
nmf = NMF(n_components=5, random_state=0)
doc_topic_nmf = nmf.fit_transform(data_dtm)
print('Reconstruction Error: ', nmf.reconstruction_err_)
display_topics(
    nmf,
    # Term names come from the fitted matrix so they match the component columns.
    list(data_dtm.columns),
    30,
    # No topic_names: the LSA labels do not carry over to NMF components, so
    # topics are printed by index until they have been inspected and named.
)