In [1]:
%load_ext autoreload
%autoreload 1

import sys
import os
from dotenv import load_dotenv
load_dotenv()
sys.path.append('../src')

from data.fetch_data import get_submission_docs_for_subreddit
from data.clean_data import process_text

%aimport data.fetch_data
%aimport data.clean_data

In [2]:
import pickle
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, TfidfVectorizer
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize, MWETokenizer # multi-word expression
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from gensim import corpora, models, similarities, matutils

import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

In [3]:
# Load the pickled input that is later passed to `process_candidate`, which
# uses it as a DataFrame with a `text` column.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('user_comments_with_sentiment.pickle', 'rb') as read_file:
    data_raw = pickle.load(read_file)

# Apply rDemocrats Topics

In [4]:
# Extra stop words on top of the stock English list. Not referenced by the
# other cells shown here, which load an already-fitted vectorizer from disk.
# Every line ends with a comma on purpose: adjacent string literals without a
# comma are silently concatenated into one bogus entry.
additional_stop_words = [
    'like', 'dont', 'im', 'say', 'did', 'said', 'thats', 'don', 'hes', 'does', 'thing', 'gt', 'sure', 'doesnt',
    'saying', 'youre', 'isnt', 'doing', 'got', 'didnt', 'yeah', 'just', 'yes',
    'right', 'think', 'going', 'want', 'know', 'good',
    'need', 'time', 'point', 'make', 'way', 'really',
    'id', 'ar', 's', 't', 've', 'm', 'shes',
    'c', 'd', 'v', 'actually', 'look', 'maybe', 'though', 'bad', 'came', 'mods', 'things', 'lot', 'let', 'lol', 'tell', 'pretty', 'literally',
    'theyre', 'people',
    '‘', '’', '“',
]
# Multi-word expressions handed to MWETokenizer (see `mwe_tokenizer`), so that
# each listed token sequence is treated as a single token.
# Overlapping entries are intentional, e.g. ('assault', 'weapons') and
# ('assault', 'weapons', 'ban').
multi_words = [
    ('health','insurance'),
    ('fox', 'news'),
    ('bernie', 'sanders'),
    ('hillary', 'clinton'),
    ('barack', 'obama'),
    ('donald', 'trump'),
    ('joe', 'biden'),
    ('joseph', 'biden'),
    ('mass', 'shooting'),
    ('mass', 'shootings'),
    ('assault', 'weapon'),
    ('assault', 'weapons'),
    ('assault', 'weapons', 'ban'),
    ('sergeant', 'at', 'arms'),
    ('stop', 'and', 'frisk'),
    ('medicare', 'for', 'all'),
    ('public', 'option'),
    ('beat', 'trump'),
    ('articles', 'of', 'impeachment'),
    ('new', 'york'),
    ('hold', 'in', 'contempt'),
    ('quid', 'pro', 'quo')
]
# Display names for the NMF topics. Every entry except the trailing 'none' is
# used, in this order, as a column name for the NMF output in
# `process_candidate`, and `get_topic_label` indexes into this list by position.
# NOTE(review): the order presumably matches the component order of the
# pickled NMF model; confirm before reordering.
# 'miltary_and_immigration' (sic) doubles as a slice bound in
# `get_topic_label`, so the spelling must not be changed here alone.
nmf_topic_labels = [
    '2016_election_frustration',
    'impeachment_proceedings',
    'healthcare',
    'primary_candidates',
    'gun_control',
    'election_general_terms',
    'right_wing_media',
    'impeachment',
    'yang_ubi',
    'primary_debates',
    'bloomberg',
    'econ_trump_vs_obama',
    'race_identity',
    'tax_return_ukraine_biden',
    'election_midwest_swing',
    'monetary_policy',
    'rep_dem_comparison',
    'miltary_and_immigration',
    'none'
]

In [5]:
# NOTE: despite its name, `stemmer` is a WordNet lemmatizer, not a stemmer.
# The name is kept because `StemmedCountVectorizer.build_analyzer` looks it up.
stemmer = WordNetLemmatizer()
# Tokenizer configured with the multi-word expressions listed in `multi_words`;
# used by `complete_tokenizer`.
mwe_tokenizer = MWETokenizer(multi_words)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer output is run through `stemmer.lemmatize`."""

    def build_analyzer(self):
        """Return a callable that applies the stock analyzer, then lemmatizes each token."""
        base_analyzer = super().build_analyzer()

        def lemmatized_tokens(doc):
            # Same token order as the stock analyzer, one lemma per token.
            lemmas = []
            for token in base_analyzer(doc):
                lemmas.append(stemmer.lemmatize(token))
            return lemmas

        return lemmatized_tokens

def complete_tokenizer(x):
    """Tokenize `x` with `word_tokenize`, then pass the tokens through `mwe_tokenizer`."""
    word_tokens = word_tokenize(x)
    merged_tokens = mwe_tokenizer.tokenize(word_tokens)
    return merged_tokens

In [6]:
# Load the pre-fitted vectorizer and NMF model that `process_candidate` applies.
# NOTE(review): unpickling presumably needs `StemmedCountVectorizer` and
# `complete_tokenizer` to be defined already in this kernel; confirm against
# the notebook that wrote these files.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('rDemocrats_CV.pickle', 'rb') as read_file:
    cv_dems = pickle.load(read_file)
    
with open('rDemocrats_nmf.pickle', 'rb') as read_file:
    nmf_dems = pickle.load(read_file)

In [22]:
def get_topic_label(row):
    """Return the name of the highest-weighted NMF topic for one row.

    Parameters
    ----------
    row : pandas.Series
        One row of the document-topic frame. Topic weights are read from the
        label slice '2016_election_frustration' through
        'miltary_and_immigration' (both ends inclusive), so those columns must
        be contiguous and ordered like `nmf_topic_labels`.

    Returns
    -------
    str
        The entry of `nmf_topic_labels` at the position of the largest weight;
        ties resolve to the first such position. No minimum-weight cutoff is
        applied, so every row receives a topic name.
    """
    topic_weights = row['2016_election_frustration':'miltary_and_immigration']
    # Convert to a plain float array first: np.argmax on a Series has meant
    # "label of the max" or "position of the max" depending on the pandas
    # version, and a position is what is needed to index the label list.
    strongest = int(np.argmax(np.asarray(topic_weights, dtype=float)))
    return nmf_topic_labels[strongest]
        

def label_primary_topic(df):
    """Return a copy of `df` with a `primary_topic_num` column added.

    The new column holds the result of `get_topic_label` applied to each row;
    the frame passed in is left unmodified.
    """
    primary_topics = df.apply(get_topic_label, axis=1)
    return df.assign(primary_topic_num=primary_topics)

In [23]:
def process_candidate(df, n_trailing_features_dropped=171):
    """Vectorize the comments in `df`, project them onto the NMF topics and label each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a `text` column whose entries are iterables of strings; each
        entry is joined with single spaces and then cleaned by `process_text`.
        The frame passed in is not modified.
    n_trailing_features_dropped : int, default 171
        Number of right-most vectorizer columns removed before the
        document-term matrix is given to `nmf_dems`. 0 keeps every column.
        NOTE(review): the default is the value that was hard-coded here;
        presumably it trims the vocabulary to what the NMF model was fitted
        on. Confirm against the notebook that trained the model.

    Returns
    -------
    pandas.DataFrame
        One column per topic in `nmf_topic_labels[:-1]`, followed by the
        columns of the cleaned input after `reset_index()` (so the original
        index becomes a regular column), as returned by `label_primary_topic`.
    """
    data_clean = df.copy()
    data_clean['text'] = data_clean['text'].map(' '.join).map(process_text)

    data_cv = cv_dems.transform(data_clean['text'])
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back for older installations.
    if hasattr(cv_dems, 'get_feature_names_out'):
        feature_names = cv_dems.get_feature_names_out()
    else:
        feature_names = cv_dems.get_feature_names()
    data_dtm_raw = pd.DataFrame(
        data_cv.toarray(), columns=feature_names, index=data_clean.index
    )

    # Slice with an explicit end position: `[:, :-0]` would select nothing,
    # and clamping at 0 matches `[:, :-n]` when n exceeds the column count.
    n_kept = max(data_dtm_raw.shape[1] - n_trailing_features_dropped, 0)
    data_dtm = data_dtm_raw.iloc[:, :n_kept]

    doc_topic_nmf = nmf_dems.transform(data_dtm)
    # The topic frame gets a fresh 0..n-1 index, so the input is reset to the
    # same positional index before joining.
    results = pd.DataFrame(doc_topic_nmf, columns=nmf_topic_labels[:-1])

    return label_primary_topic(results.join(data_clean.reset_index()))

## Label Comments

In [24]:
# Run the full pipeline (clean -> vectorize -> NMF -> primary-topic label) on the loaded comments.
data_labeled = process_candidate(data_raw)

In [25]:
# Rows per primary topic. Despite the `_num` suffix, the column holds
# topic-name strings from `nmf_topic_labels`.
data_labeled.primary_topic_num.value_counts()

primary_candidates           3270
primary_debates               861
yang_ubi                      520
race_identity                 388
election_midwest_swing        357
election_general_terms        319
2016_election_frustration     250
impeachment                   216
right_wing_media              188
rep_dem_comparison            188
gun_control                   157
tax_return_ukraine_biden      146
miltary_and_immigration       142
healthcare                    125
econ_trump_vs_obama            60
bloomberg                      50
monetary_policy                34
impeachment_proceedings        12
Name: primary_topic_num, dtype: int64

In [26]:
# Persist the labeled frame; opening with 'wb' overwrites any existing file of the same name.
with open('data_users_topic_labeled.pickle', 'wb') as write_file:
    pickle.dump(data_labeled, write_file)