In [138]:
from collections import Counter
import re
import dill
import numpy as np
import pandas as pd
import sklearn

In [350]:
# with open('data/{}_articles.pkl'.format('left'), 'rb') as f:
#     data = dill.load(f)
# data[0]['keywords']

def get_key_words(name):
    with open('data/{}_articles.pkl'.format(name), 'rb') as f:
        data = dill.load(f)
    key_words = []
    for article in data:
        kws = article['keywords']
        if 'trump' in kws:
            current_kws = {}
            for kw in kws:
                current_kws[kw] = 1
            key_words.append(current_kws)
    return key_words

kw_freq_dicts = []
kw_freq_dicts += get_key_words('left')
kw_freq_dicts += get_key_words('leftcenter')
kw_freq_dicts += get_key_words('right')
kw_freq_dicts += get_key_words('right-center')

In [352]:
len(kw_freq_dicts)

286

In [347]:
def get_freq_dicts(name):
    with open('data/{}_articles.pkl'.format(name), 'rb') as f:
        data = dill.load(f)

    freq_dicts = []
    for article in data:
        if 'trump' in article['text']:
            word_list = re.split(r"[\s\.,\?]+",article['text'])
            freq_list = Counter(word_list).most_common()
            freq_dict = { k: v for k,v in freq_list}
            freq_dicts.append(freq_dict)
    return freq_dicts
    
freq_dicts = []
freq_dicts += get_freq_dicts('left')
freq_dicts += get_freq_dicts('leftcenter')
freq_dicts += get_freq_dicts('right')
freq_dicts += get_freq_dicts('right-center')

In [353]:
def merge_freqs(dicts):
    all_words = {}
    for d in dicts:
        for word in d:
            if word in all_words:
                all_words[word] += 1
            else:
                all_words[word] = 1
    
    return pd.DataFrame.from_dict({'word': list(all_words.keys()), 'freq': list(all_words.values())})

In [354]:
def get_vocab(freq_dicts):
    merged_df = merge_freqs(freq_dicts)
    df = merged_df.sort_values(by=['freq'], ascending=False)
    df = df.reset_index(drop=True)
    df = df[~df['word'].str.contains('\"')]
    df = df[~df['word'].str.contains('\'')]
    df = df[~df['word'].str.contains('!')]
    df = df[~df['word'].str.contains('\(')]
    df = df[~df['word'].str.contains('\)')]
    df = df[~df['word'].str.contains('-')]
    vocab = list(df.head(10000).word.values)
    return vocab

In [337]:
vocab = get_vocab(freq_dicts)

In [357]:
kw_vocab = get_vocab(kw_freq_dicts)

In [338]:
def convert_to_vector(freq_dicts, vocab):
    vocab_dict = { v: i for i, v in enumerate(vocab) }
    data = np.zeros((len(freq_dicts), len(vocab)))
    for i, d in enumerate(freq_dicts):
        for k, v in d.items():
            try:
                idx = vocab_dict[k]
                data[i][idx] = v
            except:
                pass
        data[i] = data[i] / np.sum(data[i])
    data = data[~np.isnan(data).any(axis=1)]
    return data

In [339]:
left_center_vec = convert_to_vector(get_freq_dicts('leftcenter'), vocab)
right_center_vec = convert_to_vector(get_freq_dicts('right-center'), vocab)
left_vec = convert_to_vector(get_freq_dicts('left'), vocab)
right_vec = convert_to_vector(get_freq_dicts('right'), vocab)

In [358]:
left_center_kw = convert_to_vector(get_key_words('leftcenter'), kw_vocab)
right_center_kw = convert_to_vector(get_key_words('right-center'), kw_vocab)
left_kw = convert_to_vector(get_key_words('left'), kw_vocab)
right_kw = convert_to_vector(get_key_words('right'), kw_vocab)

In [340]:
X = np.vstack((left_center_vec, right_center_vec))
y = np.hstack((np.zeros(len(left_center_vec)), np.ones(len(right_center_vec))))

# X = np.vstack((left_vec, right_vec))
# y = np.hstack((np.zeros(len(left_vec)), np.ones(len(right_vec))))

In [363]:
# X = np.vstack((left_center_kw, right_center_kw))
# y = np.hstack((np.zeros(len(left_center_kw)), np.ones(len(right_center_kw))))

X = np.vstack((left_kw, right_kw))
y = np.hstack((np.zeros(len(left_kw)), np.ones(len(right_kw))))

In [364]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [365]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [367]:
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred, target_names=['left', 'right']))

              precision    recall  f1-score   support

        left       0.00      0.00      0.00        65
       right       0.55      1.00      0.71        79

   micro avg       0.55      0.55      0.55       144
   macro avg       0.27      0.50      0.35       144
weighted avg       0.30      0.55      0.39       144



  'precision', 'predicted', average, warn_for)


In [368]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['left', 'right']))

              precision    recall  f1-score   support

        left       0.00      0.00      0.00        19
       right       0.47      1.00      0.64        17

   micro avg       0.47      0.47      0.47        36
   macro avg       0.24      0.50      0.32        36
weighted avg       0.22      0.47      0.30        36



  'precision', 'predicted', average, warn_for)


In [329]:
rank = np.argsort(clf.coef_)[0]
# for i in rank[:20]:
#     print(kw_vocab[i])

In [331]:
merge_freqs(get_key_words('left')).sort_values(by=['freq'], ascending=False).head(20)

Unnamed: 0,word,freq
7,trump,84
12,president,27
21,trumps,16
69,shutdown,15
71,workers,13
53,wall,12
313,economy,12
52,border,11
50,national,11
263,donald,11


In [332]:
merge_freqs(get_key_words('right')).sort_values(by=['freq'], ascending=False).head(20)

Unnamed: 0,word,freq
9,trump,96
24,president,50
19,trumps,22
271,millennials,19
76,white,12
18,american,10
3,state,10
78,house,10
171,jobs,9
79,tax,9


In [314]:
for i in rank[::-1][:10]:
    print(kw_vocab[i])

washington
shows
test
wwwwashingtontimescom
quiz
challenge
sharescan
songs
remember
television
