# Title NN classifier for Political Bias using bag of words

In [74]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix

import seaborn as sn
import matplotlib.pyplot as plt

batch_size = 64

In [2]:
df = pd.read_csv("../data/source/newsclust.csv")
df = df.query("site != 'cbn.com'")
print(len(df))
df = df.sample(90000).reset_index(drop=True)

103262


Compute the bias for each one of the articles, based on the publication's known bias

In [3]:
from bias import Bias

num_classes = 7
df['bias'] = df.apply(lambda row: Bias.get_bias_for_domain(row['site']).value, axis=1)
df.head(2)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias
0,21014,2015-03-05T19:51:00.000+02:00,washingtonexaminer.com,A closer look at sick leave By Sean Higgins | ...,A closer look at sick leave,http://www.washingtonexaminer.com/a-closer-loo...,4
1,52696,2016-11-07T20:30:00.000+02:00,bloomberg.com,What to Do About Russia’s Hacking \nWeaker tha...,What to Do About Russia’s Hacking,https://www.bloomberg.com/view/articles/2016-1...,2


Split the data for training and validation

In [4]:
TEST_DOMAINS =  {
    'bloomberg.com',
     'breitbart.com',
     'c-span.org',
     'chicagotribune.com',
     'chron.com'
}
df_train =  df.loc[~df['site'].isin(TEST_DOMAINS)]
df_test =  df.loc[df['site'].isin(TEST_DOMAINS)]
print(len(df_train))
print(len(df_test))

82297
7703


Tokenize the text of the articles, create a vocabulary of words

In [5]:
import textacy
from textacy import vsm

def get_terms_list(corpus):
    return (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True) for doc in corpus)

def get_doc_term_matrix(corpus):
    vectorizer = vsm.Vectorizer(
                     weighting='tfidf', normalize=True, smooth_idf=True,
                     min_df=3, max_df=0.95, max_n_terms=100000)
    terms_list = get_terms_list(corpus)
    doc_term_matrix = vectorizer.fit_transform(terms_list)
    return vectorizer, terms_list, doc_term_matrix

In [6]:
corpus_trn = textacy.Corpus('en', texts=df_train['title'].astype(str).__iter__())
corpus_test = textacy.Corpus('en', texts=df_test['title'].astype(str).__iter__())

In [7]:
vectorizer, terms_list, doc_term_matrix_trn = get_doc_term_matrix(corpus_trn)
doc_term_matrix_test = vectorizer.transform(get_terms_list(corpus_test))
term_count = doc_term_matrix_trn.shape[1]
print(doc_term_matrix_trn.shape, doc_term_matrix_test.shape)

(82297, 19399) (7703, 19399)


<82297x19399 sparse matrix of type '<class 'numpy.float64'>'
	with 666649 stored elements in Compressed Sparse Row format>

In [13]:
trn = doc_term_matrix_trn.todense()
test = doc_term_matrix_test.todense()
trn

matrix([[ 0.52813461,  0.4330626 ,  0.55988023, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ..., 
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

## Single Hidden Layer NN classifier

### Keras NN model

In [69]:
# Force use CPU
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import keras
from keras.models import Sequential, Model
from keras.layers import Embedding
from keras.layers.core import Flatten, Dense, Dropout, Activation
from keras.optimizers import Adam

In [70]:
k_model = Sequential([
    Dense(25, input_shape=(term_count,), activation='relu'),
    # Dense(50, activation='relu'),
    Dropout(0.3),
    Dense(7, activation='softmax')])

In [71]:
k_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# k_model.summary()

In [72]:
from keras.utils.np_utils import to_categorical

labels_train = np.array(df_train['bias'])
labels_test = np.array(df_test['bias'])

k_model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=batch_size)

Train on 82297 samples, validate on 7703 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x4c7a39780>

In [39]:
# This actually reduces the validation accuracy
# Train on 82297 samples, validate on 7703 samples
# Epoch 1/1
# 82297/82297 [==============================] - 46s - loss: 1.0174 - acc: 0.6303 - val_loss: 1.4829 - val_acc: 0.4362
#
# k_model.optimizer.lr.assign(0.0001)
# k_model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=1, batch_size=batch_size)

Manually verify some articles. The bias for these articles should match the predicted bias below.

In [67]:
df_sample = df_test.sample(5)
df_sample[['site', 'title', 'bias']]

Unnamed: 0,site,title,bias
24342,c-span.org,Katrina 10 Years Later Representative Stephani...,3
1679,breitbart.com,Stephen A. Smith on Kurt Busch: Double Standar...,5
5705,c-span.org,C-SPAN TV Schedule | C-SPAN.org,3
7810,bloomberg.com,GM China Venture Said to Be Under Government A...,2
25719,chron.com,Wednesday's Sports Transactions - Houston Chro...,2


In [76]:
predictions = np.argmax(k_model.predict(test), axis=1)
print("precision = ", np.sum(predictions == labels_test) * 100.0 / len(labels_test))
corpus_sample = textacy.Corpus('en', texts=df_sample['title'].astype(str).__iter__())
sample = vectorizer.transform(get_terms_list(corpus_sample)).todense()
df_prediction, df_prediction['bias'] = df_sample[['site', 'title']], np.argmax(k_model.predict(sample), axis=1)
df_prediction

precision =  47.0466052187


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,site,title,bias
24342,c-span.org,Katrina 10 Years Later Representative Stephani...,5
1679,breitbart.com,Stephen A. Smith on Kurt Busch: Double Standar...,1
5705,c-span.org,C-SPAN TV Schedule | C-SPAN.org,3
7810,bloomberg.com,GM China Venture Said to Be Under Government A...,4
25719,chron.com,Wednesday's Sports Transactions - Houston Chro...,2


In [80]:
cm = confusion_matrix(labels_test, predictions)
cm

array([[   0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0],
       [   2,  503, 2848,  705,  565,  429,   29],
       [   0,  470,   74,  206,   29,  247,    3],
       [   0,    0,    0,    0,    0,    0,    0],
       [   0,  573,  264,   42,   64,  570,   80],
       [   0,    0,    0,    0,    0,    0,    0]])

In [81]:
sn.heatmap(cm, annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x11dbbc0b8>