# Text NN classifier for Political Bias

In [1]:
%load_ext autoreload

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from sklearn.utils import shuffle
from torchsample.initializers import XavierUniform, Uniform
from torchsample.modules import ModuleTrainer
from torchsample.metrics import CategoricalAccuracy

%aimport torchsample.modules



In [2]:
use_cuda = False
batch_size = 64

In [3]:
df = pd.read_csv("data/source/newsclust.csv")
df = df.query("site != 'cbn.com'")
print(len(df))
df = df.sample(30000)

103262


Compute the bias for each one of the articles, based on the publication's known bias

In [4]:
from bias import Bias

num_classes = 7
df['bias'] = df.apply(lambda row: Bias.get_bias_for_domain(row['site']).value, axis=1)
df.head(2)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias
10977,10977,2016-04-06T03:00:00.000+03:00,washingtonpost.com,By Jorge Castillo April 6 at 6:59 PM Follow @j...,Wizards vs. Nets: Game 77 discussion thread,https://www.washingtonpost.com/news/wizards-in...,2
48175,48175,2017-01-11T23:27:00.000+02:00,allenbwest.com,Proving they’re not giving up identity politic...,What Ted Cruz just did spells DISASTER for DNC...,http://www.allenbwest.com/matt-palumbo/ted-cru...,5


Tokenize the text of the articles, create a vocabulary of words

In [5]:
import spacy
nlp = spacy.load('en')
nlp.pipeline = []

def tokenize_text(texts):
    docs = [doc for doc in nlp.pipe(texts, batch_size=500, n_threads=8)]
    return docs

def is_invalid_token(token):
    return token.is_punct or token.is_space or token.like_url or token.like_num or token.is_digit

def get_words_for_docs(docs):
    return [get_words_ids(doc) for doc in docs]

def get_words_ids(doc):
    return [token.orth for token in doc if not is_invalid_token(token)]

In [6]:
df['docs'] = tokenize_text(df['text'])

In [7]:
df['words'] = get_words_for_docs(df['docs'])
df.head(1)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words
10977,10977,2016-04-06T03:00:00.000+03:00,washingtonpost.com,By Jorge Castillo April 6 at 6:59 PM Follow @j...,Wizards vs. Nets: Game 77 discussion thread,https://www.washingtonpost.com/news/wizards-in...,2,"(By, Jorge, Castillo, April, 6, at, 6:59, PM, ...","[1659, 713423, 503075, 7604, 584, 776980, 7908..."


Keep the most common words as vocabulary, replace the words out of vocabulary by the least frequently used word

In [8]:
from collections import Counter

vocab_size = 5000

def flatten(l):
    flat_list = [item for sublist in l for item in sublist]
    return flat_list

word_freq = Counter(flatten(df['words']))
common_words = word_freq.most_common(vocab_size)
word_freq.most_common(5)

[(501, 767748), (504, 424390), (510, 385088), (512, 340089), (506, 339955)]

In [9]:
print(nlp.vocab.strings[501], nlp.vocab.strings[510], nlp.vocab.strings[512], nlp.vocab.strings[506])

the of and a


In [10]:
vocab = {word_id: i for (i, (word_id, freq)) in enumerate(common_words)}
print(len(vocab), vocab[501], vocab[504], vocab[510])
oov_word = vocab_size - 1
oov_word

5000 0 1 2


4999

In [11]:
def get_words_ids_if_common(words):
    return [vocab.get(word, oov_word) for word in words]

df['words_with_oov'] = df.apply(lambda row: get_words_ids_if_common(row['words']), axis=1)
df['words_with_oov'].head()

10977     [234, 4999, 4999, 320, 16, 4999, 678, 621, 499...
48175     [4999, 33, 146, 28, 1094, 54, 2016, 650, 885, ...
59228     [690, 2, 181, 285, 3021, 4999, 4999, 3, 2900, ...
103119    [4999, 8, 186, 4999, 4999, 4999, 4999, 4999, 4...
47327     [1193, 2626, 3247, 4999, 159, 704, 2, 0, 1676,...
Name: words_with_oov, dtype: object

Look at the distribution of words. The longest article has 15K words, the shortest 21.

In [12]:
lens = np.array(list(map(len, df['words'])))
(lens.max(), lens.min(), lens.mean())

(22014, 17, 540.0028666666667)

Split the data for training and validation

In [13]:
TEST_DOMAINS =  {
    'bloomberg.com',
     'breitbart.com',
     'c-span.org',
     'chicagotribune.com',
     'chron.com'
}
df_train =  df.loc[~df['site'].isin(TEST_DOMAINS)]
df_test =  df.loc[df['site'].isin(TEST_DOMAINS)]
print(len(df_train))
print(len(df_test))

27472
2528


Enforce all texts to have the same length, truncating or padding if necessary

In [14]:
seq_len = 1000

from keras.preprocessing import sequence

trn = sequence.pad_sequences(df_train['words_with_oov'], maxlen=seq_len, value=0)
test = sequence.pad_sequences(df_test['words_with_oov'], maxlen=seq_len, value=0)

trn_tensor = torch.from_numpy(trn).long()
test_tensor = torch.from_numpy(test).long()

trn

Using TensorFlow backend.


array([[   0,    0,    0, ..., 2626, 4999, 4999],
       [   0,    0,    0, ...,    9,  246, 4999],
       [   0,    0,    0, ..., 4999, 4999,  678],
       ..., 
       [   0,    0,    0, ...,   39, 1439,   59],
       [   0,    0,    0, ...,   58,   45,  891],
       [   0,    0,    0, ...,  151,    9,  228]], dtype=int32)

Prepare the tensor data for pytorch

In [15]:
labels_train_tensor = torch.from_numpy(np.array(df_train['bias']))
labels_test_tensor = torch.from_numpy(np.array(df_test['bias']))
labels_train_tensor[:3]


 2
 5
 4
[torch.LongTensor of size 3]

## Single Hidden Layer NN classifier

### Keras NN model

In [18]:
# Force use CPU
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import keras
from keras.models import Sequential, Model
from keras.layers import Embedding
from keras.layers.core import Flatten, Dense, Dropout
from keras.optimizers import Adam

In [19]:
k_model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(7, activation='softmax')])

In [20]:
k_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
k_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 1000, 32)      160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 32000)         0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 100)           3200100     flatten_1[0][0]                  
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 100)           0           dense_1[0][0]                    
___________________________________________________________________________________________

In [21]:
from keras.utils.np_utils import to_categorical

labels_train = np.array(df_train['bias'])
labels_test = np.array(df_test['bias'])

print(labels_test[:3])
k_model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=3, batch_size=batch_size)

[2 5 2]
Train on 27472 samples, validate on 2528 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x25979eb00>

Manually verify some articles. The bias for these articles should match the predicted bias below.

In [22]:
df_sample = df_test.sample(5)
df_sample

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words,words_with_oov
96411,96411,2015-12-20T02:00:00.000+02:00,chron.com,Brem beats Gut to win women's World Cup GS Jer...,Brem beats Gut to win women's World Cup GS - H...,http://www.chron.com/sports/article/Brem-posts...,2,"(Brem, beats, Gut, to, win, women, 's, World, ...","[968437, 6479, 546597, 504, 1212, 1154, 525, 2...","[4999, 4999, 4999, 1, 345, 195, 24, 624, 1861,..."
82587,82587,2016-07-20T20:02:00.000+03:00,bloomberg.com,The U.S. Justice Department is seeking to seiz...,‘Wolf of Wall Street’ Sued as U.S. Seeks 1MDB-...,http://www.bloomberg.com/news/articles/2016-07...,2,"(The, U.S., Justice, Department, is, seeking, ...","[566, 1717, 8163, 5653, 513, 5546, 504, 259054...","[11, 111, 623, 240, 7, 1349, 1, 4999, 2645, 19..."
92642,92642,2015-08-28T03:00:00.000+03:00,bloomberg.com,Type of Data * Update Needed * \nAll data chan...,BANGKOK CHAIN HOSPITAL PCL (KH:Bangkok): Owner...,http://www.bloomberg.com/research/stocks/owner...,2,"(Type, of, Data, *, Update, Needed, *, \n, All...","[755732, 510, 663156, 156974, 593302, 1125, 16...","[4999, 2, 4135, 4999, 4999, 431, 425, 997, 159..."
35882,35882,2016-02-07T16:33:00.000+02:00,c-span.org,Official Bio Marco Rubio @marcorubio January 5...,Marco Rubio | C-SPAN.org,http://www.c-span.org/person/?marcorubio,3,"(Official, Bio, Marco, Rubio, @marcorubio, Jan...","[690509, 568064, 24406, 66284, 778338, 6626, 6...","[4999, 4999, 2466, 1065, 4999, 432, 432, 111, ..."
99876,99876,2015-07-30T19:06:00.000+03:00,bloomberg.com,"Procter & Gamble Co., the world’s largest cons...",P&G Falls After Forecast Signals Sales Struggl...,http://www.bloomberg.com/news/articles/2015-07...,2,"(Procter, &, Gamble, Co., ,, the, world, ’s, l...","[432202, 597294, 384742, 501, 790, 1402, 3734,...","[4999, 4999, 3344, 0, 162, 10, 808, 3014, 1774..."


In [23]:
sample = sequence.pad_sequences(df_sample['words_with_oov'], maxlen=seq_len, value=0)
np.argmax(k_model.predict(sample), axis=0)

array([2, 3, 0, 2, 4, 3, 1])

## Site vectors

Remove the last linear classifier

In [24]:
l4 = k_model.pop()
l3 = k_model.pop()

In [25]:
k_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
k_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 1000, 32)      160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 32000)         0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 100)           3200100     flatten_1[0][0]                  
Total params: 3,360,100
Trainable params: 3,360,100
Non-trainable params: 0
____________________________________________________________________________________________________


In [26]:
article_vectors = k_model.predict(trn)
article_vectors.shape

(27472, 100)

Generate an average vector of all the article vectors for each site

In [27]:
site_vectors = []
labels = []
for site in df_train['site'].unique():
    site_indexes = np.where(df_train["site"] == site)[0]
    if len(site_indexes) > 0:
        site_vector = np.mean(article_vectors[site_indexes, :], axis=0)
        site_vectors.append(site_vector)
        labels.append(site)
site_vectors = np.array(site_vectors)
site_vectors.shape

(57, 100)

In [28]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, random_state=0)
# np.set_printoptions(suppress=True)
site_vectors_2_dim = tsne_model.fit_transform(site_vectors)
X_proj = site_vectors_2_dim[:, 0]
Y_proj = site_vectors_2_dim[:, 1]

In [29]:
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

from plotly.graph_objs import Bar, Scatter, Figure, Layout, XAxis, YAxis

In [30]:
trace = Scatter(x=X_proj, y=Y_proj, mode='markers+text', text=labels, textposition='top',  marker=dict(size=10))
iplot({
    'data': [trace],
    'layout': Layout(
        #xaxis=XAxis(title='Left vs Right'), 
        #yaxis=YAxis(title='Biased vs Factual'),
        autosize=False,
        width=1000,
        height=700)},
    show_link=False
)