# Text CNN classifier for Political Bias

In [1]:
%load_ext autoreload

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from sklearn.utils import shuffle
from torchsample.initializers import XavierUniform, Uniform
from torchsample.modules import ModuleTrainer
from torchsample.metrics import CategoricalAccuracy

%aimport torchsample.modules



In [2]:
use_cuda = False
batch_size = 64

In [3]:
df = pd.read_csv("../data/source/newsclust.csv")
df = df.query("site != 'cbn.com'")
print(len(df))
df = df.sample(90000)

103262


Compute the bias for each one of the articles, based on the publication's known bias

In [4]:
from bias import Bias

num_classes = 7
df['bias'] = df.apply(lambda row: Bias.get_bias_for_domain(row['site']).value, axis=1)
df.head(2)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias
72159,72159,2016-09-13T01:44:00.000+03:00,dailykos.com,35 North Carolina Sen. Richard Burr \nThe late...,Republican strategists are freaking out about ...,http://www.dailykos.com/story/2016/9/12/156932...,1
12549,12549,2016-04-13T22:50:00.000+03:00,chicagotribune.com,Lance Anderson has always worked with books. H...,"Shout Out: Lance Anderson, River Grove librari...",http://www.chicagotribune.com/suburbs/elmwood-...,2


Tokenize the text of the articles, create a vocabulary of words

In [5]:
import spacy
nlp = spacy.load('en')
nlp.pipeline = []

def tokenize_text(texts):
    docs = [doc for doc in nlp.pipe(texts, batch_size=500, n_threads=8)]
    return docs

def is_invalid_token(token):
    return token.is_punct or token.is_space or token.like_url or token.like_num or token.is_digit

def get_words_for_docs(docs):
    return [get_words_ids(doc) for doc in docs]

def get_words_ids(doc):
    return [token.orth for token in doc if not is_invalid_token(token)]

In [6]:
df['docs'] = tokenize_text(df['title'].astype(str))

In [7]:
df['words'] = get_words_for_docs(df['docs'])
df.head(1)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words
72159,72159,2016-09-13T01:44:00.000+03:00,dailykos.com,35 North Carolina Sen. Richard Burr \nThe late...,Republican strategists are freaking out about ...,http://www.dailykos.com/story/2016/9/12/156932...,1,"(Republican, strategists, are, freaking, out, ...","[1316, 350748, 534, 5185, 582, 581, 501, 2711,..."


Keep the most common words as vocabulary, replace the words out of vocabulary by the least frequently used word

In [8]:
from collections import Counter

vocab_size = 5000

def flatten(l):
    flat_list = [item for sublist in l for item in sublist]
    return flat_list

word_freq = Counter(flatten(df['words']))
common_words = word_freq.most_common(vocab_size)
word_freq.most_common(5)

[(504, 18219), (576246, 14844), (522, 12734), (566, 11576), (501, 11539)]

In [9]:
print(nlp.vocab.strings[501], nlp.vocab.strings[510], nlp.vocab.strings[512], nlp.vocab.strings[506])

the of and a


In [10]:
vocab = {word_id: i for (i, (word_id, freq)) in enumerate(common_words)}
print(len(vocab), vocab[501], vocab[504], vocab[510])
oov_word = vocab_size - 1
oov_word

5000 4 0 5


4999

In [11]:
def get_words_ids_if_common(words):
    return [vocab.get(word, oov_word) for word in words]

df['words_with_oov'] = df.apply(lambda row: get_words_ids_if_common(row['words']), axis=1)
df['words_with_oov'].head()

72159    [140, 4999, 101, 4999, 115, 90, 4, 4355, 38, 1...
12549    [4999, 74, 4999, 2556, 2357, 3190, 4999, 4999,...
46881    [188, 225, 4999, 34, 2126, 4999, 141, 88, 200,...
62012    [4999, 4999, 4999, 4999, 23, 1106, 1587, 358, ...
33069    [4999, 9, 736, 4999, 4999, 4999, 3237, 1939, 4...
Name: words_with_oov, dtype: object

Look at the distribution of words. The longest article has 15K words, the shortest 21.

In [12]:
lens = np.array(list(map(len, df['words'])))
(lens.max(), lens.min(), lens.mean())

(45, 0, 11.091799999999999)

Split the data for training and validation

In [13]:
TEST_DOMAINS =  {
    'bloomberg.com',
     'breitbart.com',
     'c-span.org',
     'chicagotribune.com',
     'chron.com'
}
df_train =  df.loc[~df['site'].isin(TEST_DOMAINS)]
df_test =  df.loc[df['site'].isin(TEST_DOMAINS)]
print(len(df_train))
print(len(df_test))

82312
7688


Enforce all texts to have the same length, truncating or padding if necessary

In [14]:
seq_len = 50

from keras.preprocessing import sequence

trn = sequence.pad_sequences(df_train['words_with_oov'], maxlen=seq_len, value=0)
test = sequence.pad_sequences(df_test['words_with_oov'], maxlen=seq_len, value=0)

trn_tensor = torch.from_numpy(trn).long()
test_tensor = torch.from_numpy(test).long()

trn

Using TensorFlow backend.


array([[   0,    0,    0, ...,   22,   72, 1626],
       [   0,    0,    0, ..., 4711,  116,  606],
       [   0,    0,    0, ..., 1587,  358, 4999],
       ..., 
       [   0,    0,    0, ..., 4999, 4999,    8],
       [   0,    0,    0, ...,   63, 2021, 4999],
       [   0,    0,    0, ...,   61,  668, 1411]], dtype=int32)

Prepare the tensor data for pytorch

In [15]:
labels_train_tensor = torch.from_numpy(np.array(df_train['bias']))
labels_test_tensor = torch.from_numpy(np.array(df_test['bias']))
labels_train_tensor[:3]


 1
 1
 3
[torch.LongTensor of size 3]

## Single Hidden Layer CNN classifier

Manually verify some articles. The bias for these articles should match the predicted bias below.

In [16]:
df_sample = df_test.sample(5)
df_sample

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words,words_with_oov
31167,31167,2015-05-05T03:00:00.000+03:00,chicagotribune.com,Why you should keep things weird at work Capti...,Why you should keep things weird at work - Chi...,http://www.chicagotribune.com/bluesky/hub/ct-i...,2,"(Why, you, should, keep, things, weird, at, wo...","[901, 520, 650, 936, 753, 2426, 584, 747, 3639...","[66, 154, 347, 908, 875, 4999, 17, 804, 128, 217]"
23044,23044,2015-02-28T05:35:00.000+02:00,bloomberg.com,"(Bloomberg) -- Boris Nemtsov, a Russian opposi...",Russian Opposition Member Boris Nemtsov Was Ki...,http://www.bloomberg.com/news/articles/2015-02...,2,"(Russian, Opposition, Member, Boris, Nemtsov, ...","[4087, 474971, 48179, 118470, 793677, 2750, 75...","[205, 3042, 1990, 4999, 4999, 89, 508, 2, 4397]"
82709,82709,2016-07-21T04:13:00.000+03:00,c-span.org,See all on Republican National Convention Augu...,Scott Walker 2012 Republican National Conventi...,http://www.c-span.org/video/?c3830448/scott-wa...,3,"(Scott, Walker, 2012, Republican, National, Co...","[8681, 456440, 1316, 3341, 589956, 2329, 68578...","[506, 1138, 140, 210, 521, 8, 2396, 2597, 8, 97]"
35891,35891,2016-02-11T01:19:00.000+02:00,bloomberg.com,Mark Halperin and John Heilemann discuss Donal...,Who’s Best Positioned to Stop Donald Trump?,http://www.bloomberg.com/politics/videos/2016-...,2,"(Who, ’s, Best, Positioned, to, Stop, Donald, ...","[1318, 1402, 3622, 654708, 504, 2948, 388386, ...","[70, 6, 419, 4999, 0, 294, 38, 1]"
10189,10189,2015-04-01T03:00:00.000+03:00,chicagotribune.com,0 \nClinical psychologist and relationship coa...,"Tamsen Fadal, &apos;Girlfriends&apos; Guide&ap...",http://www.chicagotribune.com/lifestyles/sc-fa...,2,"(Tamsen, Fadal, ,, &, apos;Girlfriends&apos, ;...","[785983, 785985, 785987, 785989, 6708, 231964,...","[4999, 4999, 4999, 4999, 4999, 4799, 2, 10, 15..."


## Keras CNN model

In [17]:
# Force use CPU
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import keras
from keras.models import Sequential, Model
from keras.layers import Embedding
from keras.layers.core import Flatten, Dense, Dropout
from keras.optimizers import Adam

In [18]:
k_model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(7, activation='softmax')])

In [19]:
k_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
k_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 50, 32)        160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 1600)          0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 100)           160100      flatten_1[0][0]                  
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 100)           0           dense_1[0][0]                    
___________________________________________________________________________________________

In [20]:
from keras.utils.np_utils import to_categorical

labels_train = np.array(df_train['bias'])
labels_test = np.array(df_test['bias'])

print(labels_test[:3])
k_model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=3, batch_size=batch_size)

[2 2 5]
Train on 82312 samples, validate on 7688 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x198776e10>

Manually verify some articles. The bias for these articles should match the predicted bias below.

In [21]:
df_sample = df_test.sample(5)
df_sample

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words,words_with_oov
67538,67538,2017-02-12T02:00:00.000+02:00,breitbart.com,Breitbart News Daily: Repeal and Replace by Br...,Breitbart News Daily: Repeal and Replace,http://www.breitbart.com/radio/2017/02/12/brei...,5,"(Breitbart, News, Daily, :, Repeal, and, Replace)","[777136, 2548, 4940, 41725, 512, 692490]","[178, 35, 32, 1136, 12, 2632]"
39510,39510,2017-05-08T10:57:00.000+03:00,bloomberg.com,Macau’s government is requiring facial recogni...,Macau Introduces ATM Facial Recognition to Det...,http://www.bloomberg.com/news/articles/2017-05...,2,"(Macau, Introduces, ATM, Facial, Recognition, ...","[761142, 104556, 6385, 532880, 12203, 504, 496...","[4999, 4999, 4999, 4999, 4999, 0, 4999, 770, 4..."
12448,12448,2016-04-08T18:10:00.000+03:00,chron.com,Oklahoma man arrested in New Mexico after meth...,Oklahoma man arrested in New Mexico after meth...,http://www.chron.com/news/crime/article/Oklaho...,2,"(Oklahoma, man, arrested, in, New, Mexico, aft...","[9472, 852, 4220, 522, 1571, 4268, 805, 7574, ...","[1383, 183, 587, 2, 18, 360, 55, 4999, 490, 2,..."
42371,42371,2015-06-28T03:00:00.000+03:00,breitbart.com,"Andrew Cuomo Officiates Gay Wedding, Demands P...","Andrew Cuomo Officiates Gay Wedding, Demands P...",http://www.breitbart.com/big-government/2015/0...,5,"(Andrew, Cuomo, Officiates, Gay, Wedding, ,, D...","[116732, 291864, 790082, 8676, 333553, 510455,...","[2122, 1557, 4999, 290, 3812, 1727, 4422, 7, 8..."
52904,52904,2016-11-16T02:25:00.000+02:00,breitbart.com,"by Pam Key 15 Nov 2016 0 15 Nov, 2016 15 Nov, ...",Blasts 'Troubling' Rhetoric of Right Media...,http://www.breitbart.com/video/2016/11/15/obam...,5,"(Blasts, ', Troubling, ', Rhetoric, of, Right,...","[511195, 82282, 406659, 510, 2042, 7779]","[1386, 4999, 4999, 5, 164, 95]"


In [22]:
sample = sequence.pad_sequences(df_sample['words_with_oov'], maxlen=seq_len, value=0)
np.argmax(k_model.predict(sample), axis=0)

array([3, 3, 2, 1, 4, 4, 0])

## Site vectors

Remove the last linear classifier

In [23]:
l4 = k_model.pop()
l3 = k_model.pop()

In [24]:
k_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
k_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 50, 32)        160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 1600)          0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 100)           160100      flatten_1[0][0]                  
Total params: 320,100
Trainable params: 320,100
Non-trainable params: 0
____________________________________________________________________________________________________


In [25]:
article_vectors = k_model.predict(trn)
article_vectors.shape

(82312, 100)

Generate an average vector of all the article vectors for each site

In [26]:
site_vectors = []
labels = []
for site in df_train['site'].unique():
    site_indexes = np.where(df_train["site"] == site)[0]
    if len(site_indexes) > 0:
        site_vector = np.mean(article_vectors[site_indexes, :], axis=0)
        site_vectors.append(site_vector)
        labels.append(site)
site_vectors = np.array(site_vectors)
site_vectors.shape

(57, 100)

In [27]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, random_state=0)
# np.set_printoptions(suppress=True)
site_vectors_2_dim = tsne_model.fit_transform(site_vectors)
X_proj = site_vectors_2_dim[:, 0]
Y_proj = site_vectors_2_dim[:, 1]

In [28]:
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

from plotly.graph_objs import Bar, Scatter, Figure, Layout, XAxis, YAxis

In [29]:
trace = Scatter(x=X_proj, y=Y_proj, mode='markers+text', text=labels, textposition='top',  marker=dict(size=10))
iplot({
    'data': [trace],
    'layout': Layout(
        #xaxis=XAxis(title='Left vs Right'), 
        #yaxis=YAxis(title='Biased vs Factual'),
        autosize=False,
        width=1000,
        height=700)},
    show_link=False
)

### PCA Analysis

In [33]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
site_vectors_pca = pca.fit(site_vectors).components_

In [34]:
trace = Scatter(x=site_vectors_pca[0], y=site_vectors_pca[1], mode='markers+text', text=site_vectors_pca[0], textposition='top',  marker=dict(size=10))
iplot({
    'data': [trace],
    'layout': Layout(
        #xaxis=XAxis(title='Left vs Right'), 
        #yaxis=YAxis(title='Biased vs Factual'),
        autosize=False,
        width=1000,
        height=700)},
    show_link=False
)