# Title NN classifier for Political Bias

In [1]:
import pandas as pd
import numpy as np

batch_size = 64

In [2]:
df = pd.read_csv("../data/source/newsclust.csv")
df = df.query("site != 'cbn.com'")
print(len(df))
df = df.sample(90000)

103262


Compute the bias for each one of the articles, based on the publication's known bias

In [3]:
from bias import Bias

num_classes = 7
df['bias'] = df.apply(lambda row: Bias.get_bias_for_domain(row['site']).value, axis=1)
df.head(2)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias
41137,41137,2017-05-10T07:50:00.000+03:00,politicususa.com,"In his statement on the firing of James Comey,...",Bernie Sanders Sets The Stage For A Massive De...,http://www.politicususa.com/2017/05/09/bernie-...,1
27585,27585,2017-03-09T02:00:00.000+02:00,sfchronicle.com,"Photo: Scott Strazzante, The Chronicle Golden ...","Stephen Curry’s great, but his contemporaries ...",http://www.sfchronicle.com/sports/jenkins/arti...,2


Tokenize the text of the articles, create a vocabulary of words

In [4]:
import spacy
nlp = spacy.load('en')
nlp.pipeline = []

def tokenize_text(texts):
    docs = [doc for doc in nlp.pipe(texts, batch_size=500, n_threads=8)]
    return docs

def is_invalid_token(token):
    return token.like_url or token.like_num or token.is_digit

def get_words_for_docs(docs):
    return [get_words_ids(doc) for doc in docs]

def get_words_ids(doc):
    return [token.orth for token in doc if not is_invalid_token(token)]

In [5]:
df['docs'] = tokenize_text(df['title'].astype(str))

In [6]:
df['words'] = get_words_for_docs(df['docs'])
df.head(1)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words
41137,41137,2017-05-10T07:50:00.000+03:00,politicususa.com,"In his statement on the firing of James Comey,...",Bernie Sanders Sets The Stage For A Massive De...,http://www.politicususa.com/2017/05/09/bernie-...,1,"(Bernie, Sanders, Sets, The, Stage, For, A, Ma...","[231914, 508088, 359600, 566, 339716, 1011, 68..."


Keep the most common words as vocabulary, replace the words out of vocabulary by the least frequently used word

In [7]:
from collections import Counter

vocab_size = 100000

def flatten(l):
    flat_list = [item for sublist in l for item in sublist]
    return flat_list

word_freq = Counter(flatten(df['words']))
common_words = word_freq.most_common(vocab_size)
word_freq.most_common(5)

[(535, 22785), (454, 21641), (450, 18417), (504, 18233), (576246, 14817)]

In [8]:
print(nlp.vocab.strings[501], nlp.vocab.strings[510], nlp.vocab.strings[512], nlp.vocab.strings[506])

the of and a


In [9]:
vocab = {word_id: i for (i, (word_id, freq)) in enumerate(common_words)}
print(len(vocab), vocab[501], vocab[504], vocab[510])
oov_word = vocab_size - 1
oov_word

66196 7 3 9


99999

In [10]:
def get_words_ids_if_common(words):
    return [vocab.get(word, oov_word) for word in words]

df['words_with_oov'] = df.apply(lambda row: get_words_ids_if_common(row['words']), axis=1)
df['words_with_oov'].head()

41137     [180, 161, 3308, 8, 1604, 29, 21, 852, 313, 35...
27585     [1063, 3852, 10, 1580, 2, 160, 138, 59758, 195...
57187     [135, 10, 7, 1000, 507, 64, 6328, 199, 215, 48...
10115     [166, 122, 4112, 3, 331, 11, 87, 827, 2538, 22...
100500    [40, 62583, 40, 63, 1998, 1025, 8034, 49, 1357...
Name: words_with_oov, dtype: object

Look at the distribution of words. The longest article has 15K words, the shortest 21.

In [11]:
lens = np.array(list(map(len, df['words'])))
(lens.max(), lens.min(), lens.mean())

(51, 0, 12.453033333333334)

Split the data for training and validation

In [12]:
TEST_DOMAINS =  {
    'bloomberg.com',
     'breitbart.com',
     'c-span.org',
     'chicagotribune.com',
     'chron.com'
}
df_train =  df.loc[~df['site'].isin(TEST_DOMAINS)]
df_test =  df.loc[df['site'].isin(TEST_DOMAINS)]
print(len(df_train))
print(len(df_test))

82337
7663


Enforce all texts to have the same length, truncating or padding if necessary

In [13]:
seq_len = 50

from keras.preprocessing import sequence

trn = sequence.pad_sequences(df_train['words_with_oov'], maxlen=seq_len, value=0)
test = sequence.pad_sequences(df_test['words_with_oov'], maxlen=seq_len, value=0)

trn

Using TensorFlow backend.


array([[    0,     0,     0, ...,    10,   176,   832],
       [    0,     0,     0, ...,   195, 15820,  1095],
       [    0,     0,     0, ...,    12,     8,   139],
       ..., 
       [    0,     0,     0, ...,  2993,   322,  1933],
       [    0,     0,     0, ...,  1037,  2628,   696],
       [    0,     0,     0, ...,   292,    15,  7685]], dtype=int32)

## Single Hidden Layer NN classifier

### Keras NN model

In [14]:
# Force use CPU
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import keras
from keras.models import Sequential, Model
from keras.layers import Embedding, Convolution1D, MaxPooling1D
from keras.layers.core import Flatten, Dense, Dropout
from keras.optimizers import Adam

In [15]:
k_model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(50, activation='relu'),
    Dropout(0.3),
    Dense(7, activation='softmax')])

In [16]:
k_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
#k_model.summary()

In [17]:
from keras.utils.np_utils import to_categorical

labels_train = np.array(df_train['bias'])
labels_test = np.array(df_test['bias'])

k_model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=batch_size)

Train on 82337 samples, validate on 7663 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x176e7aba8>

Manually verify some articles. The bias for these articles should match the predicted bias below.

In [18]:
df_sample = df_test.sample(5)
df_sample[['site', 'title', 'bias']]

Unnamed: 0,site,title,bias
52171,chron.com,Winning numbers drawn in 'Roadrunner Cash' gam...,2
19632,chron.com,Iowa softball coach convicted of 4 counts of s...,2
97701,chron.com,Newark schools reach agreement to halt federal...,2
52393,chicagotribune.com,City to hold Cubs World Series parade Friday -...,2
92919,chicagotribune.com,Northbrook man dies of injuries sustained in d...,2


In [21]:
sample = sequence.pad_sequences(df_sample['words_with_oov'], maxlen=seq_len, value=0)
print("precision = ", np.sum(np.argmax(k_model.predict(test), axis=1) == labels_test) * 100.0 / len(labels_test))
df_prediction, df_prediction['bias'] = df_sample[['site', 'title']], np.argmax(k_model.predict(sample), axis=1)
df_prediction

precision =  56.857627561


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,site,title,bias
52171,chron.com,Winning numbers drawn in 'Roadrunner Cash' gam...,2
19632,chron.com,Iowa softball coach convicted of 4 counts of s...,2
97701,chron.com,Newark schools reach agreement to halt federal...,2
52393,chicagotribune.com,City to hold Cubs World Series parade Friday -...,2
92919,chicagotribune.com,Northbrook man dies of injuries sustained in d...,2


In [34]:
conv1 = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len, dropout=0.2),
    Dropout(0.2),
    Convolution1D(32, 5, border_mode='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(50, activation='relu'),
    Dropout(0.3),
    Dense(7, activation='softmax')])

conv1.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# conv1.summary()

In [35]:
conv1.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=3, batch_size=batch_size)


Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Train on 82337 samples, validate on 7663 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1858065c0>

In [37]:
print("precision = ", np.sum(np.argmax(conv1.predict(test), axis=1) == labels_test) * 100.0 / len(labels_test))
df_prediction, df_prediction['bias'] = df_sample[['site', 'title']], np.argmax(conv1.predict(sample), axis=1)
df_prediction

precision =  53.4907999478




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Unnamed: 0,site,title,bias
52171,chron.com,Winning numbers drawn in 'Roadrunner Cash' gam...,2
19632,chron.com,Iowa softball coach convicted of 4 counts of s...,2
97701,chron.com,Newark schools reach agreement to halt federal...,2
52393,chicagotribune.com,City to hold Cubs World Series parade Friday -...,2
92919,chicagotribune.com,Northbrook man dies of injuries sustained in d...,2


## Site vectors

Remove the last linear classifier

In [25]:
l4 = k_model.pop()
l3 = k_model.pop()

In [26]:
k_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
k_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 50, 32)        3200000     embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 1600)          0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 50)            80050       flatten_1[0][0]                  
Total params: 3,280,050
Trainable params: 3,280,050
Non-trainable params: 0
____________________________________________________________________________________________________


In [27]:
article_vectors = k_model.predict(trn)
article_vectors.shape

(82337, 50)

Generate an average vector of all the article vectors for each site

In [28]:
site_vectors = []
labels = []
sample_size = 1000
for site in df_train['site'].unique():
    site_indexes = np.where(df_train["site"] == site)[0]
    if len(site_indexes) > 0:
        site_vector = np.mean(article_vectors[site_indexes[:sample_size], :], axis=0)
        site_vectors.append(site_vector)
        labels.append(site)
site_vectors = np.array(site_vectors)
site_vectors.shape

(57, 50)

In [29]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, random_state=0)
# np.set_printoptions(suppress=True)
site_vectors_2_dim = tsne_model.fit_transform(site_vectors)
X_proj = site_vectors_2_dim[:, 0]
Y_proj = site_vectors_2_dim[:, 1]

In [30]:
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

from plotly.graph_objs import Bar, Scatter, Figure, Layout, XAxis, YAxis

In [31]:
trace = Scatter(x=X_proj, y=Y_proj, mode='markers+text', text=labels, textposition='top',  marker=dict(size=10))
iplot({
    'data': [trace],
    'layout': Layout(
        #xaxis=XAxis(title='Left vs Right'), 
        #yaxis=YAxis(title='Biased vs Factual'),
        autosize=False,
        width=1000,
        height=700)},
    show_link=False
)

### PCA Analysis

In [32]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
site_vectors_pca = pca.fit(site_vectors).components_

In [33]:
trace = Scatter(x=site_vectors_pca[0], y=site_vectors_pca[1], mode='markers+text', text=labels, textposition='top',  marker=dict(size=10))
iplot({
    'data': [trace],
    'layout': Layout(
        #xaxis=XAxis(title='Left vs Right'), 
        #yaxis=YAxis(title='Biased vs Factual'),
        autosize=False,
        width=1000,
        height=700)},
    show_link=False
)