# Text NN classifier for Political Bias

In [1]:
import pandas as pd
import numpy as np

batch_size = 64

In [2]:
df = pd.read_csv("../data/source/newsclust.csv")
df = df.query("site != 'cbn.com'")
print(len(df))
df = df.sample(30000)

103262


Compute the bias for each one of the articles, based on the publication's known bias

In [3]:
from bias import Bias

num_classes = 7
df['bias'] = df.apply(lambda row: Bias.get_bias_for_domain(row['site']).value, axis=1)
df.head(2)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias
510,510,2015-01-10T04:27:00.000+02:00,latimes.com,'Diving Normal' explores deep emotional waters...,'Diving Normal' explores deep emotional waters,http://www.latimes.com/entertainment/movies/la...,2
51913,51913,2016-11-15T19:20:00.000+02:00,allenbwest.com,Is personal responsibility dead? More often th...,UNREAL: Look what liberals are blaming on Trum...,http://www.allenbwest.com/analytical-economist...,5


Tokenize the text of the articles, create a vocabulary of words

In [4]:
import spacy
nlp = spacy.load('en')
nlp.pipeline = []

def tokenize_text(texts):
    docs = [doc for doc in nlp.pipe(texts, batch_size=500, n_threads=8)]
    return docs

def is_invalid_token(token):
    return token.is_punct or token.is_space or token.like_url or token.like_num or token.is_digit

def get_words_for_docs(docs):
    return [get_words_ids(doc) for doc in docs]

def get_words_ids(doc):
    return [token.orth for token in doc if not is_invalid_token(token)]

In [5]:
df['docs'] = tokenize_text(df['text'])

In [6]:
df['words'] = get_words_for_docs(df['docs'])
df.head(1)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words
510,510,2015-01-10T04:27:00.000+02:00,latimes.com,'Diving Normal' explores deep emotional waters...,'Diving Normal' explores deep emotional waters,http://www.latimes.com/entertainment/movies/la...,2,"(', Diving, Normal, ', explores, deep, emotion...","[583882, 446008, 119874, 2693, 4224, 58909, 58..."


Keep the most common words as vocabulary, replace the words out of vocabulary by the least frequently used word

In [7]:
from collections import Counter

vocab_size = 5000

def flatten(l):
    flat_list = [item for sublist in l for item in sublist]
    return flat_list

word_freq = Counter(flatten(df['words']))
common_words = word_freq.most_common(vocab_size)
word_freq.most_common(5)

[(501, 758955), (504, 420642), (510, 381346), (512, 338364), (506, 337322)]

In [8]:
print(nlp.vocab.strings[501], nlp.vocab.strings[510], nlp.vocab.strings[512], nlp.vocab.strings[506])

the of and a


In [9]:
vocab = {word_id: i for (i, (word_id, freq)) in enumerate(common_words)}
print(len(vocab), vocab[501], vocab[504], vocab[510])
oov_word = vocab_size - 1
oov_word

5000 0 1 2


4999

In [10]:
def get_words_ids_if_common(words):
    return [vocab.get(word, oov_word) for word in words]

df['words_with_oov'] = df.apply(lambda row: get_words_ids_if_common(row['words']), axis=1)
df['words_with_oov'].head()

510       [4999, 4999, 4999, 1455, 3235, 4568, 4999, 499...
51913     [648, 686, 1714, 991, 484, 461, 63, 28, 14, 10...
68980     [4999, 29, 2706, 341, 0, 996, 2, 0, 4275, 1119...
18984     [531, 4999, 9, 4999, 0, 3558, 2, 4999, 3, 4646...
104526    [4561, 4999, 889, 3, 375, 4999, 5, 4999, 346, ...
Name: words_with_oov, dtype: object

Look at the distribution of words. The longest article has 15K words, the shortest 21.

In [11]:
lens = np.array(list(map(len, df['words'])))
(lens.max(), lens.min(), lens.mean())

(20532, 16, 535.67420000000004)

Split the data for training and validation

In [12]:
TEST_DOMAINS =  {
    'bloomberg.com',
     'breitbart.com',
     'c-span.org',
     'chicagotribune.com',
     'chron.com'
}
df_train =  df.loc[~df['site'].isin(TEST_DOMAINS)]
df_test =  df.loc[df['site'].isin(TEST_DOMAINS)]
print(len(df_train))
print(len(df_test))

27514
2486


Enforce all texts to have the same length, truncating or padding if necessary

In [13]:
seq_len = 1000

from keras.preprocessing import sequence

trn = sequence.pad_sequences(df_train['words_with_oov'], maxlen=seq_len, value=0)
test = sequence.pad_sequences(df_test['words_with_oov'], maxlen=seq_len, value=0)

trn

Using TensorFlow backend.


array([[   0,    0,    0, ..., 1617,   60,   14],
       [   0,    0,    0, ...,   11, 4999, 4999],
       [   0,    0,    0, ...,   54,   15, 4999],
       ..., 
       [   0,    0,    0, ..., 4465,    9,  223],
       [  47, 4999,    8, ..., 2503,  592,  223],
       [   0,    0,    0, ...,    1,  366, 2930]], dtype=int32)

## Single Hidden Layer NN classifier

### Keras NN model

In [14]:
# Force use CPU
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import keras
from keras.models import Sequential, Model
from keras.layers import Embedding
from keras.layers.core import Flatten, Dense, Dropout
from keras.optimizers import Adam

In [15]:
k_model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(7, activation='softmax')])

In [16]:
k_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
k_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 1000, 32)      160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 32000)         0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 100)           3200100     flatten_1[0][0]                  
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 100)           0           dense_1[0][0]                    
___________________________________________________________________________________________

In [17]:
from keras.utils.np_utils import to_categorical

labels_train = np.array(df_train['bias'])
labels_test = np.array(df_test['bias'])

print(labels_test[:3])
k_model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=3, batch_size=batch_size)

[2 5 2]
Train on 27514 samples, validate on 2486 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x22e0a6e48>

Manually verify some articles. The bias for these articles should match the predicted bias below.

In [18]:
df_sample = df_test.sample(5)
df_sample

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words,words_with_oov
34411,34411,2016-02-11T20:57:00.000+02:00,bloomberg.com,Share on LinkedIn Share on Reddit Share on Goo...,Bloomberg Business,http://www.bloomberg.com/news/audio/2016-02-11...,2,"(Share, on, LinkedIn, Share, on, Reddit, Share...","[191732, 542, 777508, 191732, 542, 1230, 19173...","[439, 9, 4999, 439, 9, 4999, 439, 9, 4999, 403..."
68464,68464,2017-02-23T12:07:00.000+02:00,bloomberg.com,Is the Worst Over for Singapore Property Build...,Is the Worst Over for Singapore Property Build...,https://www.bloomberg.com/news/videos/2017-02-...,2,"(Is, the, Worst, Over, for, Singapore, Propert...","[1001, 501, 589508, 5160, 531, 694585, 523361,...","[648, 0, 4999, 1536, 8, 4999, 4999, 4999, 4999..."
34938,34938,2016-02-17T00:57:00.000+02:00,chicagotribune.com,Aurora man charged with home invasion Beacon-N...,Aurora man charged with home invasion,http://www.chicagotribune.com/suburbs/aurora-b...,2,"(Aurora, man, charged, with, home, invasion, B...","[642360, 852, 3772, 548, 1095, 5492, 738235, 2...","[4999, 217, 1002, 13, 207, 4999, 4999, 182, 63..."
43342,43342,2015-06-01T03:00:00.000+03:00,chron.com,"Woman held on charges she robbed dollar store,...","Woman held on charges she robbed dollar store,...",http://www.chron.com/news/crime/article/Woman-...,2,"(Woman, held, on, charges, she, robbed, dollar...","[154911, 2465, 542, 3830, 699, 8997, 1633, 213...","[4999, 502, 9, 844, 61, 4999, 1747, 1477, 651,..."
89723,89723,2015-11-16T02:00:00.000+02:00,breitbart.com,Speaker Paul Ryan: Congress May Cut Funding Fr...,Speaker Paul Ryan: Congress May Cut Funding Fr...,http://www.breitbart.com/big-government/2015/1...,5,"(Speaker, Paul, Ryan, :, Congress, May, Cut, F...","[284569, 1469, 634801, 2242, 4323, 9497, 43769...","[2093, 547, 641, 394, 268, 4999, 4999, 488, 10..."


In [19]:
sample = sequence.pad_sequences(df_sample['words_with_oov'], maxlen=seq_len, value=0)
np.argmax(k_model.predict(sample), axis=0)

array([0, 0, 2, 1, 1, 4, 2])

## Site vectors

Remove the last linear classifier

In [20]:
l4 = k_model.pop()
l3 = k_model.pop()

In [21]:
k_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
k_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 1000, 32)      160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 32000)         0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 100)           3200100     flatten_1[0][0]                  
Total params: 3,360,100
Trainable params: 3,360,100
Non-trainable params: 0
____________________________________________________________________________________________________


In [22]:
article_vectors = k_model.predict(trn)
article_vectors.shape

(27514, 100)

Generate an average vector of all the article vectors for each site

In [23]:
site_vectors = []
labels = []
for site in df_train['site'].unique():
    site_indexes = np.where(df_train["site"] == site)[0]
    if len(site_indexes) > 0:
        site_vector = np.mean(article_vectors[site_indexes, :], axis=0)
        site_vectors.append(site_vector)
        labels.append(site)
site_vectors = np.array(site_vectors)
site_vectors.shape

(57, 100)

In [24]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, random_state=0)
# np.set_printoptions(suppress=True)
site_vectors_2_dim = tsne_model.fit_transform(site_vectors)
X_proj = site_vectors_2_dim[:, 0]
Y_proj = site_vectors_2_dim[:, 1]

In [25]:
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

from plotly.graph_objs import Bar, Scatter, Figure, Layout, XAxis, YAxis

In [26]:
trace = Scatter(x=X_proj, y=Y_proj, mode='markers+text', text=labels, textposition='top',  marker=dict(size=10))
iplot({
    'data': [trace],
    'layout': Layout(
        #xaxis=XAxis(title='Left vs Right'), 
        #yaxis=YAxis(title='Biased vs Factual'),
        autosize=False,
        width=1000,
        height=700)},
    show_link=False
)