In [1]:
%load_ext autoreload

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from sklearn.utils import shuffle
from torchsample.initializers import XavierUniform, Uniform
from torchsample.modules import ModuleTrainer
from torchsample.metrics import CategoricalAccuracy

%aimport torchsample.modules



In [2]:
use_cuda = False
batch_size = 64

In [3]:
df = pd.read_csv("../data/source/newsclust.csv")
df = df.query("site != 'cbn.com'")
print(len(df))
# df = df.sample(3000)
df = df.sample(30000)

103262


In [4]:
from bias import Bias

num_classes = 7
df['bias'] = df.apply(lambda row: Bias.get_bias_for_domain(row['site']).value, axis=1)
df.head(2)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias
42862,42862,2015-06-16T03:00:00.000+03:00,chron.com,Governor signs bill on $25 withdrawal for welf...,Governor signs bill on $25 withdrawal for welf...,http://www.chron.com/news/article/Governor-sig...,2
75576,75576,2016-09-22T03:00:00.000+03:00,wsj.com,When we wrote last month that Margrethe Vestag...,Vestager Gets Vindictive,http://online.wsj.com/articles/vestager-gets-v...,4


In [5]:
import spacy
nlp = spacy.load('en')
nlp.pipeline = []

def tokenize_text(texts):
    docs = [doc for doc in nlp.pipe(texts, batch_size=500, n_threads=8)]
    return docs

def is_invalid_token(token):
    return token.is_punct or token.is_space or token.like_url or token.like_num or token.is_digit

def get_words_for_docs(docs):
    return [get_words_ids(doc) for doc in docs]

def get_words_ids(doc):
    return [token.orth for token in doc if not is_invalid_token(token)]

In [6]:
df['docs'] = tokenize_text(df['text'])

In [7]:
df['words'] = get_words_for_docs(df['docs'])
df.head(1)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words
42862,42862,2015-06-16T03:00:00.000+03:00,chron.com,Governor signs bill on $25 withdrawal for welf...,Governor signs bill on $25 withdrawal for welf...,http://www.chron.com/news/article/Governor-sig...,2,"(Governor, signs, bill, on, $, 25, withdrawal,...","[5916, 3245, 1734, 542, 448, 447933, 531, 3549..."


In [8]:
from collections import Counter

vocab_size = 5000

def flatten(l):
    flat_list = [item for sublist in l for item in sublist]
    return flat_list

word_freq = Counter(flatten(df['words']))
common_words = word_freq.most_common(vocab_size)
word_freq.most_common(5)

[(501, 764843), (504, 422008), (510, 382700), (506, 338671), (512, 338549)]

In [9]:
print(nlp.vocab.strings[501], nlp.vocab.strings[510], nlp.vocab.strings[512], nlp.vocab.strings[506])

the of and a


In [10]:
vocab = {word_id: i for (i, (word_id, freq)) in enumerate(common_words)}
print(len(vocab), vocab[501], vocab[504], vocab[510])
oov_word = vocab_size - 1
oov_word

5000 0 1 2


4999

In [11]:
def get_words_ids_if_common(words):
    return [vocab.get(word, oov_word) for word in words]

df['words_with_oov'] = df.apply(lambda row: get_words_ids_if_common(row['words']), axis=1)
df['words_with_oov'].head()

42862    [1764, 1754, 410, 9, 79, 4999, 8, 3318, 4999, ...
75576    [243, 43, 378, 104, 325, 6, 4999, 4999, 12, 24...
36282    [4999, 4, 0, 4999, 318, 2, 181, 3152, 4999, 49...
1020     [4999, 4999, 2, 53, 4325, 356, 91, 1, 4999, 13...
20457    [258, 4999, 1262, 2393, 651, 874, 178, 249, 49...
Name: words_with_oov, dtype: object

In [12]:
lens = np.array(list(map(len, df['words'])))
(lens.max(), lens.min(), lens.mean())

(15046, 21, 538.34870000000001)

In [13]:
TEST_DOMAINS =  {
    'bloomberg.com',
     'breitbart.com',
     'c-span.org',
     'chicagotribune.com',
     'chron.com'
}
df_train =  df.loc[~df['site'].isin(TEST_DOMAINS)]
df_test =  df.loc[df['site'].isin(TEST_DOMAINS)]
print(len(df_train))
print(len(df_test))

27383
2617


In [14]:
seq_len = 1000

from keras.preprocessing import sequence

trn = sequence.pad_sequences(df_train['words_with_oov'], maxlen=seq_len, value=0)
test = sequence.pad_sequences(df_test['words_with_oov'], maxlen=seq_len, value=0)

trn_tensor = torch.from_numpy(trn).long()
test_tensor = torch.from_numpy(test).long()

trn

Using TensorFlow backend.


array([[   0,    0,    0, ...,   38, 1391,   59],
       [   0,    0,    0, ...,  153,    0, 1079],
       [   0,    0,    0, ...,   31, 4999,  626],
       ..., 
       [   0,    0,    0, ...,   71,  456,  604],
       [   0,    0,    0, ..., 4999, 1591, 4999],
       [   0,    0,    0, ..., 4999,  453,   29]], dtype=int32)

In [15]:
labels_train_tensor = torch.from_numpy(np.array(df_train['bias']))
labels_test_tensor = torch.from_numpy(np.array(df_test['bias']))
labels_train_tensor[:3]


 4
 3
 2
[torch.LongTensor of size 3]

In [16]:
import torch.nn as nn
import torch.nn.functional as F

class SingleHiddenLayerModule(nn.Module):
    def __init__(self):
        super().__init__()
        num_dimensions = 32
        self.embedding = nn.Embedding(vocab_size, num_dimensions)
        self.fc1 = nn.Linear(seq_len * num_dimensions, 100)
        self.dropout = nn.Dropout(0.7)
        self.fc2 = nn.Linear(100, num_classes)
        self.init()

    def forward(self, words_ids):
        x = self.embedding(words_ids) # x => torch.Size([64, 1000, 32])
        x = x.view(x.size(0), -1) # x => torch.Size([64, 16000])
        x = self.fc1(x)
        x = F.relu(x, True)
        x = self.dropout(x)
        x = self.fc2(x)
        result = x
        return result
    
    def init(self):
        torch.nn.init.constant(self.fc1.bias, val=0.0)
        torch.nn.init.constant(self.fc2.bias, val=0.0)

In [17]:
%autoreload 2

criterion = nn.CrossEntropyLoss()
model = SingleHiddenLayerModule()
if(use_cuda):
    model.cuda()
    criterion.cuda()
trainer = ModuleTrainer(model)
trainer.set_optimizer(optim.Adam, lr=1e-3)
trainer.set_loss(criterion)
trainer.set_initializers([Uniform(module_filter="embedding*", a=-0.05, b=0.05), XavierUniform(module_filter="fc*")])
trainer.set_metrics([CategoricalAccuracy()])

model

SingleHiddenLayerModule (
  (embedding): Embedding(5000, 32)
  (fc1): Linear (32000 -> 100)
  (dropout): Dropout (p = 0.7)
  (fc2): Linear (100 -> 7)
)

In [18]:
trainer.fit(trn_tensor, labels_train_tensor, val_data=(test_tensor, labels_test_tensor), 
            nb_epoch=10, batch_size=batch_size, shuffle=True)

Epoch 1/10: 429 batches [00:37, 11.34 batches/s, val_loss=1.4235, loss=1.4448, val_acc=44.17, acc=43.22]
Epoch 2/10: 429 batches [00:34,  5.75 batches/s, val_loss=1.4370, loss=0.7257, val_acc=71.67, acc=73.90]
Epoch 3/10: 429 batches [00:34,  5.77 batches/s, val_loss=1.4744, loss=0.3984, val_acc=82.65, acc=86.34]
Epoch 4/10: 429 batches [00:34, 12.54 batches/s, val_loss=1.9989, loss=0.2410, val_acc=87.62, acc=92.02]
Epoch 5/10: 429 batches [00:34, 12.51 batches/s, val_loss=2.2060, loss=0.1636, val_acc=90.18, acc=94.73]
Epoch 6/10: 429 batches [00:34,  5.71 batches/s, val_loss=2.5539, loss=0.1175, val_acc=90.99, acc=96.18]
Epoch 7/10:  63%|██████▎   | 269/428 [00:21<00:13, 12.17 batches/s, loss=0.0900, acc=97.08]


KeyboardInterrupt: 