## Create a document linear classifier using Doc2Vec

In [1]:
import pandas as pd
import json
import glob
import os

import gensim

Using TensorFlow backend.


In [2]:
df = pd.read_csv("data/source/newsclust.csv")
df = df.query("site != 'cbn.com'")
df = df.sample(int(len(df) / 30)) # sample 1 month out of 30 for faster training

In [3]:
from bias import Bias

num_classes = 7
df['bias'] = df.apply(lambda row: Bias.get_bias_for_domain(row['site']).value, axis=1)
df.head(2)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias
32267,32267,2015-05-13T15:25:00.000+03:00,nytimes.com,MOBILE TECH KEY TO AOL-VERIZON MERGER | AOL ha...,Morning Agenda: Mobile Tech Key to AOL-Verizon...,http://news.blogs.nytimes.com/2015/05/13/morni...,2
41587,41587,2017-05-21T03:45:00.000+03:00,westernjournalism.com,Saudi King Shatters Tradition To Welcome Melan...,Saudi King Shatters Tradition To Welcome Melan...,http://www.westernjournalism.com/saudi-king-sh...,5


In [4]:
import spacy
nlp = spacy.load('en')
nlp.pipeline = []

def tokenize_text(texts):
    docs = [doc for doc in nlp.pipe(texts, batch_size=500, n_threads=8)]
    return docs

In [5]:
df['docs'] = tokenize_text(df['title'].astype(str))

In [6]:
def is_invalid_token(token):
    return token.is_punct or token.is_space or token.like_url or token.like_num or token.is_digit

def get_words(doc):
    return [token.orth_ for token in doc if not is_invalid_token(token)]

def taggedDocument(doc, bias):
    return gensim.models.doc2vec.TaggedDocument(get_words(doc), [bias])

In [7]:
df['tagged_doc'] = df.apply(lambda row: taggedDocument(row['docs'], row['bias']), axis=1)
df['tagged_doc'].head()

32267    ([Morning, Agenda, Mobile, Tech, Key, to, AOL,...
41587    ([Saudi, King, Shatters, Tradition, To, Welcom...
43816    ([D.C., judge, grants, retrial, in, killing, o...
75254    ([Tim, Kaine, Blasts, Donald, Trump, and, Mike...
83703    ([McCarthy, Dems, Will, Be, Punished, For, Gun...
Name: tagged_doc, dtype: object

In [8]:
TEST_DOMAINS =  {
    'bloomberg.com',
     'breitbart.com',
     'c-span.org',
     'chicagotribune.com',
     'chron.com'
}

In [9]:
df_train =  df.loc[~df['site'].isin(TEST_DOMAINS)]
df_valid =  df.loc[df['site'].isin(TEST_DOMAINS)]

train_corpus = df_train['tagged_doc']
valid_corpus = df_valid.apply(lambda row: get_words(row['docs']), axis=1)

In [10]:
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)

In [11]:
model.build_vocab(train_corpus)

In [12]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 9.59 s, sys: 2.9 s, total: 12.5 s
Wall time: 9.12 s


1542624

In [13]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df['bias'].values)

def make_X_Y(docs_words, biases):
    X = [model.infer_vector(words) for words in docs_words]
    Y = encoder.transform(biases)
    return X, Y

In [14]:
train_words = [tagged_words.words for tagged_words in train_corpus]
X_train, Y_train = make_X_Y(train_words, df_train['bias'].values)
X_valid, Y_valid = make_X_Y(valid_corpus, df_valid['bias'].values)

In [15]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
df_valid['pred'] = encoder.inverse_transform(classifier.predict(X_valid))
df_valid.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,tagged_doc,pred
104741,104741,2014-12-19T12:46:00.000+02:00,bloomberg.com,Photographer: Jason Alden/Bloomberg Petroleo B...,Crude Below $60 Tests Petrobras’ Deepwater Dis...,http://www.bloomberg.com/news/2014-12-19/crude...,2,"(Crude, Below, $, 60, Tests, Petrobras’, Deepw...","([Crude, Below, $, Tests, Petrobras’, Deepwate...",3
82333,82333,2016-07-17T10:15:00.000+03:00,breitbart.com,"by AP 17 Jul 2016 0 17 Jul, 2016 17 Jul, 2016 ...",2 more arrests in Nice truck attack; 49 dead s...,http://www.breitbart.com/news/2-more-arrests-i...,5,"(2, more, arrests, in, Nice, truck, attack, ;,...","([more, arrests, in, Nice, truck, attack, dead...",6
54866,54866,2016-09-30T10:00:00.000+03:00,chicagotribune.com,Teammates and coaches alike refer to Maine Sou...,"Peyton Turner serves as team mom, helps Maine ...",http://www.chicagotribune.com/suburbs/suburbs/...,2,"(Peyton, Turner, serves, as, team, mom, ,, hel...","([Peyton, Turner, serves, as, team, mom, helps...",1
84934,84934,2017-04-08T00:21:00.000+03:00,breitbart.com,Gorka on Syria: ‘This Is Not a Full-Throated W...,Gorka on Syria: ‘This Is Not a Full-Throated W...,http://www.breitbart.com/radio/2017/04/07/gork...,5,"(Gorka, on, Syria, :, ‘, This, Is, Not, a, Ful...","([Gorka, on, Syria, This, Is, Not, a, Full, Th...",2
102199,102199,2015-10-12T18:18:00.000+03:00,c-span.org,Arvol Looking Horse Chief Lakota Sioux Nation ...,Millions Justice March | Video | C-SPAN.org,http://www.c-span.org/video/?328654-1/millions...,3,"(Millions, Justice, March, |, Video, |, C, -, ...","([Millions, Justice, March, |, Video, |, C], [3])",3


In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Precision = %s" % precision_score(df_valid['bias'], df_valid['pred'], average='macro'))
print("Accuracy = %s" % accuracy_score(df_valid['bias'].values, df_valid['pred'].values))
# print("Recall = %s" % recall_score(df_valid['bias'].values, df_valid['pred'].values)) TODO?

Precision = 0.205178571429
Accuracy = 0.373665480427


In [18]:
df_valid['pred'].head()

104741    3
82333     6
54866     1
84934     2
102199    3
Name: pred, dtype: int64