## Create a document vector linear classifier using Doc2Vec

In [7]:
import pandas as pd
import json
import glob
import os

import gensim

In [8]:
# Where the JSON files are located
source = '../data/source/part-00000-*.json'
# Where the processed data will be stored
features_directory = '../data/text-spacy-features-original'

In [9]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner', 'tensorizer'])
nlp.pipeline # this should be empty

[]

In [10]:
def tokenize_text(texts):
    docs = [doc for doc in nlp.pipe(texts, batch_size=500, n_threads=8)]
    return docs

In [11]:
def build_df_from_json_files():
    all_dfs = []
    for filename in glob.glob(source):
        print('Parsing file {}'.format(filename))
        df = read_df(filename)
        all_dfs.append(df)
    
    # Merge all daframes together
    df = pd.concat(all_dfs)
    # It takes about 15 seconds per month in my laptop
    df['doc'] = tokenize_text(df['text'])
    
    df['bias'] = df['bias'].astype(str)
    
    # Change bias type to string (avoid issue with sklearn not knowing some datatypes when measuring precision)
    return df

In [12]:
import gzip

def filename_root(filename):
    filename_no_path = os.path.basename(filename)
    return features_directory + '/' + filename_no_path

def read_df(filename):
    df = pd.read_pickle(filename_root(filename) + '.df_site_text_bias.pickle.gz', compression="gzip")
    return df

In [13]:
df = build_df_from_json_files()
df.head()

Parsing file ../data/source/part-00000-fdfd9a6e-3c71-4540-91f5-559870381531.json


Unnamed: 0,uuid,thread.site,text,bias,doc
0,a2547fd206cf2d182e7f58131b0445e5041be533,washingtonexaminer.com,Class action filed over United’s ‘low fare gua...,Bias.RIGHT_CENTER,"(Class, action, filed, over, United, ’s, ‘, lo..."
1,6e8a766deb69148bd1a840d3353d10a3d1d4590a,nydailynews.com,Jupiterimages/Getty Images/Goodshoot RF Snuggl...,Bias.LEFT_CENTER,"(Jupiterimages, /, Getty, Images, /, Goodshoot..."
2,7c14e6606642ecc8c1394458ff1cdf19fda06d06,youngcons.com,Cops have been getting a lot of negative atten...,Bias.RIGHT,"(Cops, have, been, getting, a, lot, of, negati..."
3,608b600a0148d8257145aebb8a29c12199580d01,youngcons.com,Powered by Starbox \nIn the social media satur...,Bias.RIGHT,"(Powered, by, Starbox, \n, In, the, social, me..."
4,f94ff5791ae401d509689e8645c59f91cb8bfc15,nj.com,View/Post Comments 2013 Star-Ledger file photo...,Bias.LEFT_CENTER,"(View, /, Post, Comments, 2013, Star, -, Ledge..."


In [14]:
def is_invalid_token(token):
    return token.is_punct or token.is_space or token.like_url or token.like_num or token.is_digit

def get_words(doc):
    return [token.orth_ for token in doc if not is_invalid_token(token)]

def taggedDocument(doc, bias):
    return gensim.models.doc2vec.TaggedDocument(get_words(doc), [bias])

In [15]:
df['tagged_doc'] = df.apply(lambda row: taggedDocument(row['doc'], row['bias']), axis=1)
df['tagged_doc'].head()

0    ([Class, action, filed, over, United, ’s, low,...
1    ([Jupiterimages, Getty, Images, Goodshoot, RF,...
2    ([Cops, have, been, getting, a, lot, of, negat...
3    ([Powered, by, Starbox, In, the, social, media...
4    ([View, Post, Comments, Star, Ledger, file, ph...
Name: tagged_doc, dtype: object

In [16]:
TEST_DOMAINS =  {
    'bloomberg.com',
     'breitbart.com',
     'c-span.org',
     'chicagotribune.com',
     'chron.com'
}

In [17]:
df_train =  df.loc[~df['thread.site'].isin(TEST_DOMAINS)]
df_valid =  df.loc[df['thread.site'].isin(TEST_DOMAINS)]

train_corpus = df_train['tagged_doc']
valid_corpus = df_valid.apply(lambda row: get_words(row['doc']), axis=1)

In [18]:
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)

In [19]:
model.build_vocab(train_corpus)

In [20]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 2min 56s, sys: 3.55 s, total: 3min
Wall time: 1min 12s


79158212

In [21]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df['bias'].values)

def make_X_Y(docs_words, biases):
    X = [model.infer_vector(words) for words in docs_words]
    Y = encoder.transform(biases)
    return X, Y

In [22]:
train_words = [tagged_words.words for tagged_words in train_corpus]
X_train, Y_train = make_X_Y(train_words, df_train['bias'].values)
X_valid, Y_valid = make_X_Y(valid_corpus, df_valid['bias'].values)

In [23]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
df_valid['pred'] = encoder.inverse_transform(classifier.predict(X_valid))
df_valid.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,uuid,thread.site,text,bias,doc,tagged_doc,pred
23,fef6237649701e532f2259233761d97f66750cc3,chron.com,Assembly Republicans launch their election cam...,Bias.LEFT_CENTER,"(Assembly, Republicans, launch, their, electio...","([Assembly, Republicans, launch, their, electi...",Bias.LEFT_CENTER
36,58d57269304c0263c9b46ac5b3a4282f9e994f65,chron.com,"Rio Rancho 98, Rio Grande 62 Santa Fe 50, Capi...",Bias.LEFT_CENTER,"(Rio, Rancho, 98, ,, Rio, Grande, 62, Santa, F...","([Rio, Rancho, Rio, Grande, Santa, Fe, Capital...",Bias.LEFT_CENTER
38,46d5b0fbab2fbd193c8dd82372a7513a3e51ddaa,chron.com,"Arizona Charter Academy 53, Mayer 22 Chandler ...",Bias.LEFT_CENTER,"(Arizona, Charter, Academy, 53, ,, Mayer, 22, ...","([Arizona, Charter, Academy, Mayer, Chandler, ...",Bias.EXTREME_RIGHT
44,7b6bd4746b80570f30b5caeb9f09447882a88363,breitbart.com,Ingraham: GOP Establishment Now ‘Firmly Entren...,Bias.RIGHT,"(Ingraham, :, GOP, Establishment, Now, ‘, Firm...","([Ingraham, GOP, Establishment, Now, Firmly, E...",Bias.EXTREME_RIGHT
48,fad0e219cca7c610abcf88850f40a183d12b843b,chicagotribune.com,Welcome to the most comprehensive suburban cov...,Bias.LEFT_CENTER,"(Welcome, to, the, most, comprehensive, suburb...","([Welcome, to, the, most, comprehensive, subur...",Bias.LEFT_CENTER


In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Precision = %s" % precision_score(df_valid['bias'], df_valid['pred'], average='macro'))
print("Accuracy = %s" % accuracy_score(df_valid['bias'].values, df_valid['pred'].values))
# print("Recall = %s" % recall_score(df_valid['bias'].values, df_valid['pred'].values)) TODO?

Precision = 0.228005865103
Accuracy = 0.395209580838


In [27]:
df_valid['pred'].head()

23      Bias.LEFT_CENTER
36      Bias.LEFT_CENTER
38    Bias.EXTREME_RIGHT
44    Bias.EXTREME_RIGHT
48      Bias.LEFT_CENTER
Name: pred, dtype: object