## Create a DataFrame that stores the site, text and bias of all articles, and tokenize the text using spaCy

In [1]:
import pandas as pd
import json
import glob
import os

In [2]:
# Glob pattern matching the JSON-lines source files
source = '../data/source/*.json'
# Directory where the processed data will be stored
save_directory = '../data/text-spacy-features'

# exist_ok avoids the check-then-create race and makes re-running this cell safe.
os.makedirs(save_directory, exist_ok=True)

Load all JSON files under data/source

In [3]:
# Returns the (df, docs) pair produced for the last processed file.
def apply_on_each_json_file(f):
    """Apply ``f`` to every JSON-lines file matched by the module-level ``source`` pattern.

    Each file is read line by line, every non-blank line is decoded as one
    JSON document, and ``f(articles, filename)`` is called with the decoded
    list and the file's path.

    Parameters
    ----------
    f : callable
        Called as ``f(articles, filename)``; must return a ``(df, docs)`` pair.

    Returns
    -------
    tuple
        The ``(df, docs)`` pair returned by ``f`` for the last file, where
        "last" is the final path in sorted order.

    Raises
    ------
    FileNotFoundError
        If no file matches ``source`` (instead of an opaque UnboundLocalError).
    """
    # glob returns paths in arbitrary order; sort so the run (and the
    # "last processed" return value) is deterministic.
    filenames = sorted(glob.glob(source))
    if not filenames:
        raise FileNotFoundError('No JSON files match {!r}'.format(source))

    for filename in filenames:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(filename, 'r', encoding='utf-8') as file:
            # Skip blank lines (e.g. a trailing newline) that json.loads rejects.
            articles = [json.loads(line) for line in file if line.strip()]
        df, docs = f(articles, filename)
    return df, docs

Select features and remove unknown sites

In [4]:
def select_features(articles):
    """Flatten the article records and keep only the columns used downstream.

    Parameters
    ----------
    articles : list of dict
        Decoded JSON article records; nested keys are flattened with ``.``
        so the site ends up in the ``thread.site`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``uuid``, ``thread.site`` and ``text``, with every row whose
        site is ``cbn.com`` removed. The original row index is preserved.
        An empty input yields an empty frame with the same three columns.
    """
    columns = ['uuid', 'thread.site', 'text']
    if not articles:
        # json_normalize([]) has no columns, so the selection below would raise KeyError.
        return pd.DataFrame(columns=columns)

    # pandas >= 1.0 exposes json_normalize at the top level; older releases
    # only provide it under pd.io.json.
    normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
    df = normalize(articles)[columns]
    # Remove cbn.com (no bias defined in spreadsheet).
    # .copy() detaches the result from the filtered slice so that adding
    # columns later does not raise SettingWithCopyWarning.
    return df[df['thread.site'] != 'cbn.com'].copy()

Find the bias for each article

In [5]:
from bias import Bias

def add_bias_to_df(df):
    """Add a ``bias`` column holding the bias looked up for each row's site.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``thread.site`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``bias`` set to
        ``Bias.get_bias_for_domain(site)`` for every row.
    """
    # Map over the single column instead of a row-wise apply: it avoids
    # building a Series per row and, unlike apply(axis=1), still yields a
    # Series (not a DataFrame) when the frame is empty.
    df['bias'] = df['thread.site'].map(Bias.get_bias_for_domain)
    return df

Store the selected features

In [6]:
import gzip

def filename_root(filename):
    """Return the output path prefix for ``filename`` inside ``save_directory``.

    Only the final path component of ``filename`` is kept; it is appended to
    the module-level ``save_directory`` with a ``/`` separator.
    """
    return '/'.join([save_directory, os.path.basename(filename)])

def persist_df_and_docs(filename, df, docs):
    """Write the dataframe and the parsed docs for one source file to disk.

    Both output paths are built from ``filename_root(filename)``:
    the dataframe is written as a gzip-compressed pickle, and the docs are
    written one after another to a gzip file using each doc's ``to_bytes()``.
    """
    root = filename_root(filename)

    # Dataframe -> gzip-compressed pickle.
    df.to_pickle(root + '.df_site_text_bias.pickle.gz', compression="gzip")

    # Parsed docs -> their own binary format (pickle doesn't work for them).
    with gzip.open(root + 'docs.bin.gz', 'wb') as out:
        out.writelines(doc.to_bytes() for doc in docs)

Split the text with spacy's tokenizer

In [7]:
import spacy
# Load the spaCy model named 'en' once; `nlp` is the shared pipeline object
# used by tokenize_text and read_docs_from_file.
nlp = spacy.load('en')

In [8]:
def tokenize_text(df, batch_size=10000, n_threads=4):
    """Run the module-level ``nlp`` pipeline over ``df['text']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column.
    batch_size : int, optional
        Forwarded to ``nlp.pipe``; defaults to the previously hard-coded 10000.
    n_threads : int, optional
        Forwarded to ``nlp.pipe``; defaults to the previously hard-coded 4.

    Returns
    -------
    list
        One parsed doc per row of ``df``, in row order.
    """
    return list(nlp.pipe(df['text'], batch_size=batch_size, n_threads=n_threads))

In [9]:
def process_articles(articles, filename):
    # Per-file pipeline: select the feature columns, attach the bias label,
    # parse the text with spaCy, then persist the dataframe and the docs.
    # `articles` is the list of decoded JSON records for one source file and
    # `filename` is that file's path (used to name the outputs and in the log line).
    # Returns the (df, docs) pair for this file.
    df = select_features(articles)
    df = add_bias_to_df(df)
    # %time is an IPython magic, so this function only runs inside IPython/Jupyter;
    # it prints the timing and the assignment receives the call's result.
    docs = %time tokenize_text(df)
    %time persist_df_and_docs(filename, df, docs)
    print('NLP proccessed {} articles from {}'.format(len(df), filename))
    return df, docs

In [10]:
from spacy.tokens.doc import Doc

def read_docs_from_file(filename):
    """Load the parsed docs stored in the gzip file ``filename``.

    The file is split into per-doc byte strings by ``Doc.read_bytes``; each
    one is deserialized into a fresh ``Doc`` bound to ``nlp.vocab``.
    Returns the docs as a list, in file order.
    """
    with gzip.open(filename, 'rb') as stream:
        return [Doc(nlp.vocab).from_bytes(raw) for raw in Doc.read_bytes(stream)]

def read_df_and_docs(filename):
    """Load the dataframe and parsed docs previously persisted for ``filename``.

    Both input paths are built from ``filename_root(filename)``.
    Returns a ``(df, docs)`` pair.
    """
    root = filename_root(filename)
    # Dataframe: read back from the gzip-compressed pickle.
    frame = pd.read_pickle(root + '.df_site_text_bias.pickle.gz', compression="gzip")
    # Parsed docs: read back from their binary format.
    parsed = read_docs_from_file(root + 'docs.bin.gz')
    return frame, parsed

Run the pipeline:

In [11]:
# Process every source file; each file's results are persisted to disk, and
# only the (df, docs) pair of the last processed file is kept in memory here.
df, docs = apply_on_each_json_file(process_articles)

CPU times: user 2min 13s, sys: 764 ms, total: 2min 14s
Wall time: 54.8 s
CPU times: user 48.6 s, sys: 421 ms, total: 49.1 s
Wall time: 48.7 s
NLP proccessed 3437 articles from ../data/source/part-00000-fdfd9a6e-3c71-4540-91f5-559870381531.json
CPU times: user 2min 22s, sys: 2.01 s, total: 2min 25s
Wall time: 51 s
CPU times: user 50.6 s, sys: 472 ms, total: 51 s
Wall time: 50.6 s
NLP proccessed 4637 articles from ../data/source/part-00001-fdfd9a6e-3c71-4540-91f5-559870381531.json
CPU times: user 2min 10s, sys: 667 ms, total: 2min 11s
Wall time: 48.8 s
CPU times: user 46.4 s, sys: 275 ms, total: 46.6 s
Wall time: 46.1 s
NLP proccessed 3163 articles from ../data/source/part-00002-fdfd9a6e-3c71-4540-91f5-559870381531.json
CPU times: user 4min 6s, sys: 2.67 s, total: 4min 9s
Wall time: 1min 19s
CPU times: user 53.2 s, sys: 362 ms, total: 53.5 s
Wall time: 53 s
NLP proccessed 4468 articles from ../data/source/part-00003-fdfd9a6e-3c71-4540-91f5-559870381531.json
CPU times: user 4min 5s, sys: 

Verify the data has been correctly stored:

In [12]:
# Reload the persisted output for part-00000 (this overwrites the df/docs left
# by the pipeline run above) and preview the first rows of the dataframe.
df, docs = read_df_and_docs('part-00000-fdfd9a6e-3c71-4540-91f5-559870381531.json')
df.head()

Unnamed: 0,uuid,thread.site,text,bias
0,a2547fd206cf2d182e7f58131b0445e5041be533,washingtonexaminer.com,Class action filed over United’s ‘low fare gua...,Bias.RIGHT_CENTER
1,6e8a766deb69148bd1a840d3353d10a3d1d4590a,nydailynews.com,Jupiterimages/Getty Images/Goodshoot RF Snuggl...,Bias.LEFT_CENTER
2,7c14e6606642ecc8c1394458ff1cdf19fda06d06,youngcons.com,Cops have been getting a lot of negative atten...,Bias.RIGHT
3,608b600a0148d8257145aebb8a29c12199580d01,youngcons.com,Powered by Starbox \nIn the social media satur...,Bias.RIGHT
4,f94ff5791ae401d509689e8645c59f91cb8bfc15,nj.com,View/Post Comments 2013 Star-Ledger file photo...,Bias.LEFT_CENTER


In [13]:
# Show the first reloaded doc as a spot check of the stored parse.
docs[:1]

[Class action filed over United’s ‘low fare guarantee’ Legal Newsline Staff Writer • | January 29, 2015 | 4:14 pm 
 HOUSTON (Legal Newsline) - A Peoria County, Ill., man sued United Airlines on Jan. 21, alleging misrepresentation of its online fare prices, wrongful conduct and deceptive practices. 
 Scott Coulier alleged United's “low fare guarantee” pricing advertised on its website is fraudulent. He traveled from Peoria International Airport to Orlando Airport on March 1 with two family members, having purchased three one-way tickets in a single transaction on Jan. 26, 2014. He alleged in the suit that he could have bought the tickets for a lower overall price if he had purchased them separately. Please enter your email address below to begin receiving the Politics Today newsletter. You must enter a valid email address in the field above! Thank you for signing up for the Politics Today newsletter! You should receive your first newsletter very soon. We're sorry, there was an error pro