## Create a dataframe that stores the site, text and bias of all articles under data/source

In [1]:
import pandas as pd
import json
import glob

Load all JSON files under data/source

In [2]:
# Glob pattern matching every JSON file under data/source.
path = '../data/source/*.json'
# Accumulates one parsed JSON object per non-empty input line.
data = []
for filename in glob.glob(path):
    # Decode explicitly as UTF-8: the article text contains non-ASCII
    # characters (e.g. typographic quotes), so relying on the platform
    # default encoding can fail or garble the text.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Every line is parsed as its own JSON document; skip blank lines
            # (such as a trailing empty line) instead of letting json.loads
            # raise on them.
            if line.strip():
                data.append(json.loads(line))

Select the columns we need, drop articles from cbn.com (no bias is defined for that site), rename columns where necessary (to avoid a bcolz exception during save), and show some sample rows

In [3]:
# Flatten the parsed JSON records into a table; the site of each article is
# read from the resulting 'thread.site' column.
articles = pd.io.json.json_normalize(data)
wanted_columns = ['uuid', 'thread.site', 'text']
# Remove cbn.com (no bias defined in spreadsheet)
has_bias = articles['thread.site'] != 'cbn.com'
# Keep only the wanted columns for the remaining rows and give the site
# column a name without a dot.
df = articles.loc[has_bias, wanted_columns].rename(columns={'thread.site': 'site'})
df.head()

Unnamed: 0,uuid,site,text
0,a2547fd206cf2d182e7f58131b0445e5041be533,washingtonexaminer.com,Class action filed over United’s ‘low fare gua...
1,6e8a766deb69148bd1a840d3353d10a3d1d4590a,nydailynews.com,Jupiterimages/Getty Images/Goodshoot RF Snuggl...
2,7c14e6606642ecc8c1394458ff1cdf19fda06d06,youngcons.com,Cops have been getting a lot of negative atten...
3,608b600a0148d8257145aebb8a29c12199580d01,youngcons.com,Powered by Starbox \nIn the social media satur...
4,f94ff5791ae401d509689e8645c59f91cb8bfc15,nj.com,View/Post Comments 2013 Star-Ledger file photo...


Find the bias for each article

In [4]:
from bias import Bias

# Look up the bias for each article from its site. Mapping over the 'site'
# column calls the lookup once per value without building a Series for every
# row (as a row-wise apply does), and it also works on an empty frame.
df['bias'] = df['site'].map(Bias.get_bias_for_domain)
df.head()

Unnamed: 0,uuid,site,text,bias
0,a2547fd206cf2d182e7f58131b0445e5041be533,washingtonexaminer.com,Class action filed over United’s ‘low fare gua...,Bias.RIGHT_CENTER
1,6e8a766deb69148bd1a840d3353d10a3d1d4590a,nydailynews.com,Jupiterimages/Getty Images/Goodshoot RF Snuggl...,Bias.LEFT_CENTER
2,7c14e6606642ecc8c1394458ff1cdf19fda06d06,youngcons.com,Cops have been getting a lot of negative atten...,Bias.RIGHT
3,608b600a0148d8257145aebb8a29c12199580d01,youngcons.com,Powered by Starbox \nIn the social media satur...,Bias.RIGHT
4,f94ff5791ae401d509689e8645c59f91cb8bfc15,nj.com,View/Post Comments 2013 Star-Ledger file photo...,Bias.LEFT_CENTER


Store the dataframe on disk as a bcolz ctable under data/computed

In [33]:
import bcolz
import os

# Store the data
directory = '../data/computed'
# Create the output directory on first run.
if not os.path.exists(directory):
    os.makedirs(directory)
# Build the target path with os.path.join so the table lands inside
# `directory`; plain string concatenation has no separator and yields
# '../data/computeddf_site_text_bias.bcolz' next to the directory instead.
rootdir = os.path.join(directory, 'df_site_text_bias.bcolz')
ct = bcolz.ctable.fromdataframe(df, rootdir=rootdir)