In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [31]:
credible = pd.read_csv('corpus/credible.csv', header=None, names=['id', 'type', 'domain', 'content'])

In [32]:
credible['domain'].unique()

array(['baptistnews.com', 'nationalreview.com', 'weeklystandard.com',
       'theintercept.com', 'foreignpolicyjournal.com', 'www.latimes.com',
       'www.cbsnews.com', 'www.nytimes.com', 'in.reuters.com',
       'af.reuters.com', 'www.msn.com', 'abcnews.go.com', 'uk.reuters.com',
       'www.usatoday.com', 'www.npr.org', 'www.wsj.com',
       'washpost.bloomberg.com', 'www.theatlantic.com',
       'www.washingtonpost.com', 'www.theguardian.com', 'www.reuters.com',
       'www.bloomberg.com', 'feeds.reuters.com', 'www.politico.com',
       'www.buzzfeed.com', 'www.nbcnews.com', 'online.wsj.com',
       'ca.reuters.com', 'nytimes.com'], dtype=object)

In [33]:
credible = credible[(credible['domain'] != 'www.msn.com') & (credible['domain'] != 'feed.reuters.com')]

In [34]:
others = credible[~credible['domain'].isin(['nytimes.com', 'nationalreview.com', 'www.reuters.com', 'weeklystandard.com'])]

In [35]:
len(others)

91269

In [36]:
import spacy
from spacy_langdetect import LanguageDetector
nlp = spacy.load('en')
nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
import re
from unidecode import unidecode

In [37]:
class cleaner(dict):
    """ Multiple-string-substitution dict """
    def _make_regex(self):
        """ Build re object based on the keys of the dictionary it is instantiated with"""
        return re.compile("|".join(map(re.escape, self.keys(  ))))

    def __call__(self, match):
        """ Handler invoked for each regex match """
        return self[match.group(0)]

    def clean(self, text):
        """ Substitutes with value for each key and returns the modified text. """
        return self._make_regex(  ).sub(self, text)

In [71]:
replacements = {"\n": "", # new line characters
                "\t": "", # tabs
                "--": "-",
                "won't": "will not",
                "&": " and ",
                "Loading...": "",
                "Continued...": "",
                "\N{COPYRIGHT SIGN}": "",
                "\N{NO-BREAK SPACE}": "",
                "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}": "",
                "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}": "",
                '."': '".',
                '?"': '"?',
                '!"': '"!'
               }

In [82]:
entities = {'PERSON': 'person',
            'FAC': 'landmark',
            'ORG': 'organization',
            'GPE': 'place',
            'LOC': 'location',
            'EVENT': 'event',
            'WORK_OF_ART': 'artwork',
            'LAW': 'law',
            'DATE': 'date',
            'TIME': 'time',
            'PERCENT': 'percent',
            'MONEY': 'money',
            'QUANTITY': 'quantity',
            'CARDINAL': 'number'
}

In [73]:
preprocess = cleaner(replacements)

In [80]:
def process(in_doc):
    count = 0
    out_doc = ''
    doc = nlp(in_doc)
    if doc._.language['language'] != 'en':
        return np.nan
    for sent in doc.sents:
        ending = sent[-1]
        if ending.pos_ != 'PUNCT':
            continue
        text = sent.text
        for ent in doc.ents:
            text = re.sub(ent.text, entities.get(ent.label_, ent.text), text)
        out_doc += (text + ' ')
        if ending.text in ['.', '?', '!']:
            count += 1
    return out_doc

In [78]:
def reformat(article):
    text = unidecode(article)
    if text.count('"') % 2 != 0:
        return np.nan
    text = preprocess.clean(text)
    text = re.sub('[\[\(][^\[\(]*[\]\)]', '', text) # delete text inside parentheses or brackets
    text = re.sub(r"\b(\w*)n't", "\1 not", text) # replace "xxn't" contractions with "xx not"; "won't" already handled
    text = re.sub(r"^'|'$|(?<= )'|'(?= )", '"', text) # replace single quotes, but not apostrophes, with double quotes
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return np.nan
    text = re.sub(r'"[^"]*"', '', text)
    text = re.sub(' +', ' ', text) # reduce all multiple spaces to single spaces
    return process(text)

In [76]:
others.sample(10)

Unnamed: 0,id,type,domain,content
420090,8170626,reliable,uk.reuters.com,18pm GMT Italy PM Renzi turns on party rebels...
444494,8235657,reliable,www.washingtonpost.com,"A Donald Trump rally in Leesburg, Va., two day..."
18403,1038574,political,foreignpolicyjournal.com,"The world will not miss Belmokhtar, but in a p..."
48885,3288066,political,baptistnews.com,I had been a freshman in college for hardly a ...
422815,8178550,reliable,www.latimes.com,"Nov. 5, 2016, 4:39 p.m. \nStick around for liv..."
415516,8158200,reliable,uk.reuters.com,What are Donald Trump's key policies? 2:24pm G...
376277,8045302,reliable,in.reuters.com,"Technology News | Mon Nov 28, 2016 | 10:25am I..."
360171,7998944,reliable,www.washingtonpost.com,CAIRO — Egypt has unearthed a city and cemeter...
382989,8065819,reliable,www.theguardian.com,Does this sound familiar? “The American people...
435784,8213307,reliable,online.wsj.com,Donald Trump has won the battleground states o...


In [81]:
reformat(others.loc[364073, 'content'])

"Let friends in your social network know what you are reading about orhanization number things you need to know date The biggest news to start your date. Post to orhanization number things you need to know date The biggest news to start your date. Check out this story on USATODAY.com: orhanization A link has been sent to your friend's email address. Posted! A link has been posted to your orhanization feed. EST date person speaks in place on date. orhanization to meet with former place Gov. person A lot has changed since person competed against place Gov. person and number other rivals for the orhanization presidential nomination. On date, the president-elect will meet with person , who may have a spot in the orhanization as head of the orhanization or orhanization departments or the landmark. person is number of several high-profile visitors who have met with orhanization, which includes date presidential candidate person, former place Mayor person and place Gov. person. Also on date, 

In [83]:
others.loc[364073, 'content']

'Let friends in your social network know what you are reading about Facebook Email 5 things you need to know Monday The biggest news to start your day. Post to Facebook 5 things you need to know Monday The biggest news to start your day. Check out this story on USATODAY.com: http://usat.ly/2gcprwB Cancel Send A link has been sent to your friend\'s email address. Posted! A link has been posted to your Facebook feed. Join the Nation\'s Conversation To find out more about Facebook commenting please read the Conversation Guidelines and FAQs 5 things you need to know Monday Editors, USA TODAY 4:10 a.m. EST November 21, 2016 Donald Trump speaks in Grand Rapids on Nov. 8, 2016. (Photo: Paul Sancya, AP) Trump to meet with former Texas Gov. Rick Perry \nA lot has changed since Donald Trump competed against Texas Gov. Rick Perry and more than a dozen other rivals for the GOP presidential nomination. On Monday, the president-elect will meet with Perry , who may have a spot in the Cabinet as head 

In [None]:
for token in tester:
    print(token, token.lemma_)

In [None]:
import regex

In [None]:
trial = 'How about \u00a9 for a change'
print(trial)
trial2 = re.sub('\N{COPYRIGHT SIGN}', 'this', trial)
print(trial2)

In [None]:
'How about \u00a9 for a change'.replace('\N{COPYRIGHT SIGN}', 'this')

In [None]:
print('\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}')

In [None]:
import os
scraper_data = os.listdir('./data')
scraped = pd.DataFrame()
for file in scraper_data:
    try:
        df = pd.read_json('./data/{}'.format(file))
        scraped = pd.concat([scraped, df])
    except:
        pass

In [None]:
len(scraped)

In [None]:
scraped.head()

In [None]:
scraped = scraped.drop_duplicates(['id'], keep='last')
len(scraped)

In [None]:
pd.set_option('display.max_colwidth', -1)

In [None]:
scraped.head()

In [None]:
len(df1.dropna())

In [None]:
len(df1.drop('id', axis=1).dropna())

In [None]:
len(df1.drop('domain', axis=1).dropna())

In [None]:
df1 = df1.dropna()

In [None]:
df1['type'].unique()

In [None]:
len(df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')])

In [None]:
df1 = df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')]

In [None]:
df1.groupby('type').size()

In [None]:
df1[df1['type'] == 'political']['domain'].unique()

In [None]:
media_bias = pd.read_csv('data/media_bias.csv')

In [None]:
media_bias[media_bias['Vertical Rank'] >= 40]

In [None]:
import scrapy
import re
from scrapy.crawler import CrawlerProcess

In [None]:
some_data = pd.read_json('data/abc_20181207.json')

In [None]:
some_data.loc[13, 'article']

In [None]:
len(some_data)

In [None]:
"https://cbsnews.com/world".count("/")