In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the credible-source corpus. The file is read without a header row, so the
# four column names are supplied explicitly.
credible = pd.read_csv(
    'corpus/credible.csv',
    header=None,
    names=['id', 'type', 'domain', 'content'],
)

In [None]:
credible['domain'].unique()

In [None]:
# Drop every row whose domain is one of the two excluded feeds.
credible = credible[~credible['domain'].isin(['www.msn.com', 'feed.reuters.com'])]

In [None]:
# Rows of `credible` whose domain is none of the four outlets named below.
others = credible[
    ~credible['domain'].isin([
        'nytimes.com',
        'nationalreview.com',
        'www.reuters.com',
        'weeklystandard.com',
    ])
]

In [None]:
len(others)

In [328]:
import spacy
from spacy_langdetect import LanguageDetector
# Load the spaCy pipeline registered under the name 'en'.
nlp = spacy.load('en')
# Disabled experiment: a custom Sentencizer that would also split on newlines.
#from spacy.pipeline import Sentencizer

#sentencizer = Sentencizer(['.', '?', '!', '\n'])
# Append language detection as the last pipeline step; process() reads its
# result from doc._.language to reject non-English articles.
nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
import re
from unidecode import unidecode

In [3]:
class Cleaner(dict):
    """Multiple-string-substitution dict.

    Keys are literal substrings to look for and values are their
    replacements. All substitutions are applied in a single regex pass, so
    text produced by one replacement is never re-scanned by another.
    """

    def _make_regex(self):
        """Build a regex that matches any key of this dict literally.

        Keys are tried longest first. Regex alternation takes the first
        alternative that matches, so without this ordering a short key would
        shadow any longer key that starts with it (e.g. "a" before "ab").
        The sort is stable, so equal-length keys keep their insertion order.
        """
        keys = sorted(self.keys(), key=len, reverse=True)
        return re.compile("|".join(map(re.escape, keys)))

    def __call__(self, match):
        """Handler invoked for each regex match; returns the replacement for the matched key."""
        return self[match.group(0)]

    def clean(self, text):
        """Substitute every key found in ``text`` with its value and return the result.

        An empty mapping returns ``text`` unchanged: the empty pattern would
        otherwise match at every position and fail on the lookup of "".
        """
        if not self:
            return text
        return self._make_regex().sub(self, text)

In [389]:
# Literal substring -> replacement pairs consumed by Cleaner. Cleaner escapes
# every key before matching, so none of these is treated as a regular expression.
replacements = {
    "\n": " ",  # new line characters
    "\t": " ",  # tabs
    "-": " ",
    "...": " ",
    "won't": "will not",
    "can't": "can not",
    "&": " and ",
    # Written with an explicit double backslash: a bare "\$" is an invalid
    # escape sequence that Python accepts only with a warning. The key is the
    # three characters backslash, dollar sign, asterisk, matched literally.
    # NOTE(review): possibly intended as a regex collapsing repeated dollar
    # signs; as a literal key it only matches that exact sequence - confirm.
    "\\$*": "$",
    "Loading...": " ",
    "Continued...": " ",
    "\N{COPYRIGHT SIGN}": " ",
    "\N{NO-BREAK SPACE}": " ",
    "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
    "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
    # Move sentence-final punctuation from inside a closing quote to outside it.
    '."': '".',
    '?"': '"?',
    '!"': '"!',
}

In [331]:
# Placeholder word substituted for each named-entity label that process() rewrites.
entities = dict(
    PERSON='person',
    FAC='landmark',
    ORG='organization',
    GPE='place',
    LOC='location',
    EVENT='event',
    WORK_OF_ART='artwork',
    LAW='law',
    DATE='date',
    TIME='time',
    PERCENT='percent',
    MONEY='money',
    QUANTITY='quantity',
    CARDINAL='number',
)

# Sort rank per label: process() replaces entities in ascending rank order.
ent_order = dict(
    PERSON=8,
    FAC=2,
    ORG=1,
    GPE=6,
    LOC=7,
    EVENT=3,
    WORK_OF_ART=5,
    LAW=4,
    DATE=9,
    TIME=10,
    PERCENT=12,
    MONEY=11,
    QUANTITY=13,
    CARDINAL=14,
)

# Labels that process() leaves untouched in the text.
drop_ents = ['NORP', 'PRODUCT', 'LANGUAGE', 'ORDINAL']

In [390]:
# Single-pass substitution table built from `replacements`; reformat() calls preprocess.clean(text).
preprocess = Cleaner(replacements)

In [406]:
def _entity_pattern(ent_text):
    """Return a regex that matches ``ent_text`` literally as a standalone token."""
    # The entity text is escaped so characters such as '.', '+', '(' or '$'
    # are matched literally instead of being parsed as regex syntax.
    # A leading boundary is only required when the entity starts with a word
    # character; entities such as "$5" start on punctuation and get none.
    prefix = r'(?<!\w)' if re.match(r'\w', ent_text) else ''
    # (?!\w) rather than \b: \b after a trailing non-word character such as
    # '%' or '.' would demand that a word character follows, so "50%" or
    # "U.S." followed by a space would never match.
    return prefix + re.escape(ent_text) + r'(?!\w)'


def process(in_doc, min_sentences=13):
    """Filter an article down to usable sentences and mask its named entities.

    Args:
        in_doc: Article text, passed to the module-level ``nlp`` pipeline.
        min_sentences: Minimum number of kept sentences required for the
            article to be returned as text.

    Returns:
        ``np.nan`` if the detected language is not ``'en'``; the integer
        number of kept sentences if that number is below ``min_sentences``;
        otherwise the kept sentences joined by spaces, with each entity
        whose label is not in ``drop_ents`` replaced by its placeholder
        from ``entities``.
    """
    doc = nlp(in_doc)
    if doc._.language['language'] != 'en':
        return np.nan

    kept = []
    for sent in doc.sents:
        text = sent.text
        # Skip sentences that lack terminal punctuation, contain a colon, or
        # contain the letters "you" in any case.
        # NOTE(review): the "you" test is a substring match, so it also drops
        # sentences containing words like "young" - confirm this is intended.
        if not re.search('[.?!] *$', text) or ':' in text or re.search(r'(?i)you', text):
            continue
        kept.append(text)

    count = len(kept)
    if count < min_sentences:
        return count
    out_doc = ''.join(text + ' ' for text in kept)

    # Replace entities in ent_order rank order, each distinct (text, label) once.
    ents = sorted(
        (ent for ent in doc.ents if ent.label_ not in drop_ents),
        key=lambda ent: ent_order[ent.label_],
    )
    converted = set()
    for ent in ents:
        key = (ent.text, ent.label_)
        if not ent.text or key in converted:
            continue
        converted.add(key)
        placeholder = entities.get(ent.label_, ent.text)
        # A function replacement keeps backslashes in the placeholder from
        # being interpreted as group references.
        out_doc = re.sub(_entity_pattern(ent.text), lambda _m, repl=placeholder: repl, out_doc)
    return out_doc

In [405]:
re.search(r'(?i)you', 'Your address')

<_sre.SRE_Match object; span=(0, 3), match='You'>

In [334]:
def convert_quotes(qq):
    """Collapse a quoted passage into one to four 'quote ' placeholder tokens.

    The number of tokens grows with the whitespace-separated word count of
    ``qq``: up to 2 words -> 1, up to 12 -> 2, up to 25 -> 3, more -> 4.
    If the second-to-last character of ``qq`` is '.', '?' or '!', that
    character is appended after the placeholders.
    """
    penultimate = qq[-2]
    punct = penultimate if penultimate in ('.', '?', '!') else ''

    words = len(qq.split())
    num = 4  # used when the passage is longer than every limit below
    for tokens, limit in enumerate((2, 12, 25), start=1):
        if words <= limit:
            num = tokens
            break
    return 'quote ' * num + punct

def reformat(article):
    """Normalize a raw article and pass the result to ``process``.

    Args:
        article: Raw article text.

    Returns:
        ``np.nan`` when ``article`` is not a string or its double quotes are
        unbalanced (before or after single quotes are converted); otherwise
        whatever ``process`` returns for the cleaned text.
    """
    if not isinstance(article, str):
        # e.g. a NaN float standing in for missing content; unidecode needs a string.
        return np.nan
    text = unidecode(article)
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return np.nan
    text = preprocess.clean(text)
    text = re.sub(r'^(.{0,50})\(\w+\)', ' ', text) # delete dateline
    text = re.sub(r'\|.*\|', ' ', text) # delete category headers, bylines, etc. between pipe symbols
    text = re.sub(r'\S*@\S+', 'email', text) # replace email address or Twitter handle with "email"
    text = re.sub(r'[-a-zA-Z0-9@:%_\+.~#?&\/=]{2,256}\.[a-z]{2,4}(\/[-a-zA-Z0-9@:%_\+.~#?&\/=]*)?', ' website',
                  text) # URLs
    # Raw string: '\[' and '\(' are not valid escapes in a normal string literal.
    text = re.sub(r'[\[\(][^\[\(]*[\]\)]', '', text) # delete text inside parentheses or brackets
    text = re.sub(r"\b(\w*)n't", lambda m: m.group(1) + ' not', text) # replace "xxn't" contractions with "xx not"; "won't" already handled
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r"^'|'$|(?<= )'|(?<!s)'(?= )", '"', text) # replace single quotes, but not apostrophes, with double quotes
    # Converting single quotes can leave an odd number of double quotes; reject those too.
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return np.nan
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r'(?i)please share this.*', '', text)
    text = re.sub(' +', ' ', text) # reduce all multiple spaces to single spaces
    return process(text)

In [77]:
# Load the political corpus. The file is read without a header row, so column
# names are supplied explicitly; `id` is read as a string.
political = pd.read_csv(
    'corpus/political.csv',
    header=None,
    names=['id', 'type', 'domain', 'content'],
    dtype={'id': str},
)

In [9]:
def show_articles(df, n=10):
    """Lazily print up to ``n`` randomly sampled articles, one per ``next()`` call.

    Args:
        df: DataFrame whose rows are passed to ``helper``.
        n: Maximum number of rows to sample.

    Yields:
        The return value of ``helper`` for each sampled row.
    """
    # Cap the sample size at the frame length: DataFrame.sample raises when
    # asked for more rows than exist (without replacement).
    for ix, row in df.sample(min(len(df), n)).iterrows():
        yield helper(ix, row)
        
def helper(ix, row):
    """Print a row's index and domain, then the reformatted form of its content."""
    header = (ix, ', ', row['domain'])
    print(*header)
    cleaned = reformat(row['content'])
    print(cleaned)

In [None]:
# NOTE(review): `biases` is not assigned in any cell shown here; presumably it
# was created with show_articles(...) in a cell that no longer exists - confirm.
next(biases)

In [15]:
political['domain'].unique()

array(['baptistnews.com', 'nationalreview.com', 'dailycaller.com',
       'rinf.com', 'washingtonexaminer.com', 'breitbart.com',
       'judicialwatch.org', 'guardianlv.com', 'city-journal.org',
       'ecowatch.com', 'weeklystandard.com', 'pjmedia.com',
       'mintpressnews.com', 'freedomworks.org', 'alternet.org',
       'conservativereview.com', 'yellowhammernews.com',
       'filmsforaction.org', 'rawstory.com', 'theintercept.com',
       'commondreams.org', 'dailykos.com', 'thinkprogress.org',
       'thedailybeast.com', 'counterpunch.org', 'observer.com',
       'americannewsx.com', 'americanprogress.org', 'ronpaulinstitute.org',
       'theblaze.com', 'jacobinmag.com', 'foreignpolicyjournal.com',
       'heritage.org', 'countercurrents.org', 'newcoldwar.org',
       'nakedcapitalism.com', 'commentarymagazine.com', 'redstate.com',
       'counterinformation.wordpress.com', 'oann.com',
       'dissentmagazine.org', 'economicnoise.com', 'mrc.org',
       'advocate.com', 'ijr.com',

In [16]:
def show_contents(df):
    """For each distinct domain, print its name and yield a random sample of at most 10 of its rows."""
    for domain in df['domain'].unique():
        print(domain)
        domain_rows = df[df['domain'] == domain]
        sample_size = min(len(domain_rows), 10)
        yield domain_rows.sample(sample_size)

In [17]:
# Generator over `political`, one domain per next() call; it raises StopIteration once every domain has been shown.
pol_domains = show_contents(political)

In [572]:
next(pol_domains)

StopIteration: 

In [571]:
political.loc[793439, 'content']

'“Sanctuary cities” throughout the United States have pledged to be a place of refuge and safe harbor for immigrants regardless of their documentation. This stands in stark contrast to a political party that encourages mass deportations and chants ‘Built The Wall’ that controls the mechanisms of government.\n\nIn a dramatic move Monday, Attorney General Jeff Sessions said jurisdictions must be able to demonstrate they are not sanctuary cities in order to receive any grants from the Justice Department. Jurisdictions unable to prove a negative could lose out on billions.\n\n“This disregard for law must end,” said Sessions.\n\nWhile sanctuary cities don’t actively protect undocumented immigrants, they usually have policies that avoid communication with immigration authorities.\n\n“LAPD has never participated in programs that deputize local law enforcement to act as immigration agents, and on my watch they never will,” said Los Angeles mayor Eric Garcetti.\n\nNew York’s Attorney General Er

In [475]:
# Hand-picked domains; presumably the `political` sources judged credible - confirm.
political_credible = [
    'baptistnews.com',
    'nationalreview.com',
    'mintpressnews.com',
    'theintercept.com',
    'jacobinmag.com',
    'foreignpolicyjournal.com',
    'heritage.org',
]

In [None]:
# Both entries also appear in political_bogus.
political_biased = [
    'dailycaller.com',
    'breitbart.com',
]

In [573]:
# Hand-picked domains; presumably the `political` sources judged unreliable - confirm.
political_bogus = [
    'dailycaller.com', 'breitbart.com', 'weeklystandard.com',
    'pjmedia.com', 'freedomworks.org', 'alternet.org',
    'conservativereview.com', 'commondreams.org', 'dailykos.com',
    'thinkprogress.org', 'counterpunch.org', 'americannewsx.com',
    'ronpaulinstitute.org', 'theblaze.com', 'newcoldwar.org',
    'commentarymagazine.com', 'redstate.com', 'economicnoise.com',
    'mrc.org', 'ijr.com', 'thefifthcolumnnews.com',
]

In [473]:
# Domains retained when `fake` is filtered with isin(fake_keepers).
fake_keepers = [
    'thecommonsenseshow.com', 'rickwells.us', 'viralliberty.com',
    'downtrend.com', 'thelastgreatstand.com', 'yesimright.com',
    'usasupreme.com', 'usadailytime.com', 'freedomdaily.com',
    'uspoln.com', 'usanewsflash.com', 'onepoliticalplaza.com',
    'thefreepatriot.org', 'donaldtrumpnews.co', 'goneleft.com',
    'onlineconservativepress.com', 'redrocktribune.com', 'redcountry.us',
    'learnprogress.org', 'usadosenews.com', 'usafirstinformation.com',
    'enhlive.com', 'flashnewscorner.com',
]

In [1129]:
# Keep only the rows whose domain is listed in fake_keepers.
# NOTE(review): `fake` is not loaded in any cell shown here, and this cell
# overwrites it in place - confirm where it is defined before a fresh run.
fake = fake[fake['domain'].isin(fake_keepers)]

In [1132]:
len(fake)

14743

In [1130]:
# Domains retained when `bias` is filtered with isin(bias_keepers).
bias_keepers = [
    'wnd.com', 'frontpagemag.com', 'americanthinker.com',
    'dailywire.com', 'thegatewaypundit.com', 'antiwar.com',
    'truthrevolt.org', 'patriotpost.us', 'russia-insider.com',
    'paulcraigroberts.org', 'vdare.com', 'off-guardian.org',
    'jamesrgrangerjr.com', 'americablog.com', 'americasfreedomfighters.com',
    'heartland.org', 'palmerreport.com', 'thefederalistpapers.org',
    'conservativetribune.com', 'winningdemocrats.com', '100percentfedup.com',
    'cowgernation.com', 'usherald.com', 'darkpolitricks.com',
    'newslogue.com', 'usapoliticstoday.com', 'counterjihad.com',
    'platosguns.com', 'meanlefthook.com', 'americanpatriotdaily.com',
    'endingthefed.com', 'conservativefiringline.com', 'politicalcult.com',
    'readconservatives.news',
]

In [1131]:
# Keep only the rows whose domain is listed in bias_keepers.
# NOTE(review): `bias` is not loaded in any cell shown here, and this cell
# overwrites it in place - confirm where it is defined before a fresh run.
bias = bias[bias['domain'].isin(bias_keepers)]

In [1133]:
len(bias)

195684

Notes on `fake` domains that are not in `fake_keepers`:
- thebigriddle.com: junk science / conspiracy
- itaglive.com: satire

In [560]:
reformat(political.loc[1530054, 'content'])

5

In [None]:
import os

# Read every file in ./data as JSON and stack the results into one DataFrame.
# Files are visited in sorted name order because os.listdir() returns entries
# in arbitrary order, and the row order decides which duplicate a later
# drop_duplicates(keep='last') retains.
scraper_data = sorted(os.listdir('./data'))
frames = []
skipped = []
for file in scraper_data:
    try:
        df = pd.read_json('./data/{}'.format(file))
    except (ValueError, OSError) as err:
        # Best effort: the directory may hold files that are not JSON. Record
        # them instead of silently discarding every error.
        skipped.append((file, err))
        continue
    frames.append(df)

# Concatenate once; growing a DataFrame inside the loop copies all rows each time.
scraped = pd.concat(frames) if frames else pd.DataFrame()
if skipped:
    print('Skipped {} unreadable file(s): {}'.format(len(skipped), [name for name, _ in skipped]))

In [None]:
len(scraped)

In [None]:
scraped.head()

In [None]:
# Rows sharing an `id` are collapsed to the last occurrence in the current row order.
scraped = scraped.drop_duplicates(subset=['id'], keep='last')
len(scraped)

In [None]:
# Widen the column display limit so long article text is shown in full.
# NOTE(review): newer pandas releases document None, not -1, as the
# "no limit" value for this option - confirm against the installed version.
pd.set_option('display.max_colwidth', -1)

In [None]:
scraped.head()

In [None]:
len(df1.dropna())

In [None]:
len(df1.drop('id', axis=1).dropna())

In [None]:
len(df1.drop('domain', axis=1).dropna())

In [None]:
# Drop every row that has a missing value in any column.
# NOTE(review): `df1` is not assigned in any cell shown here, and this cell
# overwrites it in place - confirm its origin before a fresh run.
df1 = df1.dropna()

In [None]:
df1['type'].unique()

In [None]:
len(df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')])

In [None]:
# Discard rows typed 'unreliable' or 'unknown'.
df1 = df1[~df1['type'].isin(['unreliable', 'unknown'])]

In [None]:
df1.groupby('type').size()

In [None]:
df1[df1['type'] == 'political']['domain'].unique()

In [None]:
# Load data/media_bias.csv; a later cell filters it on the 'Vertical Rank' column.
media_bias = pd.read_csv('data/media_bias.csv')

In [None]:
media_bias[media_bias['Vertical Rank'] >= 40]

In [None]:
import scrapy
import re
from scrapy.crawler import CrawlerProcess

In [None]:
# Load a single JSON file from ./data for spot checks; later cells read its 'article' column.
some_data = pd.read_json('data/abc_20181207.json')

In [None]:
some_data.loc[13, 'article']

In [None]:
len(some_data)

In [None]:
"https://cbsnews.com/world".count("/")