In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
credible = pd.read_csv('corpus/credible.csv', header=None, names=['id', 'type', 'domain', 'content'])

In [None]:
credible['domain'].unique()

In [None]:
credible = credible[(credible['domain'] != 'www.msn.com') & (credible['domain'] != 'feed.reuters.com')]

In [None]:
others = credible[~credible['domain'].isin(['nytimes.com', 'nationalreview.com', 'www.reuters.com', 'weeklystandard.com'])]

In [None]:
len(others)

In [2]:
import spacy
from spacy_langdetect import LanguageDetector
nlp = spacy.load('en')
nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
import re
from unidecode import unidecode

In [3]:
class Cleaner(dict):
    """ Multiple-string-substitution dict """
    def _make_regex(self):
        """ Build re object based on the keys of the dictionary it is instantiated with"""
        return re.compile("|".join(map(re.escape, self.keys(  ))))

    def __call__(self, match):
        """ Handler invoked for each regex match """
        return self[match.group(0)]

    def clean(self, text):
        """ Substitutes with value for each key and returns the modified text. """
        return self._make_regex(  ).sub(self, text)

In [441]:
replacements = {#"\n": " ", # new line characters
                "\t": " ", # tabs
                "-": " ",
                "won't": "will not",
                "can't": "can not",
                "&": " and ",
                "$$": "$",
                "Loading...": " ",
                "Continued...": " ",
                "\N{COPYRIGHT SIGN}": " ",
                "\N{NO-BREAK SPACE}": " ",
                "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
                "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
                '."': '".',
                '?"': '"?',
                '!"': '"!',
                
               }

In [453]:
entities = {'PERSON': 'person',
            'FAC': 'landmark',
            'ORG': 'organization',
            'GPE': 'place',
            'LOC': 'location',
            'EVENT': 'event',
            'WORK_OF_ART': 'artwork',
            'LAW': 'law',
            'DATE': 'date',
            'TIME': 'time',
            'PERCENT': 'percent',
            'MONEY': 'money',
            'QUANTITY': 'quantity',
            'CARDINAL': 'number'
}

ent_order = {'PERSON': 8,
            'FAC': 2,
            'ORG': 1,
            'GPE': 6,
            'LOC': 7,
            'EVENT': 3,
            'WORK_OF_ART': 5,
            'LAW': 4,
            'DATE': 9,
            'TIME': 10,
            'PERCENT': 12,
            'MONEY': 11,
            'QUANTITY': 13,
            'CARDINAL': 14,
}

drop_ents = ['NORP', 'PRODUCT', 'LANGUAGE','ORDINAL']

In [454]:
preprocess = Cleaner(replacements)

In [487]:
def process(in_doc):
    count = 0
    out_doc = ""
    doc = re.sub(r'\n[\n ]*', ' ', in_doc)
    doc = nlp(doc)
    if doc._.language['language'] != 'en':
        return np.nan
    for sent in doc.sents:
        ending = sent[-1]
        if ending.pos_ != 'PUNCT':
            continue
        out_doc += (sent.text + ' ')
        if ending.text in ['.', '?', '!']:
            count += 1
    if count < 13:
        return count
    print(count)
    ents = [ent for ent in doc.ents if ent.label_ not in drop_ents]
    ents = sorted(ents, key=lambda ent: ent_order[ent.label_])
    converted = set([])
    for ent in ents:
        if (ent.text, ent.label_) in converted:
            continue
        converted.add((ent.text, ent.label_))
        pattern = r'\b{}\b'.format(ent.text)
        out_doc = re.sub(pattern, entities.get(ent.label_, ent.text), out_doc)
    return out_doc

In [498]:
def convert_quotes(qq):
    num = 0
    if qq[-2] in ['.', '?', '!']:
        punct = qq[-2]
    else:
        punct = ''
    length = len(qq.split())
    if length <= 4:
        num = 1
    elif length <= 12:
        num = 2
    elif length <= 25:
        num = 3
    else:
        num = 4
    return 'quote ' * num + punct

def reformat(article):
    text = unidecode(article)
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return np.nan
    text = preprocess.clean(text)
    text = re.sub(r'^(.{0,50})\(\w+\)', ' ', text) # delete dateline
    text = re.sub(r'\S*@\S+', 'email', text) # replace email address or Twitter handle with "email"
    text = re.sub(r'[-a-zA-Z0-9@:%_\+.~#?&\/=]{2,256}\.[a-z]{2,4}(\/[-a-zA-Z0-9@:%_\+.~#?&\/=]*)?', ' website',
                  text) # URLs
    text = re.sub('[\[\(][^\[\(]*[\]\)]', '', text) # delete text inside parentheses or brackets
    text = re.sub(r"\b(\w*)n't", lambda m: m.group(1) + ' not', text) # replace "xxn't" contractions with "xx not"; "won't" already handled
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r"^'|'$|(?<= )'|(?<!s)'(?= )", '\1"', text) # replace single quotes, but not apostrophes, with double quotes
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text    
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return np.nan
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r'(?i)please share this.*', '', text)
    text = re.sub(' +', ' ', text) # reduce all multiple spaces to single spaces
    return process(text)

In [None]:
others.sample(10)

In [624]:
fake = pd.read_csv('corpus/fake.csv', header=None, names=['id', 'type', 'domain', 'content'], dtype={'id': str})

In [None]:
fake[fake['domain'] != 'beforeitsnews.com'] .sample(10)

In [9]:
bias = pd.read_csv('corpus/bias.csv', header=None, names=['id', 'type', 'domain', 'content'], dtype={'id': str})

In [10]:
def show_articles(df):
    for ix, row in df.sample(10).iterrows():
        yield helper(ix, row)
        
def helper(ix, row):
    print(ix, ', ', row['domain'])
    print(reformat(row['content']))

In [None]:
biases = show_articles(bias)

In [None]:
next(biases)

In [625]:
fake['domain'].unique()

array(['domain', 'beforeitsnews.com', 'coed.com',
       'conservativefighters.com', 'thecommonsenseshow.com',
       'usatoday.com.co', 'newswithviews.com', 'americannews.com',
       'vigilantcitizen.com', 'therightscoop.com', 'gopthedailydose.com',
       'teaparty.org', 'conservativedailypost.com', 'newslo.com',
       'rickwells.us', 'dcgazette.com', 'weeklyworldnews.com',
       'bighairynews.com', 'theinternetpost.net',
       'stormcloudsgathering.com', 'viralliberty.com', 'downtrend.com',
       'americanoverlook.com', 'aurora-news.us', 'bostonleader.com',
       'thetruthdivision.com', 'clashdaily.com', 'thelastgreatstand.com',
       'usatwentyfour.com', 'dailysurge.com', 'subjectpolitics.com',
       'conservativebyte.com', 'yesimright.com', 'usasupreme.com',
       'usadailytime.com', 'freedomdaily.com', 'uspoln.com',
       'usanewsflash.com', 'onepoliticalplaza.com', 'empirenews.net',
       'threepercenternation.com', 'thebigriddle.com',
       'newsbreakshere.com', 'pr

In [11]:
def show_contents(df):
    for domain in df['domain'].unique():
        print(domain)
        num = min(len(df[df['domain'] == domain]), 10)
        yield df[df['domain'] == domain].sample(num)

In [690]:
fake_domains = show_contents(fake)

In [1121]:
next(fake_domains)

StopIteration: 

In [1120]:
fake.loc[894105, 'content']

'A social media user, Temilola Sobola, shared the photos of the above lady’s lips mysteriously grew in double size after she allegedly used a particular lipstick called ‘Queen collection’.\n\nA girl who is always found of buying different makeup and cosmetics met this mysterious and unwanted dilemma after borrowing these lipstick from a friend.\n\nThese what her lips turns to after applying it,be self conscious.. She warned others ladies to beware Fake Lipsticks.'

In [1066]:
fake_keeper = ['thecommonsenseshow.com', 'rickwells.us', 'viralliberty.com', 'downtrend.com', 'thelastgreatstand.com',
              'yesimright.com', 'usasupreme.com', 'usadailytime.com', 'freedomdaily.com', 'uspoln.com', 'usanewsflash.com',
              'onepoliticalplaza.com', 'thefreepatriot.org', 'donaldtrumpnews.co', 'goneleft.com', 'onlineconservativepress.com',
              'redrocktribune.com', 'redcountry.us', 'learnprogress.org', 'usadosenews.com', 'usafirstinformation.com',
              'enhlive.com', 'flashnewscorner.com']

In [622]:
bias_keepers = ['wnd.com', 'frontpagemag.com', 'americanthinker.com', 'dailywire.com', 'thegatewaypundit.com', 
               'antiwar.com', 'truthrevolt.org', 'patriotpost.us', 'russia-insider.com', 'paulcraigroberts.org',
               'vdare.com', 'off-guardian.org', 'jamesrgrangerjr.com', 'americablog.com', 'americasfreedomfighters.com',
               'heartland.org', 'palmerreport.com', 'thefederalistpapers.org', 'conservativetribune.com',
               'winningdemocrats.com', '100percentfedup.com', 'cowgernation.com', 'usherald.com', 'darkpolitricks.com',
               'newslogue.com', 'usapoliticstoday.com', 'counterjihad.com', 'platosguns.com', 'meanlefthook.com',
               'americanpatriotdaily.com', 'endingthefed.com', 'conservativefiringline.com', 'politicalcult.com',
               'readconservatives.news']

fake['thebigriddle.com'] = junk sci/consp
fake['itaglive.com'] = satire

In [1044]:
reformat(fake.loc[357636, 'content'])

10

In [681]:
bias[bias['domain'] == 'lewrockwell.com'].sample(10)

Unnamed: 0,id,type,domain,content
659906,4427069,bias,lewrockwell.com,"Mervyn King, the Bank of England Governor, sum..."
76228,506738,bias,lewrockwell.com,By Dr. Mercola\n\nWater makes up at least two-...
644899,4379420,bias,lewrockwell.com,Former congressman Ron Paul revealed a list of...
316058,2513818,bias,lewrockwell.com,“‘We didn’t go in with a plan. We went in with...
59137,431534,bias,lewrockwell.com,“Americans have been told that their governmen...
172898,1776242,bias,lewrockwell.com,Ron Paul is a man of faith. His faith shines t...
239503,2120242,bias,lewrockwell.com,Scott Turow is one of my favorite escape – nov...
706863,4677093,bias,lewrockwell.com,The water supply in most American cities conta...
655948,4414979,bias,lewrockwell.com,by Christian Light\n\nRecently by Christian Li...
1108111,7757728,bias,lewrockwell.com,When you toast your other half this Valentine’...


In [450]:
tester = set([('Hillary', 'person'), ('NAACP', 'org'), ('Whitman', 'bridge'), ('Hillary', 'person')])
tester

{('Hillary', 'person'), ('NAACP', 'org'), ('Whitman', 'bridge')}

In [None]:
def silliness(string):
    new_string = re.sub(r"^'|'$|(?<= )'|'(?= )", '"', string)
    print(new_string.count('\N{QUOTATION MARK}'))
    if new_string.count('\N{QUOTATION MARK}') % 2 != 0:
        return np.nan
    return 'WTF'

In [None]:
tester = nlp('Victims seek to resume Marcos from Heroes\' Cemetery')

In [None]:
for token in tester:
    print(token.pos_)

In [None]:
import regex

In [None]:
trial = 'How about \u00a9 for a change'
print(trial)
trial2 = re.sub('\N{COPYRIGHT SIGN}', 'this', trial)
print(trial2)

In [None]:
'How about \u00a9 for a change'.replace('\N{COPYRIGHT SIGN}', 'this')

In [None]:
print('\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}')

In [None]:
import os
scraper_data = os.listdir('./data')
scraped = pd.DataFrame()
for file in scraper_data:
    try:
        df = pd.read_json('./data/{}'.format(file))
        scraped = pd.concat([scraped, df])
    except:
        pass

In [None]:
len(scraped)

In [None]:
scraped.head()

In [None]:
scraped = scraped.drop_duplicates(['id'], keep='last')
len(scraped)

In [None]:
pd.set_option('display.max_colwidth', -1)

In [None]:
scraped.head()

In [None]:
len(df1.dropna())

In [None]:
len(df1.drop('id', axis=1).dropna())

In [None]:
len(df1.drop('domain', axis=1).dropna())

In [None]:
df1 = df1.dropna()

In [None]:
df1['type'].unique()

In [None]:
len(df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')])

In [None]:
df1 = df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')]

In [None]:
df1.groupby('type').size()

In [None]:
df1[df1['type'] == 'political']['domain'].unique()

In [None]:
media_bias = pd.read_csv('data/media_bias.csv')

In [None]:
media_bias[media_bias['Vertical Rank'] >= 40]

In [None]:
import scrapy
import re
from scrapy.crawler import CrawlerProcess

In [None]:
some_data = pd.read_json('data/abc_20181207.json')

In [None]:
some_data.loc[13, 'article']

In [None]:
len(some_data)

In [None]:
"https://cbsnews.com/world".count("/")