In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
credible = pd.read_csv('corpus/credible.csv', header=None, names=['id', 'type', 'domain', 'content'])

In [None]:
credible['domain'].unique()

In [None]:
credible = credible[(credible['domain'] != 'www.msn.com') & (credible['domain'] != 'feed.reuters.com')]

In [None]:
others = credible[~credible['domain'].isin(['nytimes.com', 'nationalreview.com', 'www.reuters.com', 'weeklystandard.com'])]

In [None]:
len(others)

In [2]:
import spacy
from spacy_langdetect import LanguageDetector
nlp = spacy.load('en')
nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
import re
from unidecode import unidecode

In [3]:
class Cleaner(dict):
    """ Multiple-string-substitution dict """
    def _make_regex(self):
        """ Build re object based on the keys of the dictionary it is instantiated with"""
        return re.compile("|".join(map(re.escape, self.keys(  ))))

    def __call__(self, match):
        """ Handler invoked for each regex match """
        return self[match.group(0)]

    def clean(self, text):
        """ Substitutes with value for each key and returns the modified text. """
        return self._make_regex(  ).sub(self, text)

In [441]:
replacements = {#"\n": " ", # new line characters
                "\t": " ", # tabs
                "-": " ",
                "won't": "will not",
                "can't": "can not",
                "&": " and ",
                "$$": "$",
                "Loading...": " ",
                "Continued...": " ",
                "\N{COPYRIGHT SIGN}": " ",
                "\N{NO-BREAK SPACE}": " ",
                "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
                "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
                '."': '".',
                '?"': '"?',
                '!"': '"!',
                
               }

In [453]:
entities = {'PERSON': 'person',
            'FAC': 'landmark',
            'ORG': 'organization',
            'GPE': 'place',
            'LOC': 'location',
            'EVENT': 'event',
            'WORK_OF_ART': 'artwork',
            'LAW': 'law',
            'DATE': 'date',
            'TIME': 'time',
            'PERCENT': 'percent',
            'MONEY': 'money',
            'QUANTITY': 'quantity',
            'CARDINAL': 'number'
}

ent_order = {'PERSON': 8,
            'FAC': 2,
            'ORG': 1,
            'GPE': 6,
            'LOC': 7,
            'EVENT': 3,
            'WORK_OF_ART': 5,
            'LAW': 4,
            'DATE': 9,
            'TIME': 10,
            'PERCENT': 12,
            'MONEY': 11,
            'QUANTITY': 13,
            'CARDINAL': 14,
}

drop_ents = ['NORP', 'PRODUCT', 'LANGUAGE','ORDINAL']

In [454]:
preprocess = Cleaner(replacements)

In [468]:
def process(in_doc):
    count = 0
    out_doc = ""
    doc = nlp(in_doc)
    if doc._.language['language'] != 'en':
        return np.nan
    for sent in doc.sents:
        ending = sent[-1]
        if ending.pos_ != 'PUNCT':
            continue
        out_doc += (sent.text + ' ')
        if ending.text in ['.', '?', '!']:
            count += 1
    if count < 13:
        return count
    print(count)
    ents = [ent for ent in doc.ents if ent.label_ not in drop_ents]
    ents = sorted(ents, key=lambda ent: ent_order[ent.label_])
    converted = set([])
    for ent in ents:
        if (ent.text, ent.label_) in converted:
            continue
        converted.add((ent.text, ent.label_))
        pattern = r'\b{}\b'.format(ent.text)
        out_doc = re.sub(pattern, entities.get(ent.label_, ent.text), out_doc)
    return out_doc

In [456]:
def convert_quotes(qq):
    num = 0
    length = len(qq.split())
    if length <= 4:
        num = 1
    elif length <= 12:
        num = 2
    elif length <= 25:
        num = 3
    else:
        num = 4
    return 'quote ' * num + '.'

def reformat(article):
    text = unidecode(article)
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return np.nan
    text = preprocess.clean(text)
    text = re.sub(r'^(.{0,50})\(\w+\)', ' ', text) # delete dateline
    text = re.sub(r'\S*@\S+', 'email', text) # replace email address or Twitter handle with "email"
    text = re.sub(r' [-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?', ' website',
                  text) # URLs
    text = re.sub('[\[\(][^\[\(]*[\]\)]', '', text) # delete text inside parentheses or brackets
    text = re.sub(r"\b(\w*)n't", lambda m: m.group(1) + ' not', text) # replace "xxn't" contractions with "xx not"; "won't" already handled
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r"^'|'$|(?<= )'|(?<!s)'(?= )", '\1"', text) # replace single quotes, but not apostrophes, with double quotes
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text    
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return np.nan
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r'(?i)please share this.*', '', text)
    text = re.sub(' +', ' ', text) # reduce all multiple spaces to single spaces
    return process(text)

In [None]:
others.sample(10)

In [None]:
fake = pd.read_csv('corpus/fake.csv', header=None, names=['id', 'type', 'domain', 'content'], dtype={'id': str})

In [None]:
fake[fake['domain'] != 'beforeitsnews.com'] .sample(10)

In [9]:
bias = pd.read_csv('corpus/bias.csv', header=None, names=['id', 'type', 'domain', 'content'], dtype={'id': str})

In [10]:
def show_articles(df):
    for ix, row in df.sample(10).iterrows():
        yield helper(ix, row)
        
def helper(ix, row):
    print(ix, ', ', row['domain'])
    print(reformat(row['content']))

In [None]:
biases = show_articles(bias)

In [None]:
next(biases)

In [48]:
bias['domain'].unique()

array(['domain', 'wnd.com', 'lifenews.com', 'dailysignal.com',
       'charismanews.com', 'frontpagemag.com', 'americanthinker.com',
       'wearechange.org', 'dailywire.com', 'lewrockwell.com',
       'thegatewaypundit.com', 'antiwar.com', 'truthrevolt.org',
       'patriotpost.us', 'journal-neo.org', 'russia-insider.com',
       'unz.com', 'paulcraigroberts.org', 'vdare.com', 'off-guardian.org',
       'veteranstoday.com', 'sputniknews.com', 'presstv.com',
       'rightwingnews.com', 'jamesrgrangerjr.com', 'pravdareport.com',
       'washingtonsblog.com', 'truepundit.com', 'americanlookout.com',
       'investmentwatchblog.com', 'aheadoftheherd.com',
       'conservativehq.com', 'moonofalabama.org', 'intrepidreport.com',
       'americablog.com', 'projectveritas.com',
       'americasfreedomfighters.com', 'orientalreview.org',
       'thenewamerican.com', 'qpolitical.com', 'katehon.com',
       'truthandaction.org', 'gulagbound.com', 'heartland.org',
       'oathkeepers.org', 'oftwom

In [11]:
def show_contents(df):
    for domain in df['domain'].unique():
        print(domain)
        num = min(len(df[df['domain'] == domain]), 10)
        yield df[df['domain'] == domain].sample(num)

In [103]:
bias_domains = show_contents(bias)

In [435]:
next(bias_domains)

meanlefthook.com


Unnamed: 0,id,type,domain,content
114545,722798,bias,meanlefthook.com,"Ah, Jan Brewer, how we despise you. And now yo..."
1108046,7757409,bias,meanlefthook.com,395 SHARES Facebook Twitter Reddit Stumbleupon...
1108284,7758309,bias,meanlefthook.com,545 SHARES Facebook Twitter Reddit Stumbleupon...
1108103,7757708,bias,meanlefthook.com,Creationist extraordinaire Ken Ham is at it ag...
114584,722933,bias,meanlefthook.com,274 SHARES Facebook Twitter Reddit Stumbleupon...
1108133,7757773,bias,meanlefthook.com,1k SHARES Facebook Twitter Reddit Stumbleupon ...
524750,3569071,bias,meanlefthook.com,2.5k SHARES Facebook Twitter Reddit Stumbleupo...
1107947,7757024,bias,meanlefthook.com,David Letterman said what we are all feeling a...
1108055,7757441,bias,meanlefthook.com,"On September 16, 2016, Terence Crutcher’s SUV ..."
1107954,7757052,bias,meanlefthook.com,The old saying goes like this: Everything's bi...


In [464]:
bias.loc[1108103, 'content']

'Creationist extraordinaire Ken Ham is at it again. He claimed that atheists have no business using words like “good” and “bad” and “right” and “wrong” because we don’t believe in God.\n\nThis goes back to the old and wrong Christian belief that atheists can’t have morals without God or the Bible. He calls on Christians to judge and call out atheists who use these terms.\n\nThe idea that atheists can’t have morals is a complete myth. This seems to be a Christian thing, but many religious people feel this way.\n\nThere is no logical connection with this argument. Saying that there is no point in being moral without a God is not really a valid argument for Christianity. This is, basically, saying that we can’t be moral without someone watching over us.\n\nKen Ham is probably best known for opening and running the Ark Encounter “museum.” It is, supposedly, a replica of Noah’s Ark. It has all kinds of Creationist exhibits inside that completely smack down scientific fact. They have things 

In [399]:
bias_keepers = ['wnd.com', 'frontpagemag.com', 'americanthinker.com', 'dailywire.com', 'thegatewaypundit.com', 
               'antiwar.com', 'truthrevolt.org', 'patriotpost.us', 'russia-insider.com', 'paulcraigroberts.org',
               'vdare.com', 'off-guardian.org', 'jamesrgrangerjr.com', 'americablog.com', 'americasfreedomfighters.com',
               'heartland.org', 'palmerreport.com', 'thefederalistpapers.org', 'conservativetribune.com',
               'winningdemocrats.com', '100percentfedup.com', 'cowgernation.com', 'usherald.com', 'darkpolitricks.com',
               'newslogue.com', 'usapoliticstoday.com', 'counterjihad.com', 'platosguns.com']

In [467]:
reformat(bias.loc[1108103, 'content'])

12


'Creationist extraordinaire person is at it again. This goes back to the old and wrong Christian belief that atheists can not have morals without person or the artwork. The idea that atheists can not have morals is a complete myth. There is no logical connection with this argument. Saying that there is no point in being moral without a person is not really a valid argument for organization. person is probably best known for opening and running the organization quote .. It has all kinds of Creationist exhibits inside that completely smack down scientific fact. Recently, he tried to use his organization to display a giant middle finger to the organization website lighting it up with rainbow lights. Yep, he tried to quote . with rainbow lights. Christians have this strange idea that the organization community has stolen the rainbow from them. This idiot just needs to go away. person image via person. '

In [420]:
for ent in nlp(bias.loc[265413, 'content']).ents:
    print(ent.text, '|', ent.label_)



 | DATE


Germany | PERSON
Berlin | GPE
Turkey | GPE
NATO | ORG
Turkish | NORP
Soner Polat | PERSON
Sputnik | PERSON
Earlier this week | DATE
Turkish | NORP
Peter Steudtner | PERSON
German | NORP
six | CARDINAL
Amnesty International’s | ORG
Turkey | GPE
Idil Eser | PERSON
Turkish | NORP
July 5 | DATE
German | NORP
Sigmar Gabriel | PERSON
Berlin | GPE
Ankara | GPE
Turkey | GPE
Germany | GPE
Turkey Spiral | ORG
Crisis https://t.co/qDS58JclNa — Janet Orendorff | FAC
July 22, 2017 | DATE
Turkish | NORP
thousands | CARDINAL
July 15, 2016 | DATE
Islamic | NORP
Fethullah Gulen | PERSON
Ankara | GPE
Germany | GPE
Turkey | GPE
MFA | ORG
— Srbija Evropa | PERSON
July 20, 2017

 | DATE
Sputnik Turkey | PERSON
Retired Rear Admiral | PERSON
the Turkish Armed Forces Soner Polat | ORG
Germany | GPE
Turkey | GPE
NATO | ORG
Turkey | GPE
Eurasia | GPE
Turkey | GPE
NATO | ORG
1952 | DATE
Germany | GPE
Turkey | GPE
NATO | ORG
Polat | PERSON
Turkey | GPE
Turkey | GPE
July 15 last year | DATE
Turkey | GPE

In [450]:
tester = set([('Hillary', 'person'), ('NAACP', 'org'), ('Whitman', 'bridge'), ('Hillary', 'person')])
tester

{('Hillary', 'person'), ('NAACP', 'org'), ('Whitman', 'bridge')}

In [None]:
def silliness(string):
    new_string = re.sub(r"^'|'$|(?<= )'|'(?= )", '"', string)
    print(new_string.count('\N{QUOTATION MARK}'))
    if new_string.count('\N{QUOTATION MARK}') % 2 != 0:
        return np.nan
    return 'WTF'

In [None]:
tester = nlp('Victims seek to resume Marcos from Heroes\' Cemetery')

In [None]:
for token in tester:
    print(token.pos_)

In [None]:
import regex

In [None]:
trial = 'How about \u00a9 for a change'
print(trial)
trial2 = re.sub('\N{COPYRIGHT SIGN}', 'this', trial)
print(trial2)

In [None]:
'How about \u00a9 for a change'.replace('\N{COPYRIGHT SIGN}', 'this')

In [None]:
print('\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}')

In [None]:
import os
scraper_data = os.listdir('./data')
scraped = pd.DataFrame()
for file in scraper_data:
    try:
        df = pd.read_json('./data/{}'.format(file))
        scraped = pd.concat([scraped, df])
    except:
        pass

In [None]:
len(scraped)

In [None]:
scraped.head()

In [None]:
scraped = scraped.drop_duplicates(['id'], keep='last')
len(scraped)

In [None]:
pd.set_option('display.max_colwidth', -1)

In [None]:
scraped.head()

In [None]:
len(df1.dropna())

In [None]:
len(df1.drop('id', axis=1).dropna())

In [None]:
len(df1.drop('domain', axis=1).dropna())

In [None]:
df1 = df1.dropna()

In [None]:
df1['type'].unique()

In [None]:
len(df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')])

In [None]:
df1 = df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')]

In [None]:
df1.groupby('type').size()

In [None]:
df1[df1['type'] == 'political']['domain'].unique()

In [None]:
media_bias = pd.read_csv('data/media_bias.csv')

In [None]:
media_bias[media_bias['Vertical Rank'] >= 40]

In [None]:
import scrapy
import re
from scrapy.crawler import CrawlerProcess

In [None]:
some_data = pd.read_json('data/abc_20181207.json')

In [None]:
some_data.loc[13, 'article']

In [None]:
len(some_data)

In [None]:
"https://cbsnews.com/world".count("/")