In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

In [59]:
credible = pd.read_csv('corpus/credible.csv', usecols=['id', 'type', 'domain', 'content'])

In [60]:
credible.head()

Unnamed: 0,id,type,domain,content
0,160,political,baptistnews.com,It is rather hard to write with eclipse glasse...
1,161,political,baptistnews.com,"As our nation celebrated Labor Day, giving att..."
2,185,political,baptistnews.com,We hear quite a bit about survivors’ guilt the...
3,187,political,baptistnews.com,The study of the Hebrew language did not come ...
4,188,political,baptistnews.com,Social media is blowing up about a culture of ...


In [13]:
credible['domain'].unique()

array(['domain', 'baptistnews.com', 'nationalreview.com',
       'weeklystandard.com', 'theintercept.com',
       'foreignpolicyjournal.com', 'www.latimes.com', 'www.cbsnews.com',
       'www.nytimes.com', 'in.reuters.com', 'af.reuters.com',
       'abcnews.go.com', 'uk.reuters.com', 'www.usatoday.com',
       'www.npr.org', 'www.wsj.com', 'washpost.bloomberg.com',
       'www.theatlantic.com', 'www.washingtonpost.com',
       'www.theguardian.com', 'www.reuters.com', 'www.bloomberg.com',
       'feeds.reuters.com', 'www.politico.com', 'www.buzzfeed.com',
       'www.nbcnews.com', 'online.wsj.com', 'ca.reuters.com', 'nytimes.com'], dtype=object)

In [2]:
import spacy
from spacy_langdetect import LanguageDetector
nlp = spacy.load('en')
#from spacy.pipeline import Sentencizer

#sentencizer = Sentencizer(['.', '?', '!', '\n'])
nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
import re
from unidecode import unidecode

In [3]:
class Cleaner(dict):
    """ Multiple-string-substitution dict """
    def _make_regex(self):
        """ Build re object based on the keys of the dictionary it is instantiated with"""
        return re.compile("|".join(map(re.escape, self.keys())))

    def __call__(self, match):
        """ Handler invoked for each regex match """
        return self[match.group(0)]

    def clean(self, text):
        """ Substitutes with value for each key and returns the modified text. """
        return self._make_regex(  ).sub(self, text)

In [4]:
replacements = {"\n": " ", # new line characters
                "\t": " ", # tabs
                "-": " ",
                "...": " ",
                "won't": "will not",
                "can't": "can not",
                "&": " and ",
                "\$*": "$",
                "Loading...": " ",
                "Continued...": " ",
                "\N{COPYRIGHT SIGN}": " ",
                "\N{NO-BREAK SPACE}": " ",
                "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
                "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
                '."': '".',
                '?"': '"?',
                '!"': '"!'
               }

In [5]:
entities = {'PERSON': 'person',
            'FAC': 'landmark',
            'ORG': 'organization',
            'GPE': 'place',
            'LOC': 'location',
            'EVENT': 'event',
            'WORK_OF_ART': 'artwork',
            'LAW': 'law',
            'DATE': 'date',
            'TIME': 'time',
            'PERCENT': 'percent',
            'MONEY': 'money',
            'QUANTITY': 'quantity',
            'CARDINAL': 'number'
}

ent_order = {'PERSON': 8,
            'FAC': 2,
            'ORG': 1,
            'GPE': 6,
            'LOC': 7,
            'EVENT': 3,
            'WORK_OF_ART': 5,
            'LAW': 4,
            'DATE': 9,
            'TIME': 10,
            'PERCENT': 12,
            'MONEY': 11,
            'QUANTITY': 13,
            'CARDINAL': 14,
}

drop_ents = ['NORP', 'PRODUCT', 'LANGUAGE','ORDINAL']

In [6]:
preprocess = Cleaner(replacements)

In [20]:
def process(in_doc):
    count = 0
    out_doc = ""
    doc = nlp(in_doc)
    if doc._.language['language'] != 'en':
        return None
    colon_count = 0
    for sent in doc.sents:
        text = sent.text
        if not re.search('[.?!] *$', text) or re.search(r'(?i)you', text): # direct appeal to reader or not a sentence
            continue
        if ':' in text:
            colon_count += 1
        out_doc += sent.text + ' '
        count += 1
    if count < 13 or colon_count > 6: # too short for training or likely contains many unquoted quotations
        return None
    ents = list(set([ent for ent in doc.ents if ent.label_ not in drop_ents]))
    ents = sorted(ents, key=lambda ent: ent_order[ent.label_])
    for ent in ents:
        if ent.text[0] == '$':
            pattern = r'\{}\b'.format(ent.text) # match money strings, not first word
        else:
            pattern = r'\b{}\b'.format(ent.text) # only match pattern as a word, not part of a word
        out_doc = re.sub(pattern, entities.get(ent.label_, ent.text), out_doc)
    ents2 = set([ent for ent in nlp(out_doc).ents if ent.label_ == 'PERSON'])
    for ent in ents2:
        pattern = r'\b{}\b'.format(ent.text) 
        out_doc = re.sub(pattern, 'person', out_doc)
    return out_doc

In [21]:
def convert_quotes(qq):
    num = 0
    if qq[-2] in ['.', '?', '!']:
        punct = qq[-2]
    else:
        punct = ''
    length = len(qq.split())
    if length <= 2:
        num = 1
    elif length <= 12:
        num = 2
    elif length <= 25:
        num = 3
    else:
        num = 4
    return 'quote ' * num + punct

def reformat(article):
    if not article:
        return None
    text = unidecode(article)
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return None
    text = preprocess.clean(text)
    text = re.sub(r'^(.{0,50})\(\w+\)', ' ', text) # delete dateline
    text = re.sub(r'\|.*\|', ' ', text) # delete category headers, bylines, etc. between pipe symbols
    text = re.sub(r'\S*@\S+', 'email', text) # replace email address or Twitter handle with "email"
    text = re.sub(r'[-a-zA-Z0-9@:%_\+.~#?&\/=]{2,256}\.[a-z]{2,4}(\/[-a-zA-Z0-9@:%_\+.~#?&\/=]*)?', ' website',
                  text) # URLs
    text = re.sub('[\[\(][^\[\(]*[\]\)]', '', text) # delete text inside parentheses or brackets
    text = re.sub(r"\b(\w*)n't", lambda m: m.group(1) + ' not', text) # replace remaining "xxn't" contractions with "xx not"
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r"^'|'$|(?<= )'|(?<!s)'(?= )", '"', text) # replace single quotes, but not apostrophes, with double quotes
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return None
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r'(?i)please share this.*', '', text)
    text = re.sub(' +', ' ', text) # reduce all multiple spaces to single spaces
    try:
        output = process(text)
    except:
        output = None
    return output

In [9]:
def show_articles(df):
    for ix, row in df.sample(10).iterrows():
        yield helper(ix, row)
        
def helper(ix, row):
    print(ix, ', ', row['domain'])
    print(reformat(row['content']))

In [10]:
def show_contents(df):
    for domain in df['domain'].unique():
        print(domain)
        num = min(len(df[df['domain'] == domain]), 10)
        yield df[df['domain'] == domain].sample(num)

In [158]:
hate_domains = show_contents(hate)

In [188]:
next(hate_domains)

returnofkings.com


Unnamed: 0,id,type,domain,content
75169,7630212,hate,returnofkings.com,The show has not had George R.R. Martin’s book...
48619,2290820,hate,returnofkings.com,The Book of Mormon came from Joseph Smith’s tr...
56852,3936343,hate,returnofkings.com,"Growing up in middle-class white America, I wa..."
74407,7607546,hate,returnofkings.com,Western Canada is Disneyland for black men who...
75333,7630939,hate,returnofkings.com,"Next Tuesday is the American election, and, fo..."
48465,2290641,hate,returnofkings.com,The Swedish government has been through a toug...
58857,4054755,hate,returnofkings.com,"From Britain to the United States and beyond, ..."
58406,4035548,hate,returnofkings.com,I created this page for those who want to leav...
57525,3970645,hate,returnofkings.com,"As many of you may know, Roosh V’s successful ..."
48595,2290784,hate,returnofkings.com,The next Q&A video answers questions that were...


In [None]:
# use returnofkings.com, id 56852

In [193]:
hate.loc[56852, 'content']

'Growing up in middle-class white America, I was constantly told, “Don’t judge a book by its cover!” This wasn’t merely a feminist rallying cry; it was a lesson in how to view all sects of society—blacks, nerds, jocks, fat people, rednecks, thugs, et cetera. Of course, we all learned to parrot this doctrine without questioning it, even if we rarely actually followed it. It made sense to us—there’s so much more to a book than the three words written on its cover. To judge a book before reading it would be prejudice, and there is no worse sin for a middle-class white child than to judge another person before truly getting to know them and their circumstances.\n\nAssuming blue pill dogma is true, this is at best a false analogy. Covers are designed to market a book. The title and pictures inform you what kind of book it is, and there will likely be a synopsis and genre label on the back. Publishing companies totally want you to judge the book by its cover, hence why they put so much effor

In [194]:
reformat(hate.loc[56852, 'content'])

"Growing up in middle class white place, I was constantly told, quote quote ! This was not merely a feminist rallying cry; it was a lesson in how to view all sects of society blacks, nerds, jocks, fat people, rednecks, thugs, et cetera. Of course, we all learned to parrot this doctrine without questioning it, even if we rarely actually followed it. It made sense to us there's so much more to a book than the number words written on its cover. To judge a book before reading it would be prejudice, and there is no worse sin for a middle class white child than to judge another person before truly getting to know them and their circumstances. Assuming blue pill dogma is true, this is at best a false analogy. Covers are designed to market a book. If I want to read some bondage erotica, I'm not going to pick a book with a spaceship on the front. However, blue pill dogma is heresy. Ubiquitous and mandatory, but heresy nonetheless. That's just not appropriate. At this point the casual reader wil

In [24]:
reader = pd.read_csv('corpus/hate.csv', chunksize=5000)
cleaned = []
for chunk in reader:
    for row in chunk.itertuples():
        if row.domain in hate_keepers:
            article = reformat(row.content)
            if article:
                cleaned.append((row.id, row.domain, article))
hate_cleaned = pd.DataFrame(cleaned, columns=['id', 'domain', 'content'])

In [26]:
len(hate_cleaned)

13700

In [28]:
hate_cleaned.to_csv('cleaned/hate_cleaned.csv')

In [16]:
hate_keepers = ['barenakedislam.com', 'barnesreview.org', 'ihr.org', 'drrichswier.com', 'davidduke.com',
               'returnofkings.com', 'nationalvanguard.org', 'themuslimissue.wordpress.com', 'darkmoon.me',
               'glaringhypocrisy.com', 'truthfeed.com']

In [None]:
political_credible = ['baptistnews.com', 'nationalreview.com', 'mintpressnews.com', 'theintercept.com', 'jacobinmag.com',
                     'foreignpolicyjournal.com', 'heritage.org', ]

In [None]:
political_bogus = ['dailycaller.com', 'breitbart.com', 'weeklystandard.com', 'pjmedia.com', 'freedomworks.org',
                  'alternet.org', 'conservativereview.com', 'commondreams.org', 'dailykos.com', 'thinkprogress.org',
                  'counterpunch.org', 'americannewsx.com', 'ronpaulinstitute.org', 'theblaze.com', 'newcoldwar.org',
                  'commentarymagazine.com', 'redstate.com', 'economicnoise.com', 'mrc.org', 'ijr.com', 'thefifthcolumnnews.com'
                  ]

In [None]:
bogus_pol = political[political['domain'].isin(political_bogus)]
len(bogus_pol)

In [None]:
cred_pol = political[political['domain'].isin(political_credible)]
len(cred_pol)

In [None]:
fake_keepers = ['thecommonsenseshow.com', 'rickwells.us', 'viralliberty.com', 'downtrend.com', 'thelastgreatstand.com',
              'yesimright.com', 'usasupreme.com', 'usadailytime.com', 'freedomdaily.com', 'uspoln.com', 'usanewsflash.com',
              'onepoliticalplaza.com', 'thefreepatriot.org', 'donaldtrumpnews.co', 'goneleft.com', 'onlineconservativepress.com',
              'redrocktribune.com', 'redcountry.us', 'learnprogress.org', 'usadosenews.com', 'usafirstinformation.com',
              'enhlive.com', 'flashnewscorner.com']

In [None]:
fake = fake[fake['domain'].isin(fake_keepers)]

In [None]:
len(fake)

In [None]:
bias_keepers = ['wnd.com', 'frontpagemag.com', 'americanthinker.com', 'dailywire.com', 'thegatewaypundit.com', 
               'antiwar.com', 'truthrevolt.org', 'patriotpost.us', 'russia-insider.com', 'paulcraigroberts.org',
               'vdare.com', 'off-guardian.org', 'jamesrgrangerjr.com', 'americablog.com', 'americasfreedomfighters.com',
               'heartland.org', 'palmerreport.com', 'thefederalistpapers.org', 'conservativetribune.com',
               'winningdemocrats.com', '100percentfedup.com', 'cowgernation.com', 'usherald.com', 'darkpolitricks.com',
               'newslogue.com', 'usapoliticstoday.com', 'counterjihad.com', 'platosguns.com', 'meanlefthook.com',
               'americanpatriotdaily.com', 'endingthefed.com', 'conservativefiringline.com', 'politicalcult.com',
               'readconservatives.news']

In [None]:
bias = bias[bias['domain'].isin(bias_keepers)]

In [155]:
hate.head()

Unnamed: 0,id,type,domain,content
1,6,hate,barenakedislam.com,"Unfortunately, he hasn’t yet attacked her for ..."
2,7,hate,barenakedislam.com,The Los Angeles Police Department has been den...
3,8,hate,barenakedislam.com,The White House has decided to quietly withdra...
4,9,hate,barenakedislam.com,“The time has come to cut off the tongues of t...
5,10,hate,barenakedislam.com,The Central American nation and six other stat...


fake['thebigriddle.com'] = junk sci/consp
fake['itaglive.com'] = satire

In [None]:
hate.loc[3, 'content']

In [137]:
reformat(hate.loc[53490, 'content'])

" place modification has drastically changed skies and weather systems across the globe, especially in date. Go anywhere in the world and it's quickly apparent that the same heinous crimes are being committed, and no matter which terms are used chemtrails, geoengineering, solar radiation management , stratospheric aerosol injection , and so on the results are person inspired skies, weird weather and lacklustre light. The skies are streaked, smeared, filled with bizarre cloud formations, white, grey or, at best, a chalky blue. Sunshine is greatly diminished and when it shines it's extremely rare for the earth to be bathed in warm, yellow light. time, a gift for photographers, has all but disappeared. While Sean and I have witnessed countless freakish skies and so called weather anomalies over date, here's number recent example that illustrates how geoengineering is very much the cause of quote . date, we had a very strange shift in weather from very hot to temperate. For the vast majori

In [None]:
import os
scraper_data = os.listdir('./data')
scraped = pd.DataFrame()
for file in scraper_data:
    try:
        df = pd.read_json('./data/{}'.format(file))
        scraped = pd.concat([scraped, df])
    except:
        pass

In [None]:
len(scraped)

In [None]:
scraped.head()

In [None]:
scraped = scraped.drop_duplicates(['id'], keep='last')
len(scraped)

In [None]:
pd.set_option('display.max_colwidth', -1)

In [None]:
scraped.head()

In [None]:
len(df1.dropna())

In [None]:
len(df1.drop('id', axis=1).dropna())

In [None]:
len(df1.drop('domain', axis=1).dropna())

In [None]:
df1 = df1.dropna()

In [None]:
df1['type'].unique()

In [None]:
len(df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')])

In [None]:
df1 = df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')]

In [None]:
df1.groupby('type').size()

In [None]:
df1[df1['type'] == 'political']['domain'].unique()

In [None]:
media_bias = pd.read_csv('data/media_bias.csv')

In [151]:
tester = pd.DataFrame(test_data, columns=['first', 'second', 'third'])

In [150]:
test_data = [(1, 2, 3), (3,2,1), (4,6,10)]

In [152]:
tester

Unnamed: 0,first,second,third
0,1,2,3
1,3,2,1
2,4,6,10


In [153]:
tester.append(tester)

Unnamed: 0,first,second,third
0,1,2,3
1,3,2,1
2,4,6,10
0,1,2,3
1,3,2,1
2,4,6,10


In [154]:
tester

Unnamed: 0,first,second,third
0,1,2,3
1,3,2,1
2,4,6,10
