In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

  'Matplotlib is building the font cache using fc-list. '


In [59]:
credible = pd.read_csv('corpus/credible.csv', usecols=['id', 'type', 'domain', 'content'])

In [60]:
credible.head()

Unnamed: 0,id,type,domain,content
0,160,political,baptistnews.com,It is rather hard to write with eclipse glasse...
1,161,political,baptistnews.com,"As our nation celebrated Labor Day, giving att..."
2,185,political,baptistnews.com,We hear quite a bit about survivors’ guilt the...
3,187,political,baptistnews.com,The study of the Hebrew language did not come ...
4,188,political,baptistnews.com,Social media is blowing up about a culture of ...


In [13]:
credible['domain'].unique()

array(['domain', 'baptistnews.com', 'nationalreview.com',
       'weeklystandard.com', 'theintercept.com',
       'foreignpolicyjournal.com', 'www.latimes.com', 'www.cbsnews.com',
       'www.nytimes.com', 'in.reuters.com', 'af.reuters.com',
       'abcnews.go.com', 'uk.reuters.com', 'www.usatoday.com',
       'www.npr.org', 'www.wsj.com', 'washpost.bloomberg.com',
       'www.theatlantic.com', 'www.washingtonpost.com',
       'www.theguardian.com', 'www.reuters.com', 'www.bloomberg.com',
       'feeds.reuters.com', 'www.politico.com', 'www.buzzfeed.com',
       'www.nbcnews.com', 'online.wsj.com', 'ca.reuters.com', 'nytimes.com'], dtype=object)

In [2]:
import spacy
from spacy_langdetect import LanguageDetector
nlp = spacy.load('en')

nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
import re
from unidecode import unidecode

In [3]:
class Cleaner(dict):
    """Multiple-string-substitution dict.

    Keys are literal substrings to search for and values are their
    replacements. All substitutions happen in one left-to-right pass over the
    text, so the output of one replacement is never re-scanned for other keys.
    """

    def _make_regex(self):
        """Build a regex that matches any key of the dictionary literally.

        Keys are tried longest-first so that a key which is a prefix of
        another key (e.g. "a" and "ab") cannot shadow the longer one. Keys of
        equal length keep their insertion order.
        """
        ordered_keys = sorted(self.keys(), key=len, reverse=True)
        return re.compile("|".join(map(re.escape, ordered_keys)))

    def __call__(self, match):
        """Handler invoked for each regex match; returns the replacement text."""
        return self[match.group(0)]

    def clean(self, text):
        """Substitute every key found in ``text`` with its value.

        Returns ``text`` unchanged when the dictionary is empty: an empty
        alternation would match the empty string at every position and the
        lookup in ``__call__`` would then fail with ``KeyError``.
        """
        if not self:
            return text
        return self._make_regex().sub(self, text)

In [4]:
# Literal substring -> replacement pairs, applied by Cleaner in a single pass.
# Cleaner passes every key through re.escape, so keys are matched as plain
# text and never interpreted as regular expressions.
replacements = {"\n": " ",
                "\t": " ",
                "-": " ",
                "...": " ",
                "won't": "will not",
                "can't": "can not",
                "&": " and ",
                # NOTE(review): because keys are escaped, this only matches the
                # literal three characters backslash, '$', '*'. It looks like a
                # regex for repeated dollar signs was intended - confirm.
                "\$*": "$",
                "Loading...": " ",
                "Continued...": " ",
                "\N{COPYRIGHT SIGN}": " ",
                "\N{NO-BREAK SPACE}": " ",
                "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
                "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
                # The last three entries swap closing punctuation and a double
                # quote so the punctuation ends up outside the quotation.
                '."': '".',
                '?"': '"?',
                '!"': '"!'
               }

In [5]:
# Entity label -> generic word that process() substitutes for every entity
# carrying that label.
entities = {'PERSON': 'person',
            'FAC': 'landmark',
            'ORG': 'organization',
            'GPE': 'place',
            'LOC': 'location',
            'EVENT': 'event',
            'WORK_OF_ART': 'artwork',
            'LAW': 'law',
            'DATE': 'date',
            'TIME': 'time',
            'PERCENT': 'percent',
            'MONEY': 'money',
            'QUANTITY': 'quantity',
            'CARDINAL': 'number'
}

# Sort key used by process(): entities are replaced in ascending order of
# these values, so labels with lower numbers are substituted first.
ent_order = {'PERSON': 8,
            'FAC': 2,
            'ORG': 1,
            'GPE': 6,
            'LOC': 7,
            'EVENT': 3,
            'WORK_OF_ART': 5,
            'LAW': 4,
            'DATE': 9,
            'TIME': 10,
            'PERCENT': 12,
            'MONEY': 11,
            'QUANTITY': 13,
            'CARDINAL': 14,
}

# Entity labels that process() leaves untouched in the text.
drop_ents = ['NORP', 'PRODUCT', 'LANGUAGE','ORDINAL']

In [6]:
preprocess = Cleaner(replacements)

In [7]:
def process(in_doc):
    """Filter an article's sentences and replace named entities with generic words.

    The text is parsed with the module-level ``nlp`` pipeline. Sentences that
    do not end in '.', '?' or '!' (optionally followed by spaces), or that
    contain the letters "you" in any case, are dropped. Entities found in the
    original text are then replaced in the kept text by the word that
    ``entities`` maps their label to, in ``ent_order`` order; labels listed in
    ``drop_ents`` are left alone. A second parse of the result replaces any
    remaining PERSON entities with "person".

    Parameters
    ----------
    in_doc : str
        Pre-cleaned article text.

    Returns
    -------
    str or None
        The processed text, or None when the detected language is not 'en',
        fewer than 13 sentences were kept, or more than 6 sentences contain a
        colon.
    """
    kept_sentences = 0
    out_doc = ""
    doc = nlp(in_doc)
    if doc._.language['language'] != 'en':
        return None
    colon_count = 0
    for sent in doc.sents:
        text = sent.text
        if ':' in text:
            colon_count += 1
        # direct appeal to reader or not a sentence
        # NOTE(review): '(?i)you' also matches inside words such as "young"
        # or "bayou" - confirm that dropping those sentences is intended.
        if not re.search('[.?!] *$', text) or re.search(r'(?i)you', text):
            continue
        out_doc += text + ' '
        kept_sentences += 1
    # too short for training or likely contains many unquoted quotations
    if kept_sentences < 13 or colon_count > 6:
        return None

    # De-duplicate on (text, label) rather than on span objects, then order by
    # label priority and longest text first. This makes the substitution order
    # deterministic and stops a short entity ("John") from breaking up a longer
    # one of the same label ("John Smith") before it can be matched.
    labelled = {(ent.text, ent.label_) for ent in doc.ents
                if ent.label_ not in drop_ents and ent.text}
    unknown_rank = len(ent_order) + 1  # labels missing from ent_order go last
    ordered = sorted(
        labelled,
        key=lambda item: (ent_order.get(item[1], unknown_rank), -len(item[0]), item[0]),
    )
    for ent_text, label in ordered:
        replacement = entities.get(label, ent_text)
        # A function replacement keeps the text literal (no backslash or
        # group-reference interpretation by re.sub).
        out_doc = re.sub(_entity_pattern(ent_text),
                         lambda _match, rep=replacement: rep,
                         out_doc)

    # Second pass: names that only become recognisable once the surrounding
    # entities have been replaced.
    leftover_people = {ent.text for ent in nlp(out_doc).ents
                       if ent.label_ == 'PERSON' and ent.text}
    for name in sorted(leftover_people, key=lambda n: (-len(n), n)):
        out_doc = re.sub(r'\b' + re.escape(name) + r'\b', 'person', out_doc)
    return out_doc


def _entity_pattern(ent_text):
    """Return a regex matching ``ent_text`` literally as a whole word.

    The text is escaped so characters such as '.', '(' or '+' inside an entity
    are not treated as regex syntax. Text starting with '$' gets no leading
    word boundary, because '$' is not a word character and ``\\b`` could not
    match in front of it after a space.
    """
    leading = '' if ent_text[0] == '$' else r'\b'
    return leading + re.escape(ent_text) + r'\b'

In [287]:
def convert_quotes(qq):
    """Turn a quoted passage into a run of placeholder "quote " tokens.

    ``qq`` is the quotation including its surrounding quote characters. The
    number of tokens grows with the whitespace-separated word count of ``qq``:
    up to 2 words -> 1, up to 12 -> 2, up to 25 -> 3, otherwise 4. If the
    character just before the closing quote is '.', '?' or '!', it is appended
    after the tokens.
    """
    last_inside = qq[-2]
    trailing = last_inside if last_inside in ['.', '?', '!'] else ''

    word_total = len(qq.split())
    repeats = 4
    for upper_bound, token_count in ((2, 1), (12, 2), (25, 3)):
        if word_total <= upper_bound:
            repeats = token_count
            break
    return 'quote ' * repeats + trailing

def reformat(article):
    """Normalise one raw article and run it through ``process``.

    The text is transliterated with ``unidecode``, passed through the
    ``preprocess`` substitutions, stripped of datelines, pipe-delimited
    headers, e-mail addresses/handles, URLs and bracketed text, and has its
    quotations replaced by "quote" placeholders before ``process`` is called.

    Parameters
    ----------
    article : object
        Raw article content; anything that is empty or not a ``str`` is
        rejected.

    Returns
    -------
    str or None
        The cleaned article, or None when the input is not a non-empty string,
        its double quotes are unbalanced, or ``process`` rejects it or fails.
    """
    if not article:
        return None
    if not isinstance(article, str):
        return None
    text = unidecode(article)
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return None
    text = preprocess.clean(text)
    text = re.sub(r'^(.{0,50})\(\w+\)', ' ', text) # delete dateline
    text = re.sub(r'\|.*\|', ' ', text) # delete category headers, bylines, etc. between pipe symbols
    text = re.sub(r'\S*@\S+', 'email', text) # replace email address or Twitter handle with "email"
    text = re.sub(r'[-a-zA-Z0-9@:%_\+.~#?&\/=]{2,256}\.[a-z]{2,4}(\/[-a-zA-Z0-9@:%_\+.~#?&\/=]*)?', ' website',
                  text) # URLs
    # raw string: same pattern as before, without invalid-escape warnings
    text = re.sub(r'[\[\(][^\[\(]*[\]\)]', '', text) # delete text inside parentheses or brackets
    text = re.sub(r"\b(\w*)n't", lambda m: m.group(1) + ' not', text) # replace remaining "xxn't" contractions with "xx not"
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r"^'|'$|(?<= )'|(?<!s)'(?= )", '"', text) # replace single quotes, but not apostrophes, with double quotes
    if text.count('\N{QUOTATION MARK}') % 2 != 0: # unbalanced quotation marks would cause improper processing
        return None
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r'(?i)please share this.*', '', text)
    text = re.sub(' +', ' ', text) # reduce all multiple spaces to single spaces
    try:
        return process(text)
    except Exception:
        # Best effort: an article the pipeline cannot handle is skipped.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through, so a long cleaning run can still be stopped.
        return None

In [9]:
def show_articles(df):
    """Lazily print cleaned versions of up to 10 randomly sampled articles.

    Each ``next()`` on the returned generator prints one sampled row through
    ``helper`` and yields ``helper``'s return value. The sample size is capped
    at the number of rows, so frames with fewer than 10 rows (including empty
    ones) no longer raise.
    """
    sample_size = min(len(df), 10)
    if sample_size == 0:
        return
    for ix, row in df.sample(sample_size).iterrows():
        yield helper(ix, row)
        
def helper(ix, row):
    """Print a row's index and domain, then its content after ``reformat``."""
    source_domain = row['domain']
    print(ix, ', ', source_domain)
    cleaned_text = reformat(row['content'])
    print(cleaned_text)

In [10]:
def show_contents(df):
    """Yield, domain by domain, a random sample of at most 10 rows.

    Domains are visited in the order returned by ``df['domain'].unique()``;
    each domain name is printed just before its sample is yielded.
    """
    for domain_name in df['domain'].unique():
        print(domain_name)
        domain_rows = df[df['domain'] == domain_name]
        sample_size = min(len(domain_rows), 10)
        yield domain_rows.sample(sample_size)

In [236]:
conspiracy = pd.read_csv('corpus/conspiracy.csv')

In [12]:
conspiracy['domain'].unique()

array(['blackgenocide.org', 'canadafreepress.com', 'awarenessact.com',
       'familysecuritymatters.org', 'zerohedge.com', 'jihadwatch.org',
       'themindunleashed.com', 'activistpost.com', 'infowars.com',
       'humansarefree.com', 'informationclearinghouse.info',
       'wikispooks.com', 'rense.com', 'prisonplanet.com',
       '21stcenturywire.com', 'thelibertybeacon.com', 'dcclothesline.com',
       'educate-yourself.org', 'henrymakow.com', 'americanfreepress.net',
       'nodisinfo.com', 'thedailysheeple.com', '82.221.129.208',
       'intellihub.com', 'assassinationscience.com',
       'whatreallyhappened.com', 'conspiracyplanet.com',
       'abovetopsecret.com', 'gaia.com', 'corbettreport.com',
       'whowhatwhy.org', 'govtslaves.info', 'pamelageller.com',
       'infiniteunknown.net', 'theeconomiccollapseblog.com', 'shoebat.com',
       'endoftheamericandream.com', 'fromthetrenchesworldreport.com',
       'countdowntozerotime.com', 'politicalblindspot.com',
       'jesus-is

In [237]:
consp_domains = show_contents(conspiracy)

In [269]:
next(consp_domains)

prisonplanet.com


Unnamed: 0,id,type,domain,content
105889,781497,conspiracy,prisonplanet.com,France proposes NATO military intervention\n\n...
99355,475689,conspiracy,prisonplanet.com,"Zero Hedge\n\nNovember 9, 2016\n\nFor months, ..."
323329,3532680,conspiracy,prisonplanet.com,"Get Alex Jones and Paul Joseph Watson's books,..."
97965,452748,conspiracy,prisonplanet.com,Paul Craig Roberts\n\nPrison Planet.com\n\nOct...
50476,299023,conspiracy,prisonplanet.com,Steve Watson\n\nPrison Planet.com\n\nDecember ...
308762,3104834,conspiracy,prisonplanet.com,Homeland Security prepares to entrench total o...
322784,3466492,conspiracy,prisonplanet.com,Dyncorp and Halliburton Sex Slave Scandal Won'...
403092,3997060,conspiracy,prisonplanet.com,Another Dubious Osama Tape Appears When The Ne...
223028,2295196,conspiracy,prisonplanet.com,Survey finds more Americans now believe UAVs u...
350872,3762903,conspiracy,prisonplanet.com,McCain Bill Is Lethal Injection For Internet F...


In [270]:
conspiracy.loc[308762, 'content']



In [272]:
consp_keepers = ['infowars.com', 'prisonplanet.com', '21stcenturywire.com',
                'educate-yourself.org', 'henrymakow.com', 'nodisinfo.com', 'intellihub.com',
                'pamelageller.com', 'shoebat.com', 'countdowntozerotime.com', 'thepoliticalinsider.com',
                'greanvillepost.com', 'whatdoesitmean.com', 'angrypatriotmovement.com', 'dataasylum.com',
                'sonsoflibertyradio.com', 'nasamoonhoax.com', 'sheepkillers.com']

In [271]:
reformat(conspiracy.loc[308762, 'content'])



In [276]:
def clean_file(infile, domains, outfile):
    """Clean the articles of selected domains in a CSV and write them to a new CSV.

    The input is read in chunks of 10,000 rows. Rows whose ``domain`` is one
    of ``domains`` are passed through ``reformat``; rows for which it returns
    a non-empty string are kept. Progress (the chunk number) is printed every
    tenth chunk and the number of kept rows is printed at the end.

    Parameters
    ----------
    infile : str
        Path of a CSV with at least ``id``, ``domain`` and ``content`` columns.
    domains : iterable of str
        Domains whose articles should be kept.
    outfile : str
        Path of the CSV to write, with columns ``id``, ``domain`` and
        ``content`` plus the DataFrame index.
    """
    wanted_domains = list(set(domains))  # materialise once; accepts any iterable
    cleaned = []
    reader = pd.read_csv(infile, chunksize=10000)
    for chunk_number, chunk in enumerate(reader, start=1):
        if chunk_number % 10 == 0:
            print(chunk_number)
        # Filter in pandas first so Python-level iteration only touches the
        # rows that can actually be kept.
        selected = chunk[chunk['domain'].isin(wanted_domains)]
        for row in selected.itertuples():
            article = reformat(row.content)
            if article:
                cleaned.append((row.id, row.domain, article))
    df_cleaned = pd.DataFrame(cleaned, columns=['id', 'domain', 'content'])
    print(len(df_cleaned))
    df_cleaned.to_csv(outfile)

In [36]:
hate_keepers = ['barenakedislam.com', 'barnesreview.org', 'ihr.org', 'drrichswier.com', 'davidduke.com',
               'returnofkings.com', 'nationalvanguard.org', 'themuslimissue.wordpress.com', 'darkmoon.me',
               'glaringhypocrisy.com', 'truthfeed.com']

In [24]:
clean_file('corpus/hate.csv', hate_keepers, 'cleaned/hate_cleaned.csv')

In [16]:
political_credible = ['baptistnews.com', 'nationalreview.com', 'mintpressnews.com', 'theintercept.com', 'jacobinmag.com',
                     'foreignpolicyjournal.com', 'heritage.org', ]

In [30]:
clean_file('corpus/political.csv', political_credible, 'cleaned/pol_cred_cleaned.csv')

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
315751


In [36]:
political_bogus = ['dailycaller.com', 'breitbart.com', 'weeklystandard.com', 'pjmedia.com', 'freedomworks.org',
                  'alternet.org', 'conservativereview.com', 'commondreams.org', 'dailykos.com', 'thinkprogress.org',
                  'counterpunch.org', 'americannewsx.com', 'ronpaulinstitute.org', 'theblaze.com', 'newcoldwar.org',
                  'commentarymagazine.com', 'redstate.com', 'economicnoise.com', 'mrc.org', 'ijr.com', 'thefifthcolumnnews.com'
                  ]

In [37]:
clean_file('corpus/political.csv', political_bogus, 'cleaned/pol_bogus_cleaned.csv')

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
477775


In [31]:
fake_keepers = ['thecommonsenseshow.com', 'rickwells.us', 'viralliberty.com', 'downtrend.com', 'thelastgreatstand.com',
              'yesimright.com', 'usasupreme.com', 'usadailytime.com', 'freedomdaily.com', 'uspoln.com', 'usanewsflash.com',
              'onepoliticalplaza.com', 'thefreepatriot.org', 'donaldtrumpnews.co', 'goneleft.com', 'onlineconservativepress.com',
              'redrocktribune.com', 'redcountry.us', 'learnprogress.org', 'usadosenews.com', 'usafirstinformation.com',
              'enhlive.com', 'flashnewscorner.com']

In [32]:
clean_file('corpus/fake.csv', fake_keepers, 'cleaned/fake_cleaned.csv')

10
20
30
40
50
60
70
80
90
6081


In [33]:
bias_keepers = ['wnd.com', 'frontpagemag.com', 'americanthinker.com', 'dailywire.com', 'thegatewaypundit.com', 
               'antiwar.com', 'truthrevolt.org', 'patriotpost.us', 'russia-insider.com', 'paulcraigroberts.org',
               'vdare.com', 'off-guardian.org', 'jamesrgrangerjr.com', 'americablog.com', 'americasfreedomfighters.com',
               'heartland.org', 'palmerreport.com', 'thefederalistpapers.org', 'conservativetribune.com',
               'winningdemocrats.com', '100percentfedup.com', 'cowgernation.com', 'usherald.com', 'darkpolitricks.com',
               'newslogue.com', 'usapoliticstoday.com', 'counterjihad.com', 'platosguns.com', 'meanlefthook.com',
               'americanpatriotdaily.com', 'endingthefed.com', 'conservativefiringline.com', 'politicalcult.com',
               'readconservatives.news']

In [34]:
clean_file('corpus/bias.csv', bias_keepers, 'cleaned/bias_cleaned.csv')

10
20
30
40
50
60
70
80
90
100
110
88823


In [277]:
clean_file('corpus/conspiracy.csv', consp_keepers, 'cleaned/conspiracy_cleaned.csv')

10
20
30
40
50
60
70
80
17103


In [None]:
import os

In [None]:
# Load every JSON file in ./data into a single DataFrame.
# The listing is sorted so the load order is reproducible; the later
# drop_duplicates(keep='last') depends on it.
scraper_data = sorted(os.listdir('./data'))
frames = []
for file in scraper_data:
    try:
        df = pd.read_json('./data/{}'.format(file))
    except Exception as err:
        # Best-effort load: skip anything pandas cannot read as JSON, but say
        # so instead of dropping the file silently.
        print('skipping {}: {}'.format(file, type(err).__name__))
        continue
    frames.append(df)
# Concatenate once instead of growing the frame inside the loop.
scraped = pd.concat(frames) if frames else pd.DataFrame()

In [274]:
len(scraped)

65490

In [None]:
scraped.head()

In [275]:
scraped = scraped.drop_duplicates(['id'], keep='last')
len(scraped)

28670

In [None]:
# Show full cell text when displaying frames (-1 is used here as "no limit").
# NOTE(review): newer pandas releases expect None instead of -1 for this
# option - confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)

In [None]:
scraped.head()

In [None]:
len(df1.dropna())

In [None]:
len(df1.drop('id', axis=1).dropna())

In [None]:
len(df1.drop('domain', axis=1).dropna())

In [None]:
df1 = df1.dropna()

In [None]:
df1['type'].unique()

In [None]:
len(df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')])

In [None]:
df1 = df1[(df1['type'] != 'unreliable') & (df1['type'] !='unknown')]

In [None]:
df1.groupby('type').size()

In [None]:
df1[df1['type'] == 'political']['domain'].unique()