In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

In [2]:
import spacy
import re
from unidecode import unidecode
from spacy_langdetect import LanguageDetector
# Load the English spaCy pipeline and append a language detector as the last
# pipeline component; process() later reads its result from doc._.language.
# NOTE(review): the 'en' shortcut and passing a component instance to add_pipe
# look like the spaCy 2.x API — confirm the pinned spaCy version.
nlp = spacy.load('en')
nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

In [3]:
class Cleaner(dict):
    """Multiple-string-substitution dict.

    Keys are literal substrings to search for and values are their
    replacements. ``clean(text)`` replaces every occurrence of every key in a
    single regex pass.
    """

    def _make_regex(self):
        """Build a compiled regex that matches any key of the dictionary literally.

        Keys are tried longest-first so that a key which is a prefix of another
        key (e.g. ``"a"`` and ``"ab"``) cannot shadow the longer one. Keys of
        equal length keep their insertion order because ``sorted`` is stable.
        """
        keys = sorted(self.keys(), key=len, reverse=True)
        return re.compile("|".join(map(re.escape, keys)))

    def __call__(self, match):
        """Handler invoked for each regex match; returns the value stored for the matched key."""
        return self[match.group(0)]

    def clean(self, text):
        """Substitute the value for each key found in ``text`` and return the modified text."""
        if not self:
            # An empty alternation matches the empty string at every position and
            # would then fail with KeyError('') in __call__; nothing to replace.
            return text
        return self._make_regex().sub(self, text)

In [4]:
# Literal substring -> replacement pairs applied by Cleaner in a single pass.
# Keys are matched literally (Cleaner passes them through re.escape), not as regexes.
replacements = {"\n": " ",
                "\t": " ",
                "-": " ",
                "...": " ",
                "won't": "will not",
                "can't": "can not",
                "&": " and ",
                # Literal backslash + dollar + asterisk. Written as "\\$*" because "\$"
                # is an invalid escape sequence; the runtime value is unchanged.
                # NOTE(review): if this was meant as the regex \$* it never acts as
                # one, since keys are escaped before matching — confirm intent.
                "\\$*": "$",
                "Loading...": " ",
                "Continued...": " ",
                "\N{COPYRIGHT SIGN}": " ",
                "\N{NO-BREAK SPACE}": " ",
                "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
                "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}": " ",
                # Move sentence-ending punctuation outside a closing quotation mark.
                '."': '".',
                '?"': '"?',
                '!"': '"!'
               }

In [5]:
# Generic token substituted for each entity label that process() replaces.
entities = dict(
    PERSON='person',
    FAC='landmark',
    ORG='organization',
    GPE='place',
    LOC='location',
    EVENT='event',
    WORK_OF_ART='artwork',
    LAW='law',
    DATE='date',
    TIME='time',
    PERCENT='percent',
    MONEY='money',
    QUANTITY='quantity',
    CARDINAL='number',
)

# Sort key per entity label: process() replaces entities in ascending order of this value.
ent_order = dict(
    PERSON=8,
    FAC=2,
    ORG=1,
    GPE=6,
    LOC=7,
    EVENT=3,
    WORK_OF_ART=5,
    LAW=4,
    DATE=9,
    TIME=10,
    PERCENT=12,
    MONEY=11,
    QUANTITY=13,
    CARDINAL=14,
)

# Entity labels that process() excludes from replacement.
drop_ents = ['NORP', 'PRODUCT', 'LANGUAGE', 'ORDINAL']

In [6]:
preprocess = Cleaner(replacements)

In [7]:
def _entity_pattern(ent_text):
    """Return a regex source string that matches ``ent_text`` literally as a whole word."""
    # Entity text is data, not a regex: escape it so characters such as
    # '+', '(', '.' or '*' cannot raise re.error or match unintended text.
    escaped = re.escape(ent_text)
    if ent_text.startswith('$'):
        # match money strings: no leading \b, because '$' is not a word character
        return escaped + r'\b'
    # only match pattern as a word, not part of a word
    # NOTE(review): the trailing \b cannot match when the entity ends in a
    # non-word character followed by a space (e.g. "U.S. "); behavior kept as-is.
    return r'\b' + escaped + r'\b'


def process(in_doc):
    """Filter an article's sentences and replace named entities with generic tokens.

    Args:
        in_doc: article text to run through the ``nlp`` pipeline.

    Returns:
        The kept sentences, each followed by a space, with entity mentions
        replaced by the tokens in ``entities``; or None when the text is not
        detected as English, fewer than 13 sentences are kept, or more than 6
        sentences contain a colon.
    """
    doc = nlp(in_doc)
    if doc._.language['language'] != 'en':
        return None

    count = 0
    colon_count = 0
    out_doc = ""
    for sent in doc.sents:
        text = sent.text
        if ':' in text:
            colon_count += 1
        # direct appeal to reader or not a sentence
        if not re.search('[.?!] *$', text) or re.search(r'(?i)you', text):
            continue
        out_doc += text + ' '
        count += 1
    # too short for training or likely contains many unquoted quotations
    if count < 13 or colon_count > 6:
        return None

    # Deduplicate by (text, label) so each distinct mention is substituted once.
    unique_ents = {(ent.text, ent.label_)
                   for ent in doc.ents
                   if ent.text and ent.label_ not in drop_ents}
    # Labels missing from ent_order sort last instead of raising KeyError.
    fallback_rank = len(ent_order) + 1
    # Set iteration order is arbitrary, so sort fully: by label rank, then longest
    # text first so a short mention ("John") cannot break up a longer one
    # ("John Smith") of the same rank, then alphabetically for determinism.
    ordered = sorted(
        unique_ents,
        key=lambda pair: (ent_order.get(pair[1], fallback_rank), -len(pair[0]), pair[0]),
    )
    for ent_text, label in ordered:
        replacement = entities.get(label, ent_text)
        # A function replacement keeps backslashes in the replacement literal.
        out_doc = re.sub(_entity_pattern(ent_text), lambda m, r=replacement: r, out_doc)

    # Second pass: catch person names that only surface after the first substitution.
    people = {ent.text for ent in nlp(out_doc).ents if ent.label_ == 'PERSON' and ent.text}
    for name in sorted(people, key=lambda t: (-len(t), t)):
        out_doc = re.sub(_entity_pattern(name), 'person', out_doc)
    return out_doc

In [8]:
def convert_quotes(qq):
    """Collapse a quoted passage into placeholder 'quote ' tokens.

    ``qq`` is the quoted text including its surrounding quotation marks. The
    number of 'quote ' tokens depends on the whitespace-separated word count
    (<= 2 words: 1, <= 12: 2, <= 25: 3, otherwise 4). If the character just
    before the closing mark is '.', '?' or '!', it is appended after the tokens.
    """
    closing = qq[-2]
    tail = closing if closing in ('.', '?', '!') else ''

    words = len(qq.split())
    repeats = 4  # used when the passage is longer than every bound below
    for upper_bound, tokens in ((2, 1), (12, 2), (25, 3)):
        if words <= upper_bound:
            repeats = tokens
            break
    return 'quote ' * repeats + tail

def reformat(article):
    """Normalise a raw article and run it through ``process``.

    Args:
        article: raw article text; anything empty or not a ``str`` is rejected.

    Returns:
        The text returned by ``process``, or None when the input is empty or
        not a string, when its double quotation marks are unbalanced (before or
        after single quotes are converted), or when ``process`` returns None or
        raises an ordinary exception.
    """
    if not article or not isinstance(article, str):
        return None
    text = unidecode(article)
    if text.count('\N{QUOTATION MARK}') % 2 != 0:
        return None
    text = preprocess.clean(text)
    text = re.sub(r'^(.{0,50})\(\w+\)', ' ', text) # delete dateline
    text = re.sub(r'\|.*\|', ' ', text) # delete category headers, bylines, etc. between pipe symbols
    text = re.sub(r'\S*@\S+', 'email', text) # replace email address or Twitter handle with "email"
    text = re.sub(r'[-a-zA-Z0-9@:%_\+.~#?&\/=]{2,256}\.[a-z]{2,4}(\/[-a-zA-Z0-9@:%_\+.~#?&\/=]*)?', ' website',
                  text) # URLs
    # raw string: same pattern as before, without invalid-escape warnings
    text = re.sub(r'[\[\(][^\[\(]*[\]\)]', '', text) # delete text inside parentheses or brackets
    text = re.sub(r"\b(\w*)n't", lambda m: m.group(1) + ' not', text) # replace remaining "xxn't" contractions with "xx not"
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r"^'|'$|(?<= )'|(?<!s)'(?= )", '"', text) # replace single quotes, but not apostrophes, with double quotes
    if text.count('\N{QUOTATION MARK}') % 2 != 0: # unbalanced quotation marks would cause improper processing
        return None
    # second pass: replace passages that were delimited by single quotes until the line above
    text = re.sub(r'("[^"]*")', lambda m: convert_quotes(m.group(1)), text) # replace quoted text
    text = re.sub(r'(?i)please share this.*', '', text)
    text = re.sub(' +', ' ', text) # reduce all multiple spaces to single spaces
    try:
        output = process(text)
    except Exception:
        # Best effort: an article that fails processing is dropped. Catching
        # Exception rather than everything lets KeyboardInterrupt / SystemExit
        # propagate, so a long cleaning run can still be interrupted.
        output = None
    return output

In [9]:
def show_articles(df):
    """Lazily show up to 10 randomly sampled rows of ``df``, one per ``next()`` call.

    Each step calls ``helper(ix, row)`` for one sampled row and yields its
    return value. The sample size is capped at ``len(df)`` so frames with
    fewer than 10 rows do not make ``DataFrame.sample`` raise.
    """
    sample_size = min(len(df), 10)
    for ix, row in df.sample(sample_size).iterrows():
        yield helper(ix, row)
        
def helper(ix, row):
    """Print the row's index and domain, then the reformatted article content."""
    domain = row['domain']
    print(ix, ', ', domain)
    cleaned_text = reformat(row['content'])
    print(cleaned_text)

In [10]:
def show_contents(df):
    """Lazily yield a random sample of up to 10 rows for each distinct domain in ``df``.

    Domains are visited in the order returned by ``df['domain'].unique()``.
    Each step prints the domain name and then yields the sampled rows as a
    DataFrame.
    """
    for domain in df['domain'].unique():
        print(domain)
        # Filter once per domain and reuse the result for both the size and the sample.
        domain_rows = df[df['domain'] == domain]
        yield domain_rows.sample(min(len(domain_rows), 10))

In [236]:
conspiracy = pd.read_csv('corpus/conspiracy.csv')

In [12]:
conspiracy['domain'].unique()

array(['blackgenocide.org', 'canadafreepress.com', 'awarenessact.com',
       'familysecuritymatters.org', 'zerohedge.com', 'jihadwatch.org',
       'themindunleashed.com', 'activistpost.com', 'infowars.com',
       'humansarefree.com', 'informationclearinghouse.info',
       'wikispooks.com', 'rense.com', 'prisonplanet.com',
       '21stcenturywire.com', 'thelibertybeacon.com', 'dcclothesline.com',
       'educate-yourself.org', 'henrymakow.com', 'americanfreepress.net',
       'nodisinfo.com', 'thedailysheeple.com', '82.221.129.208',
       'intellihub.com', 'assassinationscience.com',
       'whatreallyhappened.com', 'conspiracyplanet.com',
       'abovetopsecret.com', 'gaia.com', 'corbettreport.com',
       'whowhatwhy.org', 'govtslaves.info', 'pamelageller.com',
       'infiniteunknown.net', 'theeconomiccollapseblog.com', 'shoebat.com',
       'endoftheamericandream.com', 'fromthetrenchesworldreport.com',
       'countdowntozerotime.com', 'politicalblindspot.com',
       'jesus-is

In [237]:
consp_domains = show_contents(conspiracy)

In [269]:
next(consp_domains)

prisonplanet.com


Unnamed: 0,id,type,domain,content
105889,781497,conspiracy,prisonplanet.com,France proposes NATO military intervention\n\n...
99355,475689,conspiracy,prisonplanet.com,"Zero Hedge\n\nNovember 9, 2016\n\nFor months, ..."
323329,3532680,conspiracy,prisonplanet.com,"Get Alex Jones and Paul Joseph Watson's books,..."
97965,452748,conspiracy,prisonplanet.com,Paul Craig Roberts\n\nPrison Planet.com\n\nOct...
50476,299023,conspiracy,prisonplanet.com,Steve Watson\n\nPrison Planet.com\n\nDecember ...
308762,3104834,conspiracy,prisonplanet.com,Homeland Security prepares to entrench total o...
322784,3466492,conspiracy,prisonplanet.com,Dyncorp and Halliburton Sex Slave Scandal Won'...
403092,3997060,conspiracy,prisonplanet.com,Another Dubious Osama Tape Appears When The Ne...
223028,2295196,conspiracy,prisonplanet.com,Survey finds more Americans now believe UAVs u...
350872,3762903,conspiracy,prisonplanet.com,McCain Bill Is Lethal Injection For Internet F...


In [270]:
conspiracy.loc[308762, 'content']



In [272]:
consp_keepers = ['infowars.com', 'prisonplanet.com', '21stcenturywire.com',
                'educate-yourself.org', 'henrymakow.com', 'nodisinfo.com', 'intellihub.com',
                'pamelageller.com', 'shoebat.com', 'countdowntozerotime.com', 'thepoliticalinsider.com',
                'greanvillepost.com', 'whatdoesitmean.com', 'angrypatriotmovement.com', 'dataasylum.com',
                'sonsoflibertyradio.com', 'nasamoonhoax.com', 'sheepkillers.com']

In [271]:
reformat(conspiracy.loc[308762, 'content'])



In [11]:
def clean_file(infile, domains, outfile, outcount=None):
    """Clean the articles of selected domains from a CSV file and save the result.

    The input is read in chunks of 10,000 rows; every tenth chunk number is
    printed as a progress indicator. Rows whose ``domain`` is in ``domains``
    are passed through ``reformat``, and those it accepts are written to
    ``outfile`` with columns ``id``, ``domain`` and ``content``.

    Args:
        infile: path of the CSV file to read (needs ``id``, ``domain`` and
            ``content`` columns).
        domains: iterable of domain names to keep.
        outfile: path of the CSV file to write.
        outcount: ignored; the value is always recomputed. It is kept, now with
            a default, so both three- and four-argument calls work.

    Returns:
        The number of cleaned articles written.
    """
    wanted = set(domains)  # constant-time membership test inside the row loop
    cleaned = []
    reader = pd.read_csv(infile, chunksize=10000)
    for chunk_number, chunk in enumerate(reader, start=1):
        if chunk_number % 10 == 0:
            print(chunk_number)
        for row in chunk.itertuples():
            if row.domain in wanted:
                article = reformat(row.content)
                if article:
                    cleaned.append((row.id, row.domain, article))
    df_cleaned = pd.DataFrame(cleaned, columns=['id', 'domain', 'content'])
    outcount = len(df_cleaned)
    print(outcount)
    df_cleaned.to_csv(outfile)
    return outcount

In [36]:
hate_keepers = ['barenakedislam.com', 'barnesreview.org', 'ihr.org', 'drrichswier.com', 'davidduke.com',
               'returnofkings.com', 'nationalvanguard.org', 'themuslimissue.wordpress.com', 'darkmoon.me',
               'glaringhypocrisy.com', 'truthfeed.com']

In [24]:
hate_count = clean_file('corpus/hate.csv', hate_keepers, 'cleaned/hate_cleaned.csv')

In [161]:
unreliable_counts = [hate_count]
unreliable_counts

[13700]

In [16]:
political_credible = ['baptistnews.com', 'nationalreview.com', 'mintpressnews.com', 'theintercept.com', 'jacobinmag.com',
                     'foreignpolicyjournal.com', 'heritage.org', ]

In [30]:
pol_cred_count = clean_file('corpus/political.csv', political_credible, 'cleaned/pol_cred_cleaned.csv', )

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
315751


In [165]:
reliable_counts = [pol_cred_count]
reliable_counts

[315751]

In [36]:
political_bogus = ['dailycaller.com', 'breitbart.com', 'weeklystandard.com', 'pjmedia.com', 'freedomworks.org',
                  'alternet.org', 'conservativereview.com', 'commondreams.org', 'dailykos.com', 'thinkprogress.org',
                  'counterpunch.org', 'americannewsx.com', 'ronpaulinstitute.org', 'theblaze.com', 'newcoldwar.org',
                  'commentarymagazine.com', 'redstate.com', 'economicnoise.com', 'mrc.org', 'ijr.com', 'thefifthcolumnnews.com'
                  ]

In [37]:
pol_bogus_count = clean_file('corpus/political.csv', political_bogus, 'cleaned/pol_bogus_cleaned.csv')

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
477775


In [166]:
unreliable_counts.append(pol_bogus_count)
unreliable_counts

[13700, 477775]

In [31]:
fake_keepers = ['thecommonsenseshow.com', 'rickwells.us', 'viralliberty.com', 'downtrend.com', 'thelastgreatstand.com',
              'yesimright.com', 'usasupreme.com', 'usadailytime.com', 'freedomdaily.com', 'uspoln.com', 'usanewsflash.com',
              'onepoliticalplaza.com', 'thefreepatriot.org', 'donaldtrumpnews.co', 'goneleft.com', 'onlineconservativepress.com',
              'redrocktribune.com', 'redcountry.us', 'learnprogress.org', 'usadosenews.com', 'usafirstinformation.com',
              'enhlive.com', 'flashnewscorner.com']

In [32]:
fake_count = clean_file('corpus/fake.csv', fake_keepers, 'cleaned/fake_cleaned.csv')

10
20
30
40
50
60
70
80
90
6081


In [167]:
unreliable_counts.append(fake_count)

In [33]:
bias_keepers = ['wnd.com', 'frontpagemag.com', 'americanthinker.com', 'dailywire.com', 'thegatewaypundit.com', 
               'antiwar.com', 'truthrevolt.org', 'patriotpost.us', 'russia-insider.com', 'paulcraigroberts.org',
               'vdare.com', 'off-guardian.org', 'jamesrgrangerjr.com', 'americablog.com', 'americasfreedomfighters.com',
               'heartland.org', 'palmerreport.com', 'thefederalistpapers.org', 'conservativetribune.com',
               'winningdemocrats.com', '100percentfedup.com', 'cowgernation.com', 'usherald.com', 'darkpolitricks.com',
               'newslogue.com', 'usapoliticstoday.com', 'counterjihad.com', 'platosguns.com', 'meanlefthook.com',
               'americanpatriotdaily.com', 'endingthefed.com', 'conservativefiringline.com', 'politicalcult.com',
               'readconservatives.news']

In [34]:
bias_count = clean_file('corpus/bias.csv', bias_keepers, 'cleaned/bias_cleaned.csv')

10
20
30
40
50
60
70
80
90
100
110
88823


In [168]:
unreliable_counts.append(bias_count)

In [277]:
conspiracy_count = clean_file('corpus/conspiracy.csv', consp_keepers, 'cleaned/conspiracy_cleaned.csv')

10
20
30
40
50
60
70
80
17103


In [169]:
unreliable_counts.append(conspiracy_count)

In [170]:
sum(unreliable_counts)

603482

Too many of the unreliable articles (309,063 of the 603,482 counted above) come from dailykos.com, so the unreliable set will be reduced to 450,000 articles.

In [31]:
import os

In [171]:
# Load every scraper output file in ./data into one DataFrame.
# Sorted because os.listdir returns names in arbitrary order, and the later
# drop_duplicates(keep='last') depends on the order rows were concatenated in.
scraper_data = sorted(os.listdir('./data'))
frames = []
skipped_files = []  # (file name, error) for every file that could not be loaded
for file in scraper_data:
    try:
        frames.append(pd.read_json('./data/{}'.format(file)))
    except Exception as err:
        # Best effort: keep going, but record the failure instead of hiding it.
        skipped_files.append((file, repr(err)))
# Concatenate once at the end rather than growing the frame inside the loop.
scraped = pd.concat(frames) if frames else pd.DataFrame()
if skipped_files:
    print('Skipped {} unreadable file(s): {}'.format(
        len(skipped_files), [name for name, _ in skipped_files]))

In [172]:
len(scraped)

70024

In [179]:
scraped = scraped.drop_duplicates(['id'], keep='last')
len(scraped)

31194

In [180]:
scraped.to_csv('corpus/scraped.csv')

In [181]:
scraped.head()

Unnamed: 0,article,id
0,CARACAS (Reuters) - Venezuelan opposition lead...,https://www.reuters.com/article/us-venezuela-p...
1,TRIPOLI (Reuters) - The airport in the Libyan ...,https://www.reuters.com/article/us-libya-secur...
2,BERLIN (Reuters) - The leader of Germany’s rul...,https://www.reuters.com/article/us-eu-reform-g...
3,DUBAI (Reuters) - Iranian President Hassan Rou...,https://www.reuters.com/article/us-iran-pakist...
4,LAGOS (Reuters) - Nigerian voters returned to ...,https://www.reuters.com/article/us-nigeria-ele...


In [183]:
# Clean the scraped articles. They are given sequential ids starting at
# 100000000, and the 'domain' column receives the value of scraped's 'id'
# column (a URL in the preview of scraped shown above).
count = 99999999
cleaned = []
for source_id, raw_text in zip(scraped['id'], scraped['article']):
    article = reformat(raw_text)
    if not article:
        continue
    count += 1
    cleaned.append((count, source_id, article))
scraped_cleaned = pd.DataFrame(cleaned, columns=['id', 'domain', 'content'])

[315751, 31194, 31194]

In [197]:
scraped_count = len(scraped_cleaned)
reliable_counts.append(scraped_count)
#scraped_cleaned.to_csv('cleaned/scraped_cleaned.csv')
reliable_counts

[315751, 21650]

In [187]:
scraped_cleaned.head()

Unnamed: 0,id,domain,content
0,100000000,https://www.reuters.com/article/us-venezuela-p...,Venezuelan opposition leader person place on ...
1,100000001,https://www.reuters.com/article/us-nigeria-ele...,Nigerian voters returned to the polls on date...
2,100000002,https://www.reuters.com/article/us-italy-polit...,place's prime minister said on date tenders f...
3,100000003,https://www.reuters.com/article/us-mideast-cri...,The U.S. backed organization paused military ...
4,100000004,https://www.reuters.com/article/us-mideast-cri...,The organization refugee agency should have a...


In [206]:
needed = 450000 - sum(reliable_counts)
needed

75163

In [44]:
credible = pd.read_csv('corpus/credible.csv')

In [45]:
len(credible)

2015498

In [143]:
cred_samples = show_contents(credible)

In [158]:
next(cred_samples)

www.cbsnews.com


Unnamed: 0.1,Unnamed: 0,id,type,domain,content
358371,358373,7993238,reliable,www.cbsnews.com,"Father reveals 4-year-old's ""true face of canc..."
375559,376728,8046599,reliable,www.cbsnews.com,Singer Adele hints she is pregnant at concert ...
413479,417364,8162997,reliable,www.cbsnews.com,"The Battle for Mosul, The National Mood, The Z..."
395324,397924,8111131,reliable,www.cbsnews.com,"By Reena Flores CBS News November 27, 2016, 10..."
469562,476951,8324610,reliable,www.cbsnews.com,Cabinet hopefuls include Trump critics Novembe...
387124,389111,8084683,reliable,www.cbsnews.com,Stumble \nDETROIT -- Aretha Franklin’s lengthy...
414060,417983,8164659,reliable,www.cbsnews.com,"AP November 5, 2016, 12:35 PM Supreme Court OK..."
444452,450168,8250493,reliable,www.cbsnews.com,PARIS -- The flashbacks come to Denys Plaud un...
395957,398589,8113038,reliable,www.cbsnews.com,"SALEM, Mass. — Authorities say a drunken drive..."
375286,376438,8045695,reliable,www.cbsnews.com,Kanye West is on the road to recovery.\nThe 39...


In [159]:
credible.loc[395324, 'content']

'By Reena Flores CBS News November 27, 2016, 10:06 AM Donald Trump rages against Hillary Clinton team over recount efforts in Wisconsin Jill Stein recount attempt in Wisconsin: What you need to know \nMr. Trump took to his favorite social media platform, Twitter, calling the efforts “sad” and declared that “nothing will change” despite the time and money that will be spent. Hillary Clinton conceded the election when she called me just prior to the victory speech and after the results were in. Nothing will change — Donald J. Trump (@realDonaldTrump) November 27, 2016 \nThe president-elect also mentioned Clinton’s remarks during one presidential debate, when she called it “horrifying” that Mr. Trump was declining to say he would absolutely accept the general election results. Hillary\'s debate answer on delay: "That is horrifying. That is not the way our democracy works. Been around for 240 years. We\'ve had free -- — Donald J. Trump (@realDonaldTrump) November 27, 2016 and fair election

In [160]:
reformat(credible.loc[395324, 'content']) # another example of what the reformat() function does

"person conceded the election when she called me just prior to the victory speech and after the results were in. Nothing will change Donald J. person email date The president elect also mentioned person's remarks during number presidential debate, when she called it quote that Mr. person was declining to say he would absolutely accept the general election results. quote quote quote quote . Then, separately she stated, quote quote quote quote . She then said, quote quote quote quote . So much time and money will be spent same result! On date after the election tallies came in, person, in a concession speech , had urged her supporters to quote the results. quote quote she said at the time. quote quote . The tweetstorm date was a continuation of the president elect's rampage date on person's quote recount attempts, when he issued a lengthy statement about how the donations now amounting to over $6 million were just meant to quote quote . quote quote quote quote he said in a statement issu

In [130]:
credible = credible[~credible['domain'].isin(['in.reuters.com', 'af.reuters.com', 'uk.reuters.com',
                                             'washpost.bloomberg.com', 'www.theguardian.com', 'www.reuters.com',
                                             'feeds.reuters.com', 'online.wsj.com', 'ca.reuters.com'
                                            ])]

In [199]:
len(credible)

1948505

In [131]:
credible.groupby('domain').count().iloc[:,0].sort_values()

domain
www.theatlantic.com             612
www.buzzfeed.com                619
foreignpolicyjournal.com        889
www.politico.com               1556
www.nbcnews.com                1576
theintercept.com               2198
www.npr.org                    2736
www.cbsnews.com                3253
www.usatoday.com               4061
www.bloomberg.com              4156
www.nytimes.com                4550
www.washingtonpost.com         5067
www.latimes.com                5146
www.wsj.com                    5546
baptistnews.com                7642
abcnews.go.com                 8652
weeklystandard.com            28005
nationalreview.com           319556
nytimes.com                 1542685
Name: Unnamed: 0, dtype: int64

In [189]:
credible1 = credible[~credible['domain'].isin(['nationalreview.com', 'nytimes.com'])]

In [190]:
len(credible1)

86264

In [202]:
# Run every credible1 row through reformat() and keep only the articles it accepts.
cleaned = [
    (row.id, row.domain, article)
    for row in credible1.itertuples()
    for article in [reformat(row.content)]  # bind the result so it is computed once per row
    if article
]
credible1_cleaned = pd.DataFrame(cleaned, columns=['id', 'domain', 'content'])

In [209]:
credible1_count = len(credible1_cleaned)
credible1_cleaned.to_csv('cleaned/credible1_cleaned.csv')
reliable_counts.append(credible1_count)
needed = needed - credible1_count
needed

37727

In [214]:
credible2 = credible[credible['domain'] == 'nytimes.com']

In [215]:
len(credible2)

1542685

In [227]:
# Take credible2 rows in order, keeping the first `limit` articles that reformat() accepts.
limit = 53520 # Assembly of data sets in reliable_news.ipynb requires 53,520 NYT articles
cleaned = []
count = 0
for row in credible2.itertuples():
    article = reformat(row.content)
    if not article:
        continue
    cleaned.append((row.id, row.domain, article))
    count += 1
    if count == limit:
        break

credible2_cleaned = pd.DataFrame(cleaned, columns=['id', 'domain', 'content'])
credible2_cleaned.to_csv('cleaned/credible2_cleaned.csv')
len(credible2_cleaned)

53520

In [228]:
credible2_cleaned.head()

Unnamed: 0,id,domain,content
0,8335330,nytimes.com,"Groomed by a golden age of national triumph, A..."
1,8335331,nytimes.com,"date, the organization arcades are light, brig..."
2,8335333,nytimes.com,But the number dissimilar sets of characters s...
3,8335334,nytimes.com,President Boris N. person's resignation date d...
4,8335335,nytimes.com,"The tradition, which began date at the church,..."
