In [555]:
% matplotlib inline
import os
import json
import random
import re
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1) Load articles

In [810]:
cred_fp = '/ebs_volume/data/Credible/'
ncred_fp = '/ebs_volume/data/notCredible/'

articles = pd.DataFrame(columns=('label',
                                 'text',
                                 'title',
                                 'date',
                                 'source'))
i = 0    
for root, dirs, files in os.walk(cred_fp):
    for file in files:
        if file.endswith(".txt") and 'api' not in file:
             curr_file = os.path.join(root, file)
             #print curr_file
             with open(curr_file) as json_file:
                try:
                    data = json.load(json_file)
                    if data["source"] == "new-york-times":
                        articles.loc[i] = [0,data["text"],data["title"],data["date"],"the-new-york-times"]
                    else:                        
                        articles.loc[i] = [0,data["text"],data["title"],data["date"],data["source"]]
                    i+=1
                except ValueError:
                    continue

for root, dirs, files in os.walk(ncred_fp):
    for file in files:
        if file.endswith(".txt") and 'api' not in file:
             curr_file = os.path.join(root, file)
             #print curr_file
             with open(curr_file) as json_file:
                try:
                    data = json.load(json_file)
                    articles.loc[i] = [1,data["text"],data["title"],data["date"],data["source"]]
                    i+=1
                except ValueError:
                    continue

In [728]:
articles.groupby('label').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,source,text,title
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,count,2086,2086,2086.0,2086
0.0,unique,44,11,1899.0,1898
0.0,top,04-05-2017,the-new-york-times,,Article 50
0.0,freq,67,221,22.0,9
1.0,count,4648,4648,4648.0,4648
1.0,unique,51,14,3758.0,3873
1.0,top,02-25-2017,activistpost,,"John McCain Illegally Travels To Syria, Meets ..."
1.0,freq,169,695,46.0,11


In [811]:
#Remove duplicate articles
print(len(articles))
unique_articles = articles.drop_duplicates(subset = 'text')
print(len(unique_articles))

6734
5656


In [812]:
#Remove really short articles (<=200 chars)
print(len(unique_articles))
unique_articles = unique_articles[unique_articles["text"].str.len()>200]
print(len(unique_articles))

5656
5521


In [731]:
Counter(unique_articles["source"])

Counter({'ItMakesSenseBlog': 152,
         'activistpost': 362,
         'bbc-news': 206,
         'bostonglobe': 111,
         'darkmoon': 14,
         'dcclothesline': 432,
         'empirenews': 13,
         'gopthedailydose': 546,
         'independent': 215,
         'infostormer': 152,
         'latimes': 151,
         'national-geographic': 171,
         'nature': 20,
         'reuters': 211,
         'rickwells': 352,
         'success-street': 299,
         'the-new-york-times': 151,
         'the-wall-street-journal': 215,
         'the-washington-post': 218,
         'usa-today': 218,
         'usanewsflash': 494,
         'usapoliticsnow': 428,
         'usasupreme': 323,
         'usfanzone': 67})

In [813]:
#cred_articles = unique_articles[unique_articles["label"]==0.0]
cred_articles = unique_articles[unique_articles["source"].isin(["new-york-times","the-new-york-times","reuters","the-wall-street-journal","the-washington-post","usa-today"])]
num_cred_articles = len(cred_articles)
print("Number of credible articles: {}".format(num_cred_articles))
#noncred_articles = unique_articles[unique_articles["label"]==1.0]
noncred_articles = unique_articles[unique_articles["source"].isin(["activistpost","dcclothesline","gopthedailydose","infostormer","rickwells","success-street","usanewsflash","usapoliticsnow","usasupreme"])]
print("Number of non-credible articles: {}".format(len(noncred_articles)))

Number of credible articles: 1013
Number of non-credible articles: 3388


Since we desire an even number of credible/non-credible articles in our training set, we will need to downsample our non-credible set. We can sample in a way such that the number of credible and non-credible articles are equal for each day that we've been collecting data. This eliminates the possibility of a temporal bias appearing in our training set by chance occurence.

In [814]:
cred_articles = cred_articles[~cred_articles["date"].isin(list(set(cred_articles["date"]) - set(noncred_articles["date"])))]

date_cnts = Counter(cred_articles["date"])
noncred_even = pd.DataFrame(columns=('label','text','title','date','source'))

for date in date_cnts:
    noncred_even = pd.concat([noncred_even, noncred_articles[noncred_articles["date"]==date].sample(n=date_cnts[date])])

In [815]:
even_articles = pd.concat([cred_articles, noncred_even])
print("Train/Test on {} articles".format(len(even_articles)))

Train/Test on 1930 articles


In [816]:
source_counts = Counter(even_articles["source"])
print(source_counts)

# plt.bar(range(len(source_counts)), source_counts.values(), align='center')
# plt.xticks(range(len(source_counts)), source_counts.keys())

# plt.show()

Counter({'the-washington-post': 208, 'usa-today': 208, 'the-wall-street-journal': 205, 'reuters': 201, 'usanewsflash': 154, 'gopthedailydose': 153, 'the-new-york-times': 143, 'usapoliticsnow': 125, 'dcclothesline': 113, 'rickwells': 103, 'usasupreme': 96, 'activistpost': 96, 'success-street': 93, 'infostormer': 32})


In [817]:
vocab = []
for text in even_articles["text"]:
    words = text.split()
    for word in words:
        vocab.append(word.lower())
print("Size of Vocabulary: {}".format(len(set(vocab))))        

Size of Vocabulary: 83469


## 2) Content-Based Classifier

### 2.0.1) MNB on Raw Text

In [737]:
even_articles["text"].head(20)

435    Wilbur Ross stands after being sworn in as Sec...
436    Samsung Group chief, Jay Y. Lee arrives at the...
437    A refugee walks along railway tracks from the ...
438    Handout photo provided to Reuters on February ...
439    U.S. President Donald Trump addresses Joint Se...
440    U.S. President Donald Trump looks up while att...
441    U.S. President Donald Trump attends a meeting ...
442    Injured people are assisted after an incident ...
443    WASHINGTON The Republican chairman of the U.S....
444    MILWAUKEE A police officer and three other peo...
445    A Texas law that requires voters to show ident...
446    FILE PHOTO -- Chief Executive Officer of Unite...
447    WASHINGTON The United States has made slight a...
448    REFILE -- CORRECTING TYPO -- A student who was...
449    A view of Alabama State Capital, where Alabama...
450    U.S. Navy guided-missile destroyer USS Porter ...
451    FILE PHOTOS: A combination of file photos show...
452    FILE PHOTO - A U.S. F18 

In [823]:
count_vect = CountVectorizer(analyzer='word', ngram_range=(1,1), min_df=0)
tfidf = TfidfTransformer()
    
#Perform cross validation
k_fold = KFold(n_splits=5, shuffle=True)
mnb_scores = []
svm_scores = []
mnb_f_scores=[]
svm_f_scores=[]
mnb_confusion = np.array([[0, 0], [0, 0]])
svm_confusion = np.array([[0, 0], [0, 0]])

for train_index, test_index in k_fold.split(even_articles):
    train_text = even_articles.iloc[train_index]['text'].values
    train_counts = count_vect.fit_transform(train_text)
    train_tfidf = tfidf.fit_transform(train_counts)
    train_y = even_articles.iloc[train_index]['label'].values
    
    test_text = even_articles.iloc[test_index]['text'].values
    test_counts = count_vect.transform(test_text)
    test_tfidf = tfidf.transform(test_counts)
    test_y = even_articles.iloc[test_index]['label'].values
       
    #MNB CLASSIFIER
    mnb_clf = MultinomialNB().fit(train_tfidf, train_y)
    mnb_predictions = mnb_clf.predict(test_tfidf)

    mnb_confusion += confusion_matrix(test_y, mnb_predictions)
    mnb_f_score = f1_score(test_y, mnb_predictions)
    mnb_score = accuracy_score(test_y, mnb_predictions)
    mnb_scores.append(mnb_score)
    mnb_f_scores.append(mnb_f_score)
    
    #SVM CLASSIFIER
    svm_clf = SVC(kernel="linear").fit(train_tfidf, train_y)
    svm_predictions = svm_clf.predict(test_tfidf)

    svm_confusion += confusion_matrix(test_y, svm_predictions)
    svm_f_score = f1_score(test_y, svm_predictions)
    svm_score = accuracy_score(test_y, svm_predictions)
    svm_scores.append(svm_score)
    svm_f_scores.append(svm_f_score)

print('[MNB] -- Cross Validation Metrics')
print('Total articles classified:', len(train_index) + len(test_index))
print('Accuracy Score:', round(sum(mnb_scores)/len(mnb_scores),3))
print('F1 Score:', round(sum(mnb_f_scores)/len(mnb_f_scores),3))
print('Confusion matrix:')
print(mnb_confusion)
print()
print('[SVM] -- Cross Validation Metrics')
print('Total articles classified:', len(train_index) + len(test_index))
print('Accuracy Score:', round(sum(svm_scores)/len(svm_scores),3))
print('F1 Score:', round(sum(svm_f_scores)/len(svm_f_scores),3))
print('Confusion matrix:')
print(svm_confusion)

[MNB] -- Cross Validation Metrics
Total articles classified: 1930
Accuracy Score: 0.923
F1 Score: 0.922
Confusion matrix:
[[897  68]
 [ 81 884]]

[SVM] -- Cross Validation Metrics
Total articles classified: 1930
Accuracy Score: 0.953
F1 Score: 0.953
Confusion matrix:
[[929  36]
 [ 54 911]]


In [824]:
#Function returns log_prob_1 - log_prob_0 for each word in corpus & sorts by max (most predictive nc feats)/min (most predictive c feats)
def show_most_predictive_feats(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    features = pd.DataFrame()
    features["words"] = feature_names
    features["log_prob_0"] = clf.feature_log_prob_[0]
    features["log_prob_1"] = clf.feature_log_prob_[1]
    features["log_prob_diff"] = features["log_prob_1"] - features["log_prob_0"]
    features = features.drop('log_prob_0', 1).drop('log_prob_1', 1)
    features_c_sort = features.sort_values(by=["log_prob_diff"])
    features_nc_sort = features.sort_values(by=["log_prob_diff"], ascending=False)
    print("\t  *Credible Features*")
    print("\t", features_c_sort.head(n).to_string(index=False, header=False, col_space=15))
    print()
    print("\t*Non-Credible Features*")
    print("\t  ", features_nc_sort.head(n).to_string(index=False, header=False, col_space=15))
    
show_most_informative_feats(count_vect, mnb_clf, n=10)
print()
show_most_predictive_feats(count_vect, mnb_clf, n=10)

		*PREDICTION = 0* 			*PREDICTION = 1*
	-5.412631593601138	the		-5.432718657823109	the
	-6.14521640755403	to		-6.0837805396029205	to
	-6.291272545617064	of		-6.206127414723071	and
	-6.387079841852576	and		-6.2942655000975805	of
	-6.461800972719331	in		-6.567436289237195	that
	-6.8569461337935	that		-6.63820933336043	in
	-6.954092340490268	trump		-6.806882763508469	is
	-6.966637753289567	said		-7.033062190392557	trump
	-7.022346832511469	on		-7.064768473902584	for
	-7.15281376701698	for		-7.075807765224125	he

	  *Credible Features*
	 advertisement       -2.708488
       reuters       -2.314191
          main       -2.228524
          skip       -2.212761
            mr       -1.985877
         photo       -1.922435
           inc       -1.644990
         embed       -1.621755
       editing       -1.498180
           wsj       -1.405399

	*Non-Credible Features*
	   rickrwells        2.382595
           www        2.152644
         gowdy        1.950578
           com        1.942618
 

### 2.1) Filter Text

In [825]:
#Remove words with length <=2 from text
def remove_shortwords(in_string):
    out_string = in_string
    out_words = out_string.split()
    out_words = [word for word in out_words if len(word) > 2]
    out_string = ' '.join(word for word in out_words)
    return(out_string)

#Remove words that shouldn't have the predictive power they are showing (regularization step only necessary for training)
def remove_overfit_words(in_string, wordlist, sourcelist, phraselist):
    out_string = in_string
    for phrase in phraselist:
        out_string = out_string.replace(phrase, '')
    for source in sourcelist:
        out_string = out_string.replace(source, '')
    out_words = out_string.split()
    out_words = [word for word in out_words if word not in wordlist]
    out_string = ' '.join(word for word in out_words)
    return(out_string)

#Load in list of overfit words & phrases from training sources
with open("text_redactions.txt", "r") as infile:
    wordlist = []
    for line in infile:
        wordlist.append(line.replace('\n',''))

#Generate sources list
sources = list(set(even_articles['source']))
sourcelist = [source.replace('-', ' ') for source in sources]
sourcelist.append('rickrwells')
sourcelist.append('rickwells')
sourcelist.append('rick wells')
sourcelist.append('wall street journal')
sourcelist.append('gop the daily dose')
sourcelist.append('new york times')
sourcelist.append('washington post')
sourcelist.append('activist post')

#Generate indicative phrase list from training sources
phraselist = ["Share this:",
              "by usapoliticsnow admin",
              "Our Standards: The Thomson Reuters Trust Principles",
              "Don't forget to follow the D.C. Clothesline on Facebook and Twitter. PLEASE help spread the word by sharing our articles on your favorite social networks.",
              "Share With Your Friends On Facebook, Twitter, Everywhere",
              "Thank you for reading and sharing my work –  Please look for me, Rick Wells, at http://www.facebook.com/RickRWells/ , http://www.gab.ai/RickRWells , https://plus.google.com/u/0/+RickwellsUs and on my website http://RickWells.US  – Please SUBSCRIBE in the right sidebar at RickWells.US – not dot com.  I’m also at Stop The Takeover, https://www.facebook.com/StopTheTakeover/ and please follow me on Twitter @RickRWells. Subscribe also on my YouTube Channel."
              "Like this Article? Share it!",
              "Do you have information the public should know? Here are some ways you can securely send information and documents to Post journalists.",
              "Share news tips with us confidentially",
             "Share on Facebook",
             "Tweet on Twitter",
             "We encourage you to share and republish our reports, analyses, breaking news and videos (Click for details).",
             "Next post",
             "Previous post"]

even_articles['filtered_text'] = even_articles.apply(lambda x: remove_overfit_words(x['text'], wordlist=wordlist, sourcelist=sourcelist, phraselist=phraselist), axis=1)
even_articles['filtered_text'] = even_articles['filtered_text'].apply(remove_shortwords)

In [796]:
even_articles['filtered_text'].head(20)

435    Wilbur Ross stands after being sworn Secretary...
436    Samsung Group chief, Jay Lee arrives the offic...
437    refugee walks along railway tracks from the Un...
438    Handout provided Reuters February 13, 2017, Hu...
439    U.S. President Donald Trump addresses Joint Se...
440    U.S. President Donald Trump looks while attend...
441    U.S. President Donald Trump attends meeting wi...
442    Injured people are assisted after incident Wes...
443    WASHINGTON The Republican chairman the U.S. Ho...
444    MILWAUKEE police officer and three other peopl...
445    Texas law that requires voters show identifica...
446    FILE PHOTO Chief Executive Officer United Airl...
447    WASHINGTON The United States has made slight a...
448    REFILE CORRECTING TYPO student who was evacuat...
449    view Alabama State Capital, where Alabama Gove...
450    U.S. Navy guided-missile destroyer USS Porter ...
451    FILE PHOTOS: combination file photos show U.S....
452    FILE PHOTO U.S. F18 figh

### 2.1.1 MNB & SVM classifier on "filtered_text"

In [821]:
count_vect = CountVectorizer(analyzer='word', ngram_range=(1,1), min_df=0, lowercase=True, stop_words='english')
tfidf = TfidfTransformer()

#Perform cross validation
k_fold = KFold(n_splits=5, shuffle=True)
mnb_scores = []
svm_scores = []
mnb_f_scores=[]
svm_f_scores=[]
mnb_confusion = np.array([[0, 0], [0, 0]])
svm_confusion = np.array([[0, 0], [0, 0]])

for train_index, test_index in k_fold.split(even_articles):
    train_text = even_articles.iloc[train_index]['filtered_text'].values
    train_counts = count_vect.fit_transform(train_text)
    train_tfidf = tfidf.fit_transform(train_counts)
    train_y = even_articles.iloc[train_index]['label'].values

    test_text = even_articles.iloc[test_index]['filtered_text'].values
    test_counts = count_vect.transform(test_text)
    test_tfidf = tfidf.transform(test_counts)
    test_y = even_articles.iloc[test_index]['label'].values

    #MNB CLASSIFIER
    mnb_clf = MultinomialNB().fit(train_tfidf, train_y)
    mnb_predictions = mnb_clf.predict(test_tfidf)

    mnb_confusion += confusion_matrix(test_y, mnb_predictions)
    mnb_f_score = f1_score(test_y, mnb_predictions)
    mnb_score = accuracy_score(test_y, mnb_predictions)
    mnb_scores.append(mnb_score)
    mnb_f_scores.append(mnb_f_score)

    #SVM CLASSIFIER - LINEAR KERNEL
    svm_clf = SVC(kernel="linear").fit(train_tfidf, train_y)
    svm_predictions = svm_clf.predict(test_tfidf)

    svm_confusion += confusion_matrix(test_y, svm_predictions)
    svm_f_score = f1_score(test_y, svm_predictions)
    svm_score = accuracy_score(test_y, svm_predictions)
    svm_scores.append(svm_score)
    svm_f_scores.append(svm_f_score)

print('[MNB] -- Cross Validation Metrics')
print('Total articles classified:', len(train_index) + len(test_index))
print('Accuracy Score:', round(sum(mnb_scores)/len(mnb_scores),3))
print('F1 Score:', round(sum(mnb_f_scores)/len(mnb_f_scores),3))
print('Confusion matrix:')
print(mnb_confusion)
print()
print('[SVM-Linear] -- Cross Validation Metrics')
print('Total articles classified:', len(train_index) + len(test_index))
print('Accuracy Score:', round(sum(svm_scores)/len(svm_scores),3))
print('F1 Score:', round(sum(svm_f_scores)/len(svm_f_scores),3))
print('Confusion matrix:')
print(svm_confusion)

[MNB] -- Cross Validation Metrics
Total articles classified: 1930
Accuracy Score: 0.906
F1 Score: 0.901
Confusion matrix:
[[927  38]
 [143 822]]

[SVM-Linear] -- Cross Validation Metrics
Total articles classified: 1930
Accuracy Score: 0.948
F1 Score: 0.947
Confusion matrix:
[[923  42]
 [ 59 906]]


In [822]:
show_most_predictive_feats(count_vect, mnb_clf, n=100)

	  *Credible Features*
	 advertisement       -2.891066
       reuters       -2.356618
         photo       -2.345384
          skip       -2.269950
            mr       -2.019556
         embed       -1.680976
       editing       -1.580484
     lawmakers       -1.504561
         getty       -1.471699
      continue       -1.465324
       bentley       -1.439586
        bannon       -1.434518
    affordable       -1.434251
          usat       -1.405379
           gop       -1.388292
      medicaid       -1.345687
           wsj       -1.336679
            ms       -1.325515
            ly       -1.321291
        budget       -1.294524
    washington       -1.287409
      mulvaney       -1.276039
    republican       -1.237784
           cbo       -1.229778
        friday       -1.229068
     tillerson       -1.223693
          care       -1.222679
      insurers       -1.221202
        health       -1.219110
     humankind       -1.218277
     wednesday       -1.211277
          uber 

### 2.5) Perform stemming on words

In [638]:
#Perform stemming on words
def stem_words(in_string):
    snowball = nltk.stem.SnowballStemmer('english')
    out_string = in_string
    out_words = out_string.split()
    out_words = [snowball.stem(word) for word in out_words]
    out_string = ' '.join(word for word in out_words)
    return(out_string)

even_articles['stem_text'] = even_articles['filtered_text'].apply(stem_words)

In [414]:
even_articles['stem_text'].head(20)

0     view uncrew dragon capsul make journey intern ...
1     watch impati eleph disobey railway rule young ...
2     view advanc laser imag techniqu reveal new det...
3     watch visitor badal wildlif park break rule ro...
4     view particip slav vike festiv wolin poland te...
5     nuclear bomb help fight eleph poach radioact c...
6     view conserv clean surfac stone slab vener fin...
7     ancient bug attract mate rare amber find tell ...
8     sprawl million acr alaska nation wildlif refug...
9     view rusti patch bumblebe bombus affini first ...
10    giant deepsea octopus devour jellyfish—and kee...
11    view new speci long confus close relat pristim...
12    view secretari inspect newli print magazin chi...
13    view white rhino graze ranch belong john hume ...
14    long european settler establish commerci seal ...
15    watch sea creatur shimmer disappear eye thousa...
16    view phd candid hydrologist nathan reaver dive...
17    watch miss man found dead insid python war

In [639]:
count_vect = CountVectorizer(analyzer='word', ngram_range=(1,1), min_df=0)
tfidf = TfidfTransformer()

#Perform cross validation
k_fold = KFold(n_splits=5, shuffle=True)
mnb_scores = []
svm_scores = []
mnb_f_scores=[]
svm_f_scores=[]
mnb_confusion = np.array([[0, 0], [0, 0]])
svm_confusion = np.array([[0, 0], [0, 0]])

for train_index, test_index in k_fold.split(even_articles):
    train_text = even_articles.iloc[train_index]['stem_text'].values
    train_counts = count_vect.fit_transform(train_text)
    train_tfidf = tfidf.fit_transform(train_counts)
    train_y = even_articles.iloc[train_index]['label'].values

    test_text = even_articles.iloc[test_index]['stem_text'].values
    test_counts = count_vect.transform(test_text)
    test_tfidf = tfidf.transform(test_counts)
    test_y = even_articles.iloc[test_index]['label'].values

    #MNB CLASSIFIER
    mnb_clf = MultinomialNB().fit(train_tfidf, train_y)
    mnb_predictions = mnb_clf.predict(test_tfidf)

    mnb_confusion += confusion_matrix(test_y, mnb_predictions)
    mnb_f_score = f1_score(test_y, mnb_predictions)
    mnb_score = accuracy_score(test_y, mnb_predictions)
    mnb_scores.append(mnb_score)
    mnb_f_scores.append(mnb_f_score)

    #SVM CLASSIFIER - LINEAR KERNEL
    svm_clf = SVC(kernel="linear").fit(train_tfidf, train_y)
    svm_predictions = svm_clf.predict(test_tfidf)

    svm_confusion += confusion_matrix(test_y, svm_predictions)
    svm_f_score = f1_score(test_y, svm_predictions)
    svm_score = accuracy_score(test_y, svm_predictions)
    svm_scores.append(svm_score)
    svm_f_scores.append(svm_f_score)

print('[MNB] -- Cross Validation Metrics')
print('Total articles classified:', len(train_index) + len(test_index))
print('Accuracy Score:', round(sum(mnb_scores)/len(mnb_scores),3))
print('F1 Score:', round(sum(mnb_f_scores)/len(mnb_f_scores),3))
print('Confusion matrix:')
print(mnb_confusion)
print()
print('[SVM-Linear] -- Cross Validation Metrics')
print('Total articles classified:', len(train_index) + len(test_index))
print('Accuracy Score:', round(sum(svm_scores)/len(svm_scores),3))
print('F1 Score:', round(sum(svm_f_scores)/len(svm_f_scores),3))
print('Confusion matrix:')
print(svm_confusion)

[MNB] -- Cross Validation Metrics
Total articles classified: 3774
Accuracy Score: 0.912
F1 Score: 0.91
Confusion matrix:
[[1762  125]
 [ 208 1679]]

[SVM-Linear] -- Cross Validation Metrics
Total articles classified: 3774
Accuracy Score: 0.946
F1 Score: 0.946
Confusion matrix:
[[1818   69]
 [ 133 1754]]


In [460]:
show_most_informative_feats(count_vect, mnb_clf, n=10)
print()
show_most_predictive_feats(count_vect, mnb_clf, n=10)

		*PREDICTION = 0* 			*PREDICTION = 1*
	-6.581637671340962	said		-6.439528570241916	trump
	-6.674797391301648	trump		-6.863770423825693	presid
	-7.20902222634294	hous		-6.930768477843589	obama
	-7.257771146422305	presid		-7.416548171421603	state
	-7.410192441003275	would		-7.494673039701967	said
	-7.502050106489097	republican		-7.502306057801121	share
	-7.651935008041935	white		-7.512699854649074	peopl
	-7.707707172027965	peopl		-7.596972320112258	one
	-7.76349289903372	year		-7.6083777303958495	it
	-7.773412442952562	state		-7.640525240428502	american

	  *Credible Features*
	 brexit       -2.290469
     westminst       -1.980606
          skip       -1.948934
       theresa       -1.875615
           nhs       -1.796979
        labour       -1.795167
    parliament       -1.776802
        dodger       -1.763648
       britain       -1.728965
         korea       -1.708322

	*Non-Credible Features*
	   everywher        2.356065
      facebook        2.010376
          soro        1.97

## 3. "Tonal" Classifier

### 3.1) Sentiment Analysis

In [462]:
caps = "([A-Z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + caps + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [473]:
even_articles['sentences'] = even_articles['text'].apply(split_into_sentences)

In [616]:
def sent_analysis(text, uoa="sentences"):
    if uoa == "sentences":
        sid = SentimentIntensityAnalyzer()
        counter=0
        total_compound=0
        for sentence in text:
            ss = sid.polarity_scores(sentence)
            total_compound = total_compound + ss['compound']
            counter+=1

        if counter==0:
            avg_compound=0
        else:
            avg_compound = total_compound/counter

        return(avg_compound)
    
    elif uoa == "string":
        filtered_text = remove_cap_punc(text)
        sid = SentimentIntensityAnalyzer()
        ss = sid.polarity_scores(filtered_text)
        compound = ss['compound']
        return(compound)
    else:
        print("uoa (unit of analysis) not recognized")        

In [617]:
even_articles['text_sentiment'] = even_articles['sentences'].apply(sent_analysis)
even_articles['title_sentiment'] = even_articles.apply(lambda x: sent_analysis(x['title'], uoa="string"), axis=1)

In [618]:
print('Credible avg. sentiment score on text:', np.mean(even_articles['text_sentiment'][even_articles['label']==0]), '+/-', 2*np.std(even_articles['text_sentiment'][even_articles['label']==0]))
print('Non-Credible avg. sentiment score on text:', np.mean(even_articles['text_sentiment'][even_articles['label']==1]), '+/-', 2*np.std(even_articles['text_sentiment'][even_articles['label']==1]))
print()
print('Credible avg. sentiment score on title:', np.mean(even_articles['title_sentiment'][even_articles['label']==0]), '+/-', 2*np.std(even_articles['title_sentiment'][even_articles['label']==0]))
print('Non-Credible avg. sentiment score on title:', np.mean(even_articles['title_sentiment'][even_articles['label']==1]), '+/-', 2*np.std(even_articles['title_sentiment'][even_articles['label']==1]))

Credible avg. sentiment score on text: 0.014421412894685223 +/- 0.3383581262430465
Non-Credible avg. sentiment score on text: -0.012806591275834053 +/- 0.3476911003027185

Credible avg. sentiment score on title: -0.10641037911746429 +/- 0.7840593273817753
Non-Credible avg. sentiment score on title: -0.17674953387197012 +/- 0.8061239586588453


In [619]:
#Perform cross validation for logistic regression
k_fold = KFold(n_splits=5, shuffle=True)
scores = []
f_scores=[]
confusion = np.array([[0, 0], [0, 0]])
for train_index, test_index in k_fold.split(even_articles):
    train_ld = even_articles.iloc[train_index]['title_sentiment'].values
    train_y = even_articles.iloc[train_index]['label'].values
    
    test_ld = even_articles.iloc[test_index]['title_sentiment'].values
    test_y = even_articles.iloc[test_index]['label'].values

    clf = LogisticRegression().fit(train_ld.reshape(-1, 1), train_y)
    predictions = clf.predict(test_ld.reshape(-1, 1))

    confusion += confusion_matrix(test_y, predictions)
    f_score = f1_score(test_y, predictions)
    score = accuracy_score(test_y, predictions)
    scores.append(score)
    f_scores.append(f_score)

print('Cross Validation (LogisticRegression) Metrics')
print('Accuracy Score:', round(sum(scores)/len(scores),3))
print('F1 Score:', round(sum(f_scores)/len(f_scores),3))
print('Confusion matrix:')
print(confusion)

Cross Validation (LogisticRegression) Metrics
Accuracy Score: 0.539
F1 Score: 0.5
Confusion matrix:
[[993 616]
 [867 742]]


### 3.2) Punctuation Usage - Specifically "?" & "!"

In [513]:
#% of characters in title that are "?" or "!"
def pct_char_quesexcl(title):
    try:
        ques_excl = [char for char in title if char=='?' or char=='!']
        return(len(ques_excl)/len(title))
    except:
        return(0)        

#% of punctuation in text that is "?" or "!"
def pct_punct_quesexcl(in_string):
    try:
        punct = [char for char in in_string if char in string.punctuation]
        ques_excl = [p for p in punct if p=='?' or p=='!']
        return(len(ques_excl)/len(punct))
    except:
        return(0)

even_articles['pct_char_quesexcl_title'] = even_articles['title'].apply(pct_char_quesexcl)
even_articles['pct_punc_quesexcl_text'] = even_articles['text'].apply(pct_punct_quesexcl)

In [514]:
print('Credible % of punctuation that is "!" or "?":', np.mean(even_articles['pct_punc_quesexcl_text'][even_articles['label']==0]), '+/-', 2*np.std(even_articles['pct_punc_quesexcl_text'][even_articles['label']==0]))
print('Non-Credible % of punctuation that is "!" or "?":', np.mean(even_articles['pct_punc_quesexcl_text'][even_articles['label']==1]), '+/-', 2*np.std(even_articles['pct_punc_quesexcl_text'][even_articles['label']==1]))
print()
print('Credible % of characters in title that is "!" or "?":', np.mean(even_articles['pct_char_quesexcl_title'][even_articles['label']==0]), '+/-', 2*np.std(even_articles['pct_char_quesexcl_title'][even_articles['label']==0]))
print('Non-Credible % of characters in title that is "!" or "?":', np.mean(even_articles['pct_char_quesexcl_title'][even_articles['label']==1]), '+/-', 2*np.std(even_articles['pct_char_quesexcl_title'][even_articles['label']==1]))

Credible % of punctuation that is "!" or "?": 0.007873060526293822 +/- 0.03709217584514235
Non-Credible % of punctuation that is "!" or "?": 0.049374923598625595 +/- 0.14538822720121042

Credible % of characters in title that is "!" or "?": 0.0006084337081119179 +/- 0.007245049650615242
Non-Credible % of characters in title that is "!" or "?": 0.00541318681897949 +/- 0.02228699359987052


In [523]:
#Perform cross validation for logistic regression
k_fold = KFold(n_splits=5, shuffle=True)
scores = []
f_scores=[]
confusion = np.array([[0, 0], [0, 0]])
for train_index, test_index in k_fold.split(even_articles):
    train_pct = even_articles.iloc[train_index]['pct_char_quesexcl_title'].values
    train_y = even_articles.iloc[train_index]['label'].values
    
    test_pct = even_articles.iloc[test_index]['pct_char_quesexcl_title'].values
    test_y = even_articles.iloc[test_index]['label'].values

    clf = LogisticRegression().fit(train_pct.reshape(-1, 1), train_y)
    predictions = clf.predict(test_pct.reshape(-1, 1))

    confusion += confusion_matrix(test_y, predictions)
    f_score = f1_score(test_y, predictions)
    score = accuracy_score(test_y, predictions)
    scores.append(score)
    f_scores.append(f_score)

print('Cross Validation (LogisticRegression) Metrics')
print('Accuracy Score:', round(sum(scores)/len(scores),3))
print('F1 Score:', round(sum(f_scores)/len(f_scores),3))
print('Confusion matrix:')
print(confusion)

Cross Validation (LogisticRegression) Metrics
Accuracy Score: 0.541
F1 Score: 0.468
Confusion matrix:
[[910 699]
 [778 831]]


### 3.3) % words ALL CAPS in title

In [518]:
def pct_allcaps(title):
    try:
        translator = str.maketrans('', '', string.punctuation)
        title = title.translate(translator)
        words = title.split()
        all_caps = [word for word in words if word.isupper()]
        return(len(all_caps)/len(words))
    except:
        return(0)        

even_articles['pct_allcaps_title'] = even_articles['title'].apply(pct_allcaps)

In [519]:
print('Credible % ALL CAPS words in title:', np.mean(even_articles['pct_allcaps_title'][even_articles['label']==0]), '+/-', 2*np.std(even_articles['pct_allcaps_title'][even_articles['label']==0]))
print('Non-Credible % ALL CAPS words in title:', np.mean(even_articles['pct_allcaps_title'][even_articles['label']==1]), '+/-', 2*np.std(even_articles['pct_allcaps_title'][even_articles['label']==1]))

Credible % ALL CAPS words in title: 0.02495169303352554 +/- 0.09646197873766794
Non-Credible % ALL CAPS words in title: 0.1268097622440157 +/- 0.40129361119726975


In [520]:
#Perform cross validation for logistic regression
k_fold = KFold(n_splits=5, shuffle=True)
scores = []
f_scores=[]
confusion = np.array([[0, 0], [0, 0]])
for train_index, test_index in k_fold.split(even_articles):
    train_pct = even_articles.iloc[train_index]['pct_allcaps_title'].values
    train_y = even_articles.iloc[train_index]['label'].values
    
    test_pct = even_articles.iloc[test_index]['pct_allcaps_title'].values
    test_y = even_articles.iloc[test_index]['label'].values

    clf = LogisticRegression().fit(train_pct.reshape(-1, 1), train_y)
    predictions = clf.predict(test_pct.reshape(-1, 1))

    confusion += confusion_matrix(test_y, predictions)
    f_score = f1_score(test_y, predictions)
    score = accuracy_score(test_y, predictions)
    scores.append(score)
    f_scores.append(f_score)

print('Cross Validation (LogisticRegression) Metrics')
print('Accuracy Score:', round(sum(scores)/len(scores),3))
print('F1 Score:', round(sum(f_scores)/len(f_scores),3))
print('Confusion matrix:')
print(confusion)

Cross Validation (LogisticRegression) Metrics
Accuracy Score: 0.673
F1 Score: 0.638
Confusion matrix:
[[1239  370]
 [ 682  927]]


### 3.3.1) Logistic Regression & XGBoost classifier on derived "tonal" features

In [296]:
#Define function to reshape numpy array
def reshape_array(array):
    flipped_array = array.T
    return(flipped_array)

In [573]:
#Perform cross validation for logistic regression, XGBoost
k_fold = KFold(n_splits=5, shuffle=True)

lr_scores=[]
xgb_scores=[]
lr_f_scores=[]
xgb_f_scores=[]
lr_confusion = np.array([[0, 0], [0, 0]])
xgb_confusion = np.array([[0, 0], [0, 0]])
for train_index, test_index in k_fold.split(even_articles):
    train_x = np.array([even_articles.iloc[train_index]['pct_allcaps_title'].values,
                        even_articles.iloc[train_index]['pct_punc_quesexcl_text'].values,
                        even_articles.iloc[train_index]['pct_char_quesexcl_title'].values,
                        even_articles.iloc[train_index]['text_sentiment'].values,
                        even_articles.iloc[train_index]['title_sentiment'].values])
    train_y = even_articles.iloc[train_index]['label'].values

    test_x = np.array([even_articles.iloc[test_index]['pct_allcaps_title'].values,
                       even_articles.iloc[test_index]['pct_punc_quesexcl_text'].values,
                       even_articles.iloc[test_index]['pct_char_quesexcl_title'].values,
                       even_articles.iloc[test_index]['text_sentiment'].values,
                       even_articles.iloc[test_index]['title_sentiment'].values])
    test_y = even_articles.iloc[test_index]['label'].values

    #LOGISTIC REGRESSION
    lr_clf = LogisticRegression().fit(reshape_array(train_x), train_y)
    lr_predictions = lr_clf.predict(reshape_array(test_x))

    lr_confusion += confusion_matrix(test_y, lr_predictions)
    lr_f_score = f1_score(test_y, lr_predictions)
    lr_score = accuracy_score(test_y, lr_predictions)
    lr_scores.append(lr_score)
    lr_f_scores.append(lr_f_score)

    #XGBOOST
    xgb_clf = XGBClassifier(max_depth=3, n_estimators=100).fit(reshape_array(train_x), train_y)
    xgb_predictions = xgb_clf.predict(reshape_array(test_x))

    xgb_confusion += confusion_matrix(test_y, xgb_predictions)
    xgb_f_score = f1_score(test_y, xgb_predictions)
    xgb_score = accuracy_score(test_y, xgb_predictions)
    xgb_scores.append(xgb_score)
    xgb_f_scores.append(xgb_f_score)


print('Cross Validation (LogisticRegression) Metrics')
print('Accuracy Score:', round(sum(lr_scores)/len(lr_scores),3))
print('F1 Score:', round(sum(lr_f_scores)/len(lr_f_scores),3))
print('Confusion matrix:')
print(lr_confusion)
print()
print('Cross Validation (XGBoost) Metrics')
print('Accuracy Score:', round(sum(xgb_scores)/len(xgb_scores),3))
print('F1 Score:', round(sum(xgb_f_scores)/len(xgb_f_scores),3))
print('Confusion matrix:')
print(xgb_confusion)

Cross Validation (LogisticRegression) Metrics
Accuracy Score: 0.721
F1 Score: 0.688
Confusion matrix:
[[1333  276]
 [ 621  988]]

Cross Validation (XGBoost) Metrics
Accuracy Score: 0.769
F1 Score: 0.748
Confusion matrix:
[[1369  240]
 [ 502 1107]]


## 4) Check for Generalization

In [574]:
cred_fp = '/ebs_volume/data/Credible/'
ncred_fp = '/ebs_volume/data/notCredible/'

articles = pd.DataFrame(columns=('label',
                                 'text',
                                 'title',
                                 'date',
                                 'source'))
i = 0    
for root, dirs, files in os.walk(cred_fp):
    for file in files:
        if file.endswith(".txt") and 'api' not in file:
             curr_file = os.path.join(root, file)
             #print curr_file
             with open(curr_file) as json_file:
                try:
                    data = json.load(json_file)
                    articles.loc[i] = [0,data["text"],data["title"],data["date"],data["source"]]
                    i+=1
                except ValueError:
                    continue

for root, dirs, files in os.walk(ncred_fp):
    for file in files:
        if file.endswith(".txt") and 'api' not in file:
             curr_file = os.path.join(root, file)
             #print curr_file
             with open(curr_file) as json_file:
                try:
                    data = json.load(json_file)
                    articles.loc[i] = [1,data["text"],data["title"],data["date"],data["source"]]
                    i+=1
                except ValueError:
                    continue

In [575]:
#Remove duplicate articles
print(len(articles))
unique_articles = articles.drop_duplicates(subset = 'text')
print(len(unique_articles))

6235
5182


In [576]:
#Remove really short articles (<=200 chars)
print(len(unique_articles))
unique_articles = unique_articles[unique_articles["text"].str.len()>200]
print(len(unique_articles))

5182
5048


In [577]:
all_creds = unique_articles[unique_articles["label"]==0.0]
all_noncreds = unique_articles[unique_articles["label"]==1.0]

In [578]:
credible_sources = list(set(unique_articles["source"][unique_articles["label"]==0]))
non_credible_sources = list(set(unique_articles["source"][unique_articles["label"]==1]))

In [579]:
#Remove sources that don't contain enough articles for testing
credible_sources.remove('new-york-times')
credible_sources.remove('nature')
non_credible_sources.remove('empirenews')
non_credible_sources.remove('darkmoon')

In [580]:
#Shuffle lists and divide in 5 equal(ish) parts
random.shuffle(credible_sources)
random.shuffle(non_credible_sources)
credible_sources_array=np.array(credible_sources)
non_credible_sources_array=np.array(non_credible_sources)

credible_sources_arrays = np.split(credible_sources_array, [2, 4, 6, 8, 10])
non_credible_sources_arrays = np.split(non_credible_sources_array, [3, 5, 7, 9, 11])

In [581]:
credible_sources_arrays = credible_sources_arrays[:5]
non_credible_sources_arrays = non_credible_sources_arrays[:5]

In [623]:
scores_mnb = []
f_scores_mnb = []
confusion_mnb = np.array([[0, 0], [0, 0]])

scores_svm = []
f_scores_svm = []
confusion_svm = np.array([[0, 0], [0, 0]])

scores_xgb = []
f_scores_xgb = []
confusion_xgb = np.array([[0, 0], [0, 0]])

i=0

for cred_array in credible_sources_arrays:
    #1) Generate Train/Test splits by source
    cred_list = list(cred_array)    
    holdout_creds = all_creds[all_creds["source"].isin(cred_list)]
    train_creds = all_creds[~all_creds["source"].isin(cred_list)]
    
    for non_cred_array in non_credible_sources_arrays:
        non_cred_list = list(non_cred_array)
        holdout_noncreds = all_noncreds[all_noncreds["source"].isin(non_cred_list)].sample(n=len(holdout_creds))
        train_noncreds = all_noncreds[~all_noncreds["source"].isin(non_cred_list)].sample(n=len(train_creds))
        
        train_articles = pd.concat([train_creds, train_noncreds])
        test_articles = pd.concat([holdout_creds, holdout_noncreds])
        
        #2) Text preprocessing for bag of words (content-based) classifiers
        train_articles['filtered_text'] = train_articles['text'].apply(remove_cap_punc)
        test_articles['filtered_text'] = test_articles['text'].apply(remove_cap_punc)
        
        train_articles['filtered_text'] = train_articles.apply(lambda x: remove_overfit_words(x['filtered_text'], wordlist=wordlist, sourcelist=sourcelist, phraselist=phraselist), axis=1)
        test_articles['filtered_text'] = test_articles.apply(lambda x: remove_overfit_words(x['filtered_text'], wordlist=wordlist, sourcelist=sourcelist, phraselist=phraselist), axis=1)
        
        train_articles['filtered_text'] = train_articles['filtered_text'].apply(remove_shortwords)
        test_articles['filtered_text'] = test_articles['filtered_text'].apply(remove_shortwords)
        
        train_articles['filtered_text'] = train_articles['filtered_text'].apply(remove_stopwords)
        train_articles['filtered_text'] = train_articles['filtered_text'].apply(remove_shortwords)
        test_articles['filtered_text'] = test_articles['filtered_text'].apply(remove_stopwords)
        test_articles['filtered_text'] = test_articles['filtered_text'].apply(remove_shortwords)
        
        #3) MNB classification
        count_vect = CountVectorizer(analyzer='word', ngram_range=(1,1), min_df=0)
        tfidf = TfidfTransformer()
    
        confusion = np.array([[0, 0], [0, 0]])
        train_text = train_articles['filtered_text'].values
        train_counts = count_vect.fit_transform(train_text)
        train_tfidf = tfidf.fit_transform(train_counts)
        train_y = train_articles['label'].values

        test_text = test_articles['filtered_text'].values
        test_counts = count_vect.transform(test_text)
        test_tfidf = tfidf.transform(test_counts)
        test_y = test_articles['label'].values

        mnb_clf = MultinomialNB().fit(train_tfidf, train_y)
        mnb_predictions = mnb_clf.predict(test_tfidf)

        confusion_mnb += confusion_matrix(test_y, mnb_predictions)
        f_score_mnb = f1_score(test_y, mnb_predictions)
        score_mnb = accuracy_score(test_y, mnb_predictions)
        scores_mnb.append(score_mnb)
        f_scores_mnb.append(f_score_mnb)
        
        print("[MNB] -- Test on {0} & {1}".format(cred_list, non_cred_list))
        print('Total articles classified:', len(mnb_predictions))
        print('Accuracy Score:', round(score_mnb, 3))
        print('F1 Score:', round(f_score_mnb, 3))
        print('Confusion matrix:')
        print(confusion_matrix(test_y, mnb_predictions))
        print()
        
        #4) Linear SVM Classification
        svm_clf = SVC(kernel="linear").fit(train_tfidf, train_y)
        svm_predictions = svm_clf.predict(test_tfidf)

        confusion_svm += confusion_matrix(test_y, svm_predictions)
        f_score_svm = f1_score(test_y, svm_predictions)
        score_svm = accuracy_score(test_y, svm_predictions)
        scores_svm.append(score_svm)
        f_scores_svm.append(f_score_svm)
        
        print("[SVM] -- Test on {0} & {1}".format(cred_list, non_cred_list))
        print('Total articles classified:', len(svm_predictions))
        print('Accuracy Score:', round(score_svm, 3))
        print('F1 Score:', round(f_score_svm, 3))
        print('Confusion matrix:')
        print(confusion_matrix(test_y, svm_predictions))
        print()
        
        #5) Text preprocessing for tone-based classification
        train_articles['sentences'] = train_articles['text'].apply(split_into_sentences)
        train_articles['text_sentiment'] = train_articles['sentences'].apply(sent_analysis)
        train_articles['title_sentiment'] = train_articles.apply(lambda x: sent_analysis(x['title'], uoa="string"), axis=1)
        test_articles['sentences'] = test_articles['text'].apply(split_into_sentences)
        test_articles['text_sentiment'] = test_articles['sentences'].apply(sent_analysis)
        test_articles['title_sentiment'] = test_articles.apply(lambda x: sent_analysis(x['title'], uoa="string"), axis=1)
        
        train_articles['pct_char_quesexcl_title'] = train_articles['title'].apply(pct_char_quesexcl)
        train_articles['pct_punc_quesexcl_text'] = train_articles['text'].apply(pct_punct_quesexcl)
        test_articles['pct_char_quesexcl_title'] = test_articles['title'].apply(pct_char_quesexcl)
        test_articles['pct_punc_quesexcl_text'] = test_articles['text'].apply(pct_punct_quesexcl)
        
        train_articles['pct_allcaps_title'] = train_articles['title'].apply(pct_allcaps)
        test_articles['pct_allcaps_title'] = test_articles['title'].apply(pct_allcaps)
        
        #6) XGBoost Classification
        xgb_train_x = np.array([train_articles['pct_allcaps_title'].values,
                                train_articles['pct_punc_quesexcl_text'].values,
                                train_articles['pct_char_quesexcl_title'].values,
                                train_articles['text_sentiment'].values,
                                train_articles['title_sentiment'].values])
        xgb_train_y = train_articles['label'].values
        
        xgb_test_x = np.array([test_articles['pct_allcaps_title'].values,
                               test_articles['pct_punc_quesexcl_text'].values,
                               test_articles['pct_char_quesexcl_title'].values,
                               test_articles['text_sentiment'].values,
                               test_articles['title_sentiment'].values])
        xgb_test_y = test_articles['label'].values
        
        xgb_clf = XGBClassifier(max_depth=3, n_estimators=100).fit(reshape_array(xgb_train_x), xgb_train_y)
        xgb_predictions = xgb_clf.predict(reshape_array(xgb_test_x))

        confusion_xgb += confusion_matrix(xgb_test_y, xgb_predictions)
        f_score_xgb = f1_score(xgb_test_y, xgb_predictions)
        score_xgb = accuracy_score(xgb_test_y, xgb_predictions)
        scores_xgb.append(score_xgb)
        f_scores_xgb.append(f_score_xgb)
        
        print("[XGB] -- Test on {0} & {1}".format(cred_list, non_cred_list))
        print('Total articles classified:', len(xgb_predictions))
        print('Accuracy Score:', round(score_xgb, 3))
        print('F1 Score:', round(f_score_xgb, 3))
        print('Confusion matrix:')
        print(confusion_matrix(xgb_test_y, xgb_predictions))
        print()
                
        i+=1
        print("COMPLETED {0}/{1} ITERATIONS".format(i,len(credible_sources_arrays)*len(non_credible_sources_arrays)))
        print()
        
print("*---------------------------*")        
print('GENERALIZATION (MNB) Metrics')
print('Accuracy Score:', round(sum(scores_mnb)/len(scores_mnb),3))
print('F1 Score:', round(sum(f_scores_mnb)/len(f_scores_mnb),3))
print('Confusion matrix:')
print(confusion_mnb)
print()

print('GENERALIZATION (SVM) Metrics')
print('Accuracy Score:', round(sum(scores_svm)/len(scores_svm),3))
print('F1 Score:', round(sum(f_scores_svm)/len(f_scores_svm),3))
print('Confusion matrix:')
print(confusion_svm) 
print()

print('GENERALIZATION (XGB) Metrics')
print('Accuracy Score:', round(sum(scores_xgb)/len(scores_xgb),3))
print('F1 Score:', round(sum(f_scores_xgb)/len(f_scores_xgb),3))
print('Confusion matrix:')
print(confusion_xgb)

[MNB] -- Test on ['the-wall-street-journal', 'bbc-news'] & ['ItMakesSenseBlog', 'activistpost', 'infostormer']
Total articles classified: 762
Accuracy Score: 0.793
F1 Score: 0.744
Confusion matrix:
[[375   6]
 [152 229]]

[SVM] -- Test on ['the-wall-street-journal', 'bbc-news'] & ['ItMakesSenseBlog', 'activistpost', 'infostormer']
Total articles classified: 762
Accuracy Score: 0.833
F1 Score: 0.827
Confusion matrix:
[[332  49]
 [ 78 303]]

[XGB] -- Test on ['the-wall-street-journal', 'bbc-news'] & ['ItMakesSenseBlog', 'activistpost', 'infostormer']
Total articles classified: 762
Accuracy Score: 0.693
F1 Score: 0.619
Confusion matrix:
[[338  43]
 [191 190]]

COMPLETED 1/25 ITERATIONS

[MNB] -- Test on ['the-wall-street-journal', 'bbc-news'] & ['gopthedailydose', 'success-street']
Total articles classified: 762
Accuracy Score: 0.924
F1 Score: 0.919
Confusion matrix:
[[374   7]
 [ 51 330]]

[SVM] -- Test on ['the-wall-street-journal', 'bbc-news'] & ['gopthedailydose', 'success-street']
To

In [299]:
pizza_gate_text = """Comet Pizza is a pizza place owned by James Alefantis, who is the former gay boyfriend of David Brock, the CEO of Correct The Record. It has been the venue for dozens of events for the Hillary campaign staff. John Podesta has had campaign fundraisers there for both Barack Obama and Hillary Clinton. John’s brother and business partner Tony Podesta has his birthday party there every year. [https://i.sli.mg/1MqPHA.png]

It’s also a dive that according to reviews and photos has hidden bathroom doors and creepy murals. The bathrooms in particular have murals exclusively of nude women, as well as a great deal of graffiti relating to sex. Reviews of the restaurant are bizarrely polarized. Websites describing it positively note that there are regularly “unsupervised children running around”. Their menu include a pedophilic symbol, as do the signs and decorations of other neighboring businesses.

The music acts and the posters promoting same acts are bizarre in their presentation, content, and lyrical focus, but are still promoted as being “for all ages”. The overtly sexual content would suggest otherwise.

The same has taken place in reference to videos recorded inside Comet Ping Pong by people that frequent their establishment as well as video referencing Comet Ping Pong positively from the exterior.

While initially not the central focus of the investigation at the onset, Comet Ping Pong is a much more overt and much more disturbing hub of coincidences. Everyone associated with the business is making semi-overt, semi-tongue-in-cheek, and semi-sarcastic inferences towards sex with minors. The artists that work for and with the business also generate nothing but cultish imagery of disembodiment, blood, beheadings, sex, and of course pizza."""

In [300]:
#SINGLE EXAMPLE

print("original_text: {0} words, {1} characters".format(len(pizza_gate_text.split()), len(pizza_gate_text)))
print()
print(pizza_gate_text)

pizza_gate_filtered_text = remove_cap_punc(pizza_gate_text)
print()
print("Removed cap/punc: {0} words, {1} characters".format(len(pizza_gate_filtered_text.split()), len(pizza_gate_filtered_text)))
print()
print(pizza_gate_filtered_text)
       
pizza_gate_filtered_text = remove_overfit_words(pizza_gate_filtered_text, wordlist=wordlist, sourcelist=sourcelist, phraselist=phraselist)
print()
print("Removed overfit words/phrases: {0} words, {1} characters".format(len(pizza_gate_filtered_text.split()), len(pizza_gate_filtered_text)))
print()
print(pizza_gate_filtered_text)

pizza_gate_filtered_text = remove_shortwords(pizza_gate_filtered_text)
print()
print("Removed short words: {0} words, {1} characters".format(len(pizza_gate_filtered_text.split()), len(pizza_gate_filtered_text)))
print()
print(pizza_gate_filtered_text)

pizza_gate_filtered_text = remove_stopwords(pizza_gate_filtered_text)
pizza_gate_filtered_text = remove_shortwords(pizza_gate_filtered_text)
print()
print("Removed stop words: {0} words, {1} characters".format(len(pizza_gate_filtered_text.split()), len(pizza_gate_filtered_text)))
print()
print(pizza_gate_filtered_text)

pizza_gate_filtered_text = stem_words(pizza_gate_filtered_text)
print()
print("Removed stop words: {0} words, {1} characters".format(len(pizza_gate_filtered_text.split()), len(pizza_gate_filtered_text)))
print()
print(pizza_gate_filtered_text)

print()
print("Lexical Diversity: {}".format(lexical_diversity(pizza_gate_filtered_text)))
print("Punctuation Analysis: {}".format(pct_punct_quesexcl(pizza_gate_text)))

print(clf.predict(np.array([lexical_diversity(pizza_gate_filtered_text), pct_punct_quesexcl(pizza_gate_text)]).reshape(1,-1)))
print(clf.predict_proba(np.array([lexical_diversity(pizza_gate_filtered_text), pct_punct_quesexcl(pizza_gate_text)]).reshape(1,-1)))

original_text: 275 words, 1761 characters

Comet Pizza is a pizza place owned by James Alefantis, who is the former gay boyfriend of David Brock, the CEO of Correct The Record. It has been the venue for dozens of events for the Hillary campaign staff. John Podesta has had campaign fundraisers there for both Barack Obama and Hillary Clinton. John’s brother and business partner Tony Podesta has his birthday party there every year. [https://i.sli.mg/1MqPHA.png]

It’s also a dive that according to reviews and photos has hidden bathroom doors and creepy murals. The bathrooms in particular have murals exclusively of nude women, as well as a great deal of graffiti relating to sex. Reviews of the restaurant are bizarrely polarized. Websites describing it positively note that there are regularly “unsupervised children running around”. Their menu include a pedophilic symbol, as do the signs and decorations of other neighboring businesses.

The music acts and the posters promoting same acts are b