In [1]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB



In [2]:
cred_fp = '/ebs_volume/data/Credible/'
ncred_fp = '/ebs_volume/data/notCredible/'

In [3]:
articles = pd.DataFrame(columns=('label',
                                 'text',
                                 'title',
                                 'date',
                                 'source',
                                 'images',
                                 'videos',
                                 'url'))
i = 0    
for root, dirs, files in os.walk(cred_fp):
    for file in files:
        if file.endswith(".txt") and 'api' not in file:
             curr_file = os.path.join(root, file)
             #print curr_file
             with open(curr_file) as json_file:
                try:
                    data = json.load(json_file)
                    articles.loc[i] = [0,data["text"],data["title"],data["date"],data["source"],data["images"],data["videos"],data["url"]]
                    i+=1
                except ValueError:
                    continue

for root, dirs, files in os.walk(ncred_fp):
    for file in files:
        if file.endswith(".txt") and 'api' not in file:
             curr_file = os.path.join(root, file)
             #print curr_file
             with open(curr_file) as json_file:
                try:
                    data = json.load(json_file)
                    articles.loc[i] = [1,data["text"],data["title"],data["date"],data["source"],data["images"],data["videos"],data["url"]]
                    i+=1
                except ValueError:
                    continue

In [4]:
articles.groupby('label').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,title,date,source,url
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,count,1592.0,1592,1592,1592,1592
0.0,unique,1449.0,1448,36,12,1489
0.0,top,,Article 50,04-04-2017,the-washington-post,http://www.independent.co.uk/topic/article-50
0.0,freq,20.0,7,65,180,7
1.0,count,4124.0,4124,4124,4124,4124
1.0,unique,3248.0,3363,43,14,3329
1.0,top,,"John McCain Illegally Travels To Syria, Meets ...",02-25-2017,activistpost,http://www.activistpost.com/2017/02/huge-week-...
1.0,freq,46.0,11,169,636,7


In [5]:
cred_articles = articles[articles["label"]==0.0]
num_cred_articles = len(cred_articles)
print("Number of credible articles in corpus: {}".format(num_cred_articles))

Number of credible articles in corpus: 1592


In [6]:
noncred_articles = articles[articles["label"]==1.0].sample(n=num_cred_articles)
even_articles = pd.concat([cred_articles, noncred_articles])

In [7]:
import re
caps = "([A-Z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + caps + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [8]:
even_articles['sentences']=even_articles['text'].apply(split_into_sentences)

In [9]:
print(even_articles['sentences'][0])

['View Images An uncrewed Dragon capsule makes the journey to the International Space Station.', 'The human-ready version has yet to fly in space.', 'Photograph by NASA  In a surprising and somewhat secretive press briefing, Elon Musk announced today that his company SpaceX intends to fly two paying passengers to the moon by late 2018.', 'The pair reportedly approached SpaceX with the idea and have paid the company a “significant deposit”.', 'As envisioned, the mission would lift off from Pad 39A at NASA’s Kennedy Space Center in Cape Canaveral, Florida—the same launch pad from which the Apollo missions blasted off more than four decades ago, delivering astronauts into lunar orbit and onto the moon’s surface.', 'The SpaceX passengers wouldn’t walk on the moon, though; the trip would slingshot them around the moon before returning to Earth.', '“This presents an opportunity for humans to return to deep space for the first time in 45 years and they will travel faster and further into the 

In [10]:

import sys
import csv
from collections import defaultdict


from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
def sent_analysis(in_string):
    sid = SentimentIntensityAnalyzer()
    counter=0
    total_neg=0
    total_pos=0
    total_neu=0
    total_compound=0
    sent=[]
    for sentence in in_string:
        #print(sentence)
        ss = sid.polarity_scores(sentence)
        total_neg=total_neg+ss['neg']
        total_pos=total_pos+ss['pos']
        total_neu=total_neu+ss['neu']
        total_compound=total_compound+ss['compound']
        counter=counter+1
        #for k in sorted(ss):
            #  print('{0}: {1}, '.format(k, ss[k]), end='')
        #print()
    sent.append(total_neg)
    sent.append(total_pos)
    sent.append(total_neu)
    # print (total_neg)
    #print (total_pos)
    #print (total_compound)
    if counter==0:
        avg_compound=0
    else:
        avg_compound=total_compound/(counter)

    sent.append(avg_compound)

    #print(avg_compound)
    return sent

In [14]:
even_articles['sent']=even_articles['sentences'].apply(sent_analysis)

In [16]:
train, test = train_test_split(even_articles, test_size = 0.2)
print("train data shape:", train.shape)
print("test data shape:", test.shape)

train data shape: (2547, 10)
test data shape: (637, 10)


In [85]:
#even_articles['sent'].head(20)
#print(even_articles['sent'][1])
#print(even_articles['sent'][1][1])

objs = pd.DataFrame(even_articles['sent'].tolist())
#pd.concat(objs, axis=1).drop('sent', axis=1)
#print(objs)
#test=np.array(objs)
#print(test)
#test=even_articles['sent'].reshape((-1,1))
objs=np.array(objs)
print(objs.shape)
test2=objs.reshape(3184,3)
print(test.shape)
print(test2.shape)
#sent_df = objs.apply(lambda x: pd.Series(x.split(',')))
#print(sent_df)

(3184, 3)
(3184, 1)
(3184, 3)


In [91]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression




k_fold = KFold(n_splits=5, shuffle=True)
scores = []
f_scores=[]
confusion = np.array([[0, 0], [0, 0]])
for train_index, test_index in k_fold.split(even_articles):
    train_sent = train.iloc[train_indices]['sent'].values
    
    
    train_objs = pd.DataFrame(train_sent.tolist())
    train_objs=np.array(train_objs)
    
    train_y = train.iloc[train_indices]['label'].values
    
    
    print("train data shape:", train_objs.shape)
    print("train classification shape", train_y.shape)

    test_sent = train.iloc[test_indices]['sent'].values
    test_objs = pd.DataFrame(test_sent.tolist())
    test_objs=np.array(test_objs)
    test_y = train.iloc[test_indices]['label'].values
    test_sent=np.array(test_sent)
    clf = LogisticRegression().fit(train_objs, train_y)
    #clf.fit(train_sent, train_y)
    predictions = clf.predict(test_objs)

    confusion += confusion_matrix(test_y, predictions)
    f_score = f1_score(test_y, predictions)
    score = accuracy_score(test_y, predictions)
    scores.append(score)
    f_scores.append(f_score)
    
print ('Cross Validation Metrics')
print('Total articles classified:', len(train))
print('Accuracy Score:', round(sum(scores)/len(scores),3))
print('F1 Score:', round(sum(f_scores)/len(f_scores),3))
print('Confusion matrix:')
print(confusion)

train data shape: (2123, 3)
train classification shape (2123,)
train data shape: (2123, 3)
train classification shape (2123,)
train data shape: (2123, 3)
train classification shape (2123,)
train data shape: (2123, 3)
train classification shape (2123,)
train data shape: (2123, 3)
train classification shape (2123,)
Cross Validation Metrics
Total articles classified: 2547
Accuracy Score: 0.627
F1 Score: 0.649
Confusion matrix:
[[600 435]
 [355 730]]


In [94]:
import xgboost as xgb

k_fold = KFold(n_splits=5, shuffle=True)
scores = []
f_scores=[]
confusion = np.array([[0, 0], [0, 0]])
for train_index, test_index in k_fold.split(even_articles):
    train_sent = train.iloc[train_indices]['sent'].values
    
    
    train_objs = pd.DataFrame(train_sent.tolist())
    train_objs=np.array(train_objs)
    
    train_y = train.iloc[train_indices]['label'].values
    
    
    print("train data shape:", train_objs.shape)
    print("train classification shape", train_y.shape)
    dtrain=xgb.DMatrix(train_objs, label=train_y)
    test_sent = train.iloc[test_indices]['sent'].values
    test_objs = pd.DataFrame(test_sent.tolist())
    test_objs=np.array(test_objs)
    test_y = train.iloc[test_indices]['label'].values
    test_sent=np.array(test_sent)
    dtest=xgb.DMatrix(test_objs, label=test_y)
    
    params={'eval_metric':['auc','error'],'eta': 0.1, 'seed': 0, 'objective':'binary:logistic', 'max_depth':6 }
    num_rounds=100
    bst=xgb.train(params, dtrain, num_rounds,evallist)
    predictions=bst.predict(dtest)

    confusion += confusion_matrix(test_y, predictions)
    f_score = f1_score(test_y, predictions)
    score = accuracy_score(test_y, predictions)
    scores.append(score)
    f_scores.append(f_score)
    
print ('Cross Validation Metrics')
print('Total articles classified:', len(train))
print('Accuracy Score:', round(sum(scores)/len(scores),3))
print('F1 Score:', round(sum(f_scores)/len(f_scores),3))
print('Confusion matrix:')
print(confusion)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
import matplotlib.pyplot as plt

