In [1]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from string import punctuation
from collections import Counter
import re
import requests
from datetime import datetime
#from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline
pd.options.mode.chained_assignment = None



In [2]:
cred_fp = '/ebs_volume/data/Credible/'
ncred_fp = '/ebs_volume/data/notCredible/'

In [3]:
articles = pd.DataFrame(columns=('label',
                                 'text',
                                 'title',
                                 'date',
                                 'source',
                                 'images',
                                 'videos',
                                 'url'))
i = 0    
for root, dirs, files in os.walk(cred_fp):
    for file in files:
        if file.endswith(".txt") and 'api' not in file:
             curr_file = os.path.join(root, file)
             #print curr_file
             with open(curr_file) as json_file:
                try:
                    data = json.load(json_file)
                    articles.loc[i] = [0,data["text"],data["title"],data["date"],data["source"],data["images"],data["videos"],data["url"]]
                    i+=1
                except ValueError:
                    continue

for root, dirs, files in os.walk(ncred_fp):
    for file in files:
        if file.endswith(".txt") and 'api' not in file:
             curr_file = os.path.join(root, file)
             #print curr_file
             with open(curr_file) as json_file:
                try:
                    data = json.load(json_file)
                    articles.loc[i] = [1,data["text"],data["title"],data["date"],data["source"],data["images"],data["videos"],data["url"]]
                    i+=1
                except ValueError:
                    continue

In [4]:
train, test = train_test_split(articles, test_size = 0.2)
print("train data shape:", train.shape)
print("test data shape:", test.shape)

train data shape: (3311, 8)
test data shape: (828, 8)


In [5]:
len(articles[articles["label"]==1.])

3228

In [6]:
len(articles[articles["label"]==0.])

911

In [7]:
articles.head()

Unnamed: 0,label,text,title,date,source,images,videos,url
0,0.0,View Images An uncrewed Dragon capsule makes t...,SpaceX Plans to Send People to the Moon in 201...,02-28-2017,national-geographic,[http://news.nationalgeographic.com/etc/design...,[https://www.youtube.com/embed/VJxvExL0pU4],http://news.nationalgeographic.com/2017/02/spa...
1,0.0,Watch: Impatient Elephant Disobeys Railway Rul...,Watch a Young Elephant Make a Surprising Railr...,02-28-2017,national-geographic,[http://pmdvod.nationalgeographic.com/NG_Video...,[],http://news.nationalgeographic.com/2017/02/ele...
2,0.0,View Images An advanced laser imaging techniqu...,This May Be Our Best Idea of What a Dinosaur R...,02-28-2017,national-geographic,[http://news.nationalgeographic.com/etc/design...,[],http://news.nationalgeographic.com/2017/02/anc...
3,0.0,WATCH: Visitors at Badaling Wildlife Park brea...,Bear Attacks Car In Chinese Zoo Famous For Tig...,02-28-2017,national-geographic,[http://pmdvod.nationalgeographic.com/NG_Video...,[https://www.youtube.com/embed/9xNzNqq-0n0?rel=0],http://news.nationalgeographic.com/2017/02/bea...
4,0.0,View Images Participants at the Slav and Vikin...,How to Eat Like a Viking,02-28-2017,national-geographic,[http://news.nationalgeographic.com/content/da...,[https://www.youtube.com/embed/oNEW4qPacGw],http://news.nationalgeographic.com/2017/02/eat...


In [8]:

print('articles shape:', articles.shape)

# remove duplicate description columns
articles = articles.drop_duplicates('text')
# remove rows with empty descriptions
articles = articles[~articles['text'].isnull()]
print('articles shape:', articles.shape)
articles['len'] = articles['text'].map(len)

articles = articles[articles.len > 140]
articles.reset_index(inplace=True)
articles.drop('index', inplace=True, axis=1)
print('articles shape:', articles.shape)

articles shape: (4139, 8)
articles shape: (3236, 8)
articles shape: (3124, 9)


In [9]:


#break the descriptions into sentences and then break the sentences into tokens
#remove punctuation and stop words
#lowercase the tokens
def tokenizer(text):
    try:
        tokens_ = [word_tokenize(sent) for sent in sent_tokenize(text)]
        
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent

        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        tokens = list(filter(lambda t: t not in punctuation, tokens))
        tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', 
                                            u'\u2014', u'\u2026', u'\u2013'], tokens))
        filtered_tokens = []
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)

        filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

        return filtered_tokens
    except Error as e:
        print(e)

In [10]:
articles['tokens'] = articles['text'].map(tokenizer)

In [11]:
for text, tokens in zip(articles['text'].head(5), articles['tokens'].head(5)):
    print('text:', text)
    print('tokens:', tokens)
    print() 

text: View Images An uncrewed Dragon capsule makes the journey to the International Space Station. The human-ready version has yet to fly in space.

Photograph by NASA

In a surprising and somewhat secretive press briefing, Elon Musk announced today that his company SpaceX intends to fly two paying passengers to the moon by late 2018. The pair reportedly approached SpaceX with the idea and have paid the company a “significant deposit.”

As envisioned, the mission would lift off from Pad 39A at NASA’s Kennedy Space Center in Cape Canaveral, Florida—the same launch pad from which the Apollo missions blasted off more than four decades ago, delivering astronauts into lunar orbit and onto the moon’s surface. The SpaceX passengers wouldn’t walk on the moon, though; the trip would slingshot them around the moon before returning to Earth.

“This presents an opportunity for humans to return to deep space for the first time in 45 years and they will travel faster and further into the Solar Syste

In [16]:
def keywords(source):
    tokens = articles[articles['source'] == source]['tokens']
    alltokens = []
    for token_list in tokens:
        alltokens += token_list
    counter = Counter(alltokens)
    return counter.most_common(10)

In [15]:
for source in set(data['source']):
    print('source :', source)
    print('top 10 keywords:', keywords(source))
    print('---')

source : p


KeyError: False

In [26]:
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf.columns = ['tfidf']

In [31]:
tfidf.columns

Index(['tfidf'], dtype='object')

In [30]:
tfidf.sort_values(by=['tfidf'], ascending=False).head(30)

Unnamed: 0,tfidf
f,5.565835
v,5.565835
p,5.314521
b,5.20916
w,5.11385
g,4.946796
u,4.872688
c,4.739157
h,4.29287
n,4.144449
