In [1]:
from argparse import Namespace
from collections import Counter
import json
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

from news_vocabulary_vectorizer_dataset import NewsDataset
from sklearn.metrics import accuracy_score

from news_classifier import *
from utilities import *

## Arguments

In [10]:
# Every experiment setting is collected on one Namespace; the same object is
# later passed to the training and evaluation helpers.
args = Namespace(
    # NOTE(review): not referenced by any cell visible in this notebook —
    # confirm it is actually consumed when the vectorizer is built.
    frequency_cutoff=25,

    # Input data and output artifact locations
    news_csv='data/fake_news/preprocessed_FULL.csv',
    save_dir='model_storage/fake_news/',
    # ".pth" is the customary extension for a serialized PyTorch model
    model_state_file='model.pth',
    vectorizer_file='news_vectorizer.json',
    train_state_file='train_state.json',

    # Loss and scoring callables
    loss_func=nn.BCELoss(),
    score_func=accuracy_score,

    # Optimisation settings
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=50,
    seed=42,

    # Runtime switches
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

# Optionally prefix each artifact file name with the storage directory
if args.expand_filepaths_to_save_dir:
    for path_attr in ("vectorizer_file", "model_state_file", "train_state_file"):
        setattr(args, path_attr, os.path.join(args.save_dir, getattr(args, path_attr)))

    print("Expanded file paths:")
    print(f"\t{args.vectorizer_file}")
    print(f"\t{args.model_state_file}")
    print(f"\t{args.train_state_file}")

# Fall back to CPU when no CUDA device is available; otherwise keep the
# requested setting untouched
args.cuda = torch.cuda.is_available() and args.cuda

print(f"Using CUDA: {args.cuda}")

args.device = torch.device("cuda") if args.cuda else torch.device("cpu")

# Seed the random number generators so runs are repeatable
set_seed_everywhere(args.seed, args.cuda)

# Prepare the output directory
handle_dirs(args.save_dir)

Expanded file paths:
	model_storage/fake_news/news_vectorizer.json
	model_storage/fake_news/model.pth
	model_storage/fake_news/train_state.json
Using CUDA: True
Directory for model storage exists at : model_storage/fake_news/


In [11]:
# Read the whole preprocessed corpus, then keep an independent copy of the
# rows whose `split` column marks them as test examples.
data = pd.read_csv(args.news_csv)
test_data = data.loc[data['split'] == 'test'].copy()

In [12]:
# Preview the first five rows of the test split
test_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,author,text,label,split
20800,21689,iraqis in mosul find us missiles at captured i...,Rowan Wolf,photo us missiles found in isis stronghold in...,,test
20801,23024,colin is looking for a songwriting partner,Poke Staff,next swipe left right colin is looking for a s...,,test
20802,25885,even hillary s niece is voting trump,Gillian,leave a reply she wants to be first woman pres...,,test
20803,21487,hillary tells massive lie this time it s about...,The Conservative Millennial,"comments at a florida rally , hillary clinton...",,test
20804,21600,"trump , russia , and the news story that wasn ...",Liz Spayd,late september was a frantic period for new yo...,,test


## Initialization

In [13]:
# Either build a fresh vectorizer from the CSV (and persist it), or reuse the
# one previously written to args.vectorizer_file.
# NOTE(review): args.frequency_cutoff is not passed to either loader here —
# confirm the intended cutoff is applied inside NewsDataset.
if not args.reload_from_files:
    print("Loading dataset and creating vectorizer...")
    dataset = NewsDataset.load_dataset_and_make_vectorizer(args.news_csv)
    dataset.save_vectorizer(args.vectorizer_file)
else:
    print("Loading dataset and vectorizer...")
    dataset = NewsDataset.load_dataset_and_load_vectorizer(args.news_csv, args.vectorizer_file)

print("Dataset and vectorizer loaded")

vectorizer = dataset.get_vectorizer()

# The model input width is the title vocabulary size plus the text vocabulary size
classifier = Perceptron(
    num_features=sum(len(vocab) for vocab in (vectorizer.title_vocab, vectorizer.text_vocab))
)
print("Classifier instantiated ")

Loading dataset and creating vectorizer...
Dataset and vectorizer loaded
Classifier instantiated 


## Train Network

In [14]:
# Put the model on the selected device before the optimizer captures its parameters
classifier = classifier.to(args.device)

optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

# Halve the learning rate once the monitored quantity stops decreasing
# (mode='min') for more than `patience` epochs.
# NOTE(review): which metric is fed to scheduler.step() is decided inside
# train_network — presumably validation loss; confirm.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode='min',
    factor=0.5,
    patience=1,
)

# The optimizer and scheduler are stored on args alongside the other settings
args.optimizer, args.scheduler = optimizer, scheduler

train_state = make_train_state(args)
train_state = train_network(classifier, dataset, train_state, args)

# Persist the training history, then display the per-epoch metrics
save_train_state(train_state, args)
train_state_to_dataframe(
    train_state,
    ['train_loss', 'train_acc', 'val_loss', 'val_acc'],
)

Classifier in training...


  0%|          | 0/50 [00:00<?, ?it/s]

split=train, epoch=0:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=0:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=1:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=1:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=2:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=2:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=3:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=3:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=4:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=4:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=5:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=5:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=6:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=6:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=7:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=7:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=8:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=8:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=9:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=9:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=10:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=10:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=11:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=11:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=12:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=12:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=13:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=13:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=14:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=14:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=15:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=15:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=16:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=16:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=17:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=17:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=18:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=18:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=19:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=19:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=20:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=20:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=21:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=21:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=22:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=22:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=23:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=23:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=24:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=24:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=25:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=25:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=26:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=26:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=27:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=27:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=28:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=28:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=29:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=29:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=30:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=30:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=31:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=31:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=32:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=32:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=33:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=33:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=34:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=34:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=35:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=35:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=36:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=36:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=37:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=37:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=38:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=38:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=39:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=39:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=40:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=40:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=41:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=41:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=42:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=42:   0%|          | 0/33 [00:00<?, ?it/s]

split=train, epoch=43:   0%|          | 0/130 [00:00<?, ?it/s]

split=val, epoch=43:   0%|          | 0/33 [00:00<?, ?it/s]

stopping early...
Classifier training done.


Unnamed: 0,train_loss,train_acc,val_loss,val_acc
0,0.4047,0.8944,0.2827,0.9433
1,0.2195,0.9651,0.208,0.9563
2,0.1578,0.9786,0.1714,0.9608
3,0.123,0.9855,0.1482,0.9664
4,0.0997,0.9899,0.1327,0.9688
5,0.0834,0.9927,0.1216,0.9704
6,0.0709,0.9945,0.1125,0.9704
7,0.0614,0.996,0.1058,0.9721
8,0.0537,0.9968,0.1015,0.9726
9,0.0474,0.9974,0.0973,0.9721


In [15]:
# Read the training history back from disk and display the same metric
# columns, this time from the persisted copy.
train_state = load_train_state(args)
train_state_to_dataframe(
    train_state,
    ['train_loss', 'train_acc', 'val_loss', 'val_acc'],
)

Unnamed: 0,train_loss,train_acc,val_loss,val_acc
0,0.4047,0.8944,0.2827,0.9433
1,0.2195,0.9651,0.208,0.9563
2,0.1578,0.9786,0.1714,0.9608
3,0.123,0.9855,0.1482,0.9664
4,0.0997,0.9899,0.1327,0.9688
5,0.0834,0.9927,0.1216,0.9704
6,0.0709,0.9945,0.1125,0.9704
7,0.0614,0.996,0.1058,0.9721
8,0.0537,0.9968,0.1015,0.9726
9,0.0474,0.9974,0.0973,0.9721


## Test Network

In [16]:
# Reload the persisted training state; it carries the checkpoint path under
# 'model_filename'.
train_state = load_train_state(args)

# map_location remaps the stored tensors onto the device chosen in the
# configuration cell. Without it, a checkpoint written on a GPU cannot be
# restored on a CPU-only machine, even though args.device already falls back
# to "cpu" there.
classifier.load_state_dict(
    torch.load(train_state['model_filename'], map_location=args.device)
)
classifier = classifier.to(args.device)


y_pred = test_network(classifier, dataset, train_state, args)

# NOTE(review): assumes test_network returns exactly one prediction per test
# row, in the same order as the rows of test_data — confirm against its
# DataLoader settings (shuffle / drop_last).
test_data['pred'] = y_pred

# Show a few random test rows together with their predictions
test_data.sample(5)

split=test, epoch=43:   0%|          | 0/41 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,title,author,text,label,split,pred
21926,25737,re rights ? in the new america you don t get a...,Chris Harper,rights ? in the new america you don t get any ...,,test,1.0
20879,22515,michael phelps makes political super bowl pick...,AP,houston ap michael phelps approaches his super...,,test,0.0
24060,21231,how does the soviet past affect russia s relat...,"RBTH, Nikolai Shevchenko","fall of the ussr , valdai , sochi in days when...",,test,1.0
22536,22721,"shutting down speech by elizabeth warren , g ....",Matt Flegenheimer,washington republicans seized her microphone ....,,test,0.0
25326,21501,"bored , broke and armed clues to chicago s gan...",John Eligon,chicago the young men who call themselves gang...,,test,0.0


In [17]:
# Weights of the first output unit of fc1: one weight per input feature
fc1_weights = classifier.fc1.weight.detach()[0]

# This cell treats the feature vector as [title vocabulary | text vocabulary],
# consistent with num_features = len(title_vocab) + len(text_vocab).
# NOTE(review): confirm the vectorizer concatenates in that order.
title_weights = fc1_weights[: len(vectorizer.title_vocab)].cpu()
text_weights = fc1_weights[len(vectorizer.title_vocab):].cpu()

# Order text-vocabulary indices from most positive to most negative weight
_, indices = torch.sort(text_weights, dim=0, descending=True)
indices = indices.numpy().tolist()

# Number of words listed per class; clamped so that a vocabulary smaller than
# the requested count does not raise IndexError.
top_k = min(50, len(indices))

# Words with the largest positive weights
print("Influential words in Fake News:")
print("--------------------------------------")
for i in indices[:top_k]:
    print(vectorizer.text_vocab.lookup_index(i))

print("====\n\n\n")

# Words with the most negative weights (same ranking, read from the other end)
print("Influential words in True News:")
print("--------------------------------------")
indices.reverse()
for i in indices[:top_k]:
    print(vectorizer.text_vocab.lookup_index(i))

Influential words in Fake News:
--------------------------------------
pravda
ru
elect
october
loading
snip
november
oct
anti
sputnik
print
nan
co
mins
rt
non
share
swipe
tags
shares
url
pro
self
mid
comments
al
source
nominee
telegram
ex
old
disqus
via
prev
www
utc
thirds
newsthump
ny
pre
tass
multi
nov
flickr
mailbox
getty
http
navigation
km
reprinted
====



Influential words in True News:
--------------------------------------
ap
breitbart
follow
twitter
pamkeynen
pam
hanchett
ianhanchett
jerome
jeromeehudson
magnifitrent
milo
saturday
trent
le
nussbaum
dznussbaum
baker
march
airs
siriusxm
tdwilliamsrome
ian
gwinn
themightygwinn
hudson
weekdays
sunday
igcolonel
warnerthuston
hotmail
sequel
huston
bkew
macron
ms
tillerson
kew
president
emmanuel
spicer
mattis
marlow
kassam
raheem
nationals
haley
nolan
warner
friday
