In [1]:
from argparse import Namespace
from collections import Counter
import json
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

from news_vocabulary_vectorizer_dataset import NewsDataset
from sklearn.metrics import accuracy_score

from news_classifier import *
from utilities import *

## Arguments

In [2]:
# Single configuration object shared by every later cell.
args = Namespace(
    frequency_cutoff=25,

    # --- Data and storage locations ---
    news_csv='data/fake_news/preprocessed_LITE.csv',
    save_dir='model_storage/fake_news/',
    # .pth is the conventional extension for a saved PyTorch model/state
    model_state_file='model.pth',
    vectorizer_file='news_vectorizer.json',
    train_state_file='train_state.json',

    # --- Model hyper-parameters ---
    loss_func=nn.BCELoss(),
    score_func=accuracy_score,

    # --- Training hyper-parameters ---
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=50,
    seed=42,

    # --- Runtime switches ---
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=True,

)

# Prefix the three artefact file names with save_dir (in the same order as
# before: vectorizer, model state, train state) and echo the resulting paths.
if args.expand_filepaths_to_save_dir:
    vars(args).update(
        vectorizer_file=os.path.join(args.save_dir, args.vectorizer_file),
        model_state_file=os.path.join(args.save_dir, args.model_state_file),
        train_state_file=os.path.join(args.save_dir, args.train_state_file),
    )

    print("Expanded file paths:")
    print(f"\t{args.vectorizer_file}")
    print(f"\t{args.model_state_file}")
    print(f"\t{args.train_state_file}")

# CUDA stays enabled only when it was requested *and* is actually available.
args.cuda = torch.cuda.is_available() and args.cuda

print(f"Using CUDA: {args.cuda}")

args.device = torch.device("cuda") if args.cuda else torch.device("cpu")

# Seed the RNGs so repeated runs are comparable.
set_seed_everywhere(args.seed, args.cuda)

# Hand the storage directory to the project helper before anything is saved
# (the recorded output shows it reporting that the directory exists).
handle_dirs(args.save_dir)

Expanded file paths:
	model_storage/fake_news/news_vectorizer.json
	model_storage/fake_news/model.pth
	model_storage/fake_news/train_state.json
Using CUDA: True
Directory for model storage exists at : model_storage/fake_news/


In [10]:
# Read the whole preprocessed corpus, then keep an independent copy of the
# rows whose `split` column marks them as the held-out test set.
data = pd.read_csv(args.news_csv)
test_data = data.loc[data['split'] == 'test'].copy()

In [11]:
# Peek at the first five test rows (5 is also the pandas default).
test_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,author,text,label,split
2080,22090,,Jethro,lots of t convertibles in iraq alone . honestl...,,test
2081,22977,the downside to cord cutting the new york times,Brian X. Chen,"this year , michael gartenberg , a former tech...",,test
2082,21942,romans launch protest against pope francis whe...,"Thomas D. Williams, Ph.D.",romans woke up saturday to find their city pla...,,test
2083,22652,roman polanski set for return to cannes with b...,Daniel Nussbaum,roman polanski s latest film has been added to...,,test
2084,24258,witches unite to cast binding spell on trump a...,"Thomas D. Williams, Ph.D.",a group of witches is attempting to use black ...,,test


## Initialization

In [12]:
# Reuse a previously saved vectorizer only when one actually exists on disk.
# On a fresh run (or after save_dir was cleared) there is nothing to reload,
# so fall back to building the vectorizer from the CSV and saving it; this
# keeps "Restart & Run All" working even with reload_from_files=True.
if args.reload_from_files and os.path.isfile(args.vectorizer_file):
    # Create dataset using class method
    print("Loading dataset and vectorizer...")
    dataset = NewsDataset.load_dataset_and_load_vectorizer(args.news_csv, args.vectorizer_file)
else:
    print("Loading dataset and creating vectorizer...")
    # NOTE(review): args.frequency_cutoff is defined but not passed here —
    # confirm the vectorizer's own default cutoff is the intended one.
    dataset = NewsDataset.load_dataset_and_make_vectorizer(args.news_csv)
    dataset.save_vectorizer(args.vectorizer_file)

print("Dataset and vectorizer loaded")

vectorizer = dataset.get_vectorizer()

# The model input width is the title vocabulary size plus the text vocabulary
# size, i.e. one feature per entry across both vocabularies.
classifier = Perceptron(num_features=len(vectorizer.title_vocab) + len(vectorizer.text_vocab))
print("Classifier instantiated ")

Loading dataset and creating vectorizer...
Dataset and vectorizer loaded
Classifier instantiated 


## Train Network

In [13]:
# nn.Module.to() moves the parameters in place and returns the same module.
classifier = classifier.to(args.device)

# The optimizer and LR scheduler are also stored on `args`, which is the
# object handed to the training helpers below.
args.optimizer = optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

# Halve the learning rate (factor=0.5) once the monitored quantity has stopped
# decreasing (mode='min') for more than one epoch (patience=1).
args.scheduler = scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode='min',
    factor=0.5,
    patience=1,
)

# Start from a fresh training-state record, run the training loop, persist the
# resulting state, and finish by displaying the per-epoch metrics table.
train_state = make_train_state(args)
train_state = train_network(classifier, dataset, train_state, args)
save_train_state(train_state, args)

train_state_to_dataframe(
    train_state,
    ['train_loss', 'train_acc', 'val_loss', 'val_acc'],
)

Classifier in training...


  0%|          | 0/50 [00:00<?, ?it/s]

split=train, epoch=0:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=0:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=1:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=1:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=2:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=2:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=3:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=3:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=4:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=4:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=5:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=5:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=6:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=6:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=7:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=7:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=8:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=8:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=9:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=9:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=10:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=10:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=11:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=11:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=12:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=12:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=13:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=13:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=14:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=14:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=15:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=15:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=16:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=16:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=17:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=17:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=18:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=18:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=19:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=19:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=20:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=20:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=21:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=21:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=22:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=22:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=23:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=23:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=24:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=24:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=25:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=25:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=26:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=26:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=27:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=27:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=28:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=28:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=29:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=29:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=30:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=30:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=31:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=31:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=32:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=32:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=33:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=33:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=34:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=34:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=35:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=35:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=36:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=36:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=37:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=37:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=38:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=38:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=39:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=39:   0%|          | 0/4 [00:00<?, ?it/s]

split=train, epoch=40:   0%|          | 0/13 [00:00<?, ?it/s]

split=val, epoch=40:   0%|          | 0/4 [00:00<?, ?it/s]

stopping early...
Classifier training done.


TypeError: train_state_to_dataframe() missing 1 required positional argument: 'columns'

In [3]:
# Re-read the persisted training state so the metrics table can be shown
# without re-running training.
train_state = load_train_state(args)

train_state_to_dataframe(
    train_state,
    ['train_loss', 'train_acc', 'val_loss', 'val_acc'],
)

Unnamed: 0,train_loss,train_acc,val_loss,val_acc
0,0.6528,0.6392,0.6084,0.8609
1,0.5464,0.8731,0.5034,0.8321
2,0.4726,0.9206,0.4677,0.9353
3,0.4194,0.9471,0.4238,0.9424
4,0.378,0.9513,0.3966,0.9448
5,0.3454,0.9567,0.3702,0.9472
6,0.3175,0.9663,0.3514,0.9472
7,0.2944,0.9723,0.3324,0.9472
8,0.2748,0.9741,0.3345,0.952
9,0.2567,0.9759,0.3133,0.9472


## Test Network

In [10]:
# Restore the persisted training state; it records where the model weights
# were saved under the 'model_filename' key.
train_state = load_train_state(args)

# map_location remaps the stored tensors onto args.device. Without it, a
# checkpoint written on a CUDA machine cannot be restored on a CPU-only one,
# even though the configuration cell already falls back to CPU in that case.
classifier.load_state_dict(
    torch.load(train_state['model_filename'], map_location=args.device)
)
classifier = classifier.to(args.device)


# Run the classifier over the dataset via the project helper and collect its
# predictions.
y_pred = test_network(classifier, dataset, train_state, args)

# Attach the predictions to the test rows.
# NOTE(review): assumes y_pred is ordered like the rows of test_data — confirm
# that test_network does not shuffle the test split.
test_data['pred'] = y_pred

test_data.sample(5)

split=test, epoch=40:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,title,author,text,label,split,pred
2095,20846,exclusive rep . dave brat ryancare a perverse ...,Neil W. McCabe,virginia republican congressman dave brat told...,,test,0.0
2541,25579,military veteran standing rock is the first ti...,Heather Callaghan,by will griffin i ve been on the wrong side of...,,test,1.0
2528,25139,comment on trips the story of how intellectual...,Foppe,by yves smith yves here . this real news netwo...,,test,1.0
2111,22226,"trump overrules tillerson , rejecting elliott ...","Maggie Haberman, Jonathan Weisman and Eric Lic...","president trump , japan s prime minister , sh...",,test,0.0
2207,22382,too many millennials are cool with communism,Iron Sheik,home society us news too many millennials are ...,,test,1.0


In [10]:
# How many of the highest- and lowest-weighted text words to list per class.
top_k = 50

# First row of the fc1 weight matrix, detached from autograd.
fc1_weights = classifier.fc1.weight.detach()[0]

# The feature vector is split at len(title_vocab): the leading slice is taken
# as the title weights and the remainder as the text weights.
title_weights = fc1_weights[: len(vectorizer.title_vocab)].cpu()
text_weights = fc1_weights[len(vectorizer.title_vocab):].cpu()

# Text-vocabulary indices ordered from largest to smallest weight.
_, indices = torch.sort(text_weights, dim=0, descending=True)
indices = indices.numpy().tolist()

# Top `top_k` words by weight (largest first). Slicing rather than indexing
# keeps this safe when the vocabulary has fewer than `top_k` entries.
print("Influential words in Fake News:")
print("--------------------------------------")
for word_index in indices[:top_k]:
    print(vectorizer.text_vocab.lookup_index(word_index))

print("====\n\n\n")

# Bottom `top_k` words by weight (smallest first).
print("Influential words in True News:")
print("--------------------------------------")
indices.reverse()
for word_index in indices[:top_k]:
    print(vectorizer.text_vocab.lookup_index(word_index))

Influential words in Fake News:
--------------------------------------
hillary
please
print
october
self
clinton
www
video
<UNK>
source
share
usa
doj
lo
email
corrupt
click
secret
et
org
pm
possibly
soros
que
post
bundy
un
co
earth
completely
voting
er
con
currently
non
posted
mail
super
trunews
aware
encourage
actually
reveals
por
subscribe
dump
masses
el
lie
jesus
====



Influential words in True News:
--------------------------------------
breitbart
twitter
mrs
follow
thursday
stephen
seemed
icon
hudson
basketball
whether
pamkeynen
nytimes
mr
played
spoke
maryland
signed
sports
statement
drove
calif
host
saturday
labor
msnbc
ownership
traveled
bannon
said
senator
discrimination
senior
flynn
suspects
neighborhood
stadium
disney
broad
faster
march
eliminate
mcconnell
actor
song
strategist
december
restaurants
brooklyn
sport
