# Import Data and subset Data

## 1. Import

In [1]:
import numpy as np
import pandas as pd
import sklearn
import datetime

In [545]:
news = pd.read_csv("../data_intermed/news_bert.csv")
news = news[news['text']!=" "] # remove empty entries

In [546]:
distilBert_title_raw = np.load('../data_intermed/distilBert_title.npy')
distilBert_text_raw = np.load('../data_intermed/distilBert_text.npy')

In [547]:
def createLabels(data, col_name):
    labels = data[col_name].values
    y = np.zeros(labels.shape)
    y[labels == 'fake'] = 1
    return y

In [614]:
distilBert_title = distilBert_title_raw[news.index]
distilBert_text = distilBert_text_raw[news.index]
y = createLabels(news, 'label')

print(distilBert_text.shape)

(44271, 768)


## 2. Subset Data to only politics

In [9]:
news.subject.unique()

array(['politicsNews', 'worldnews', 'News', 'politics', 'Government News',
       'left-news', 'US_News', 'Middle-east'], dtype=object)

In [10]:
news_p = news[news['subject'].isin(['politicsNews','politics','Government News','left-news'])]

In [11]:
news_p.size
news_p.groupby('label').count().title

label
fake    12244
true    11271
Name: title, dtype: int64

In [615]:
distilBert_title_p = distilBert_title[news_p.index]
distilBert_text_p = distilBert_text[news_p.index]
y_p = createLabels(news_p, 'label')

print(distilBert_text_p.shape)

(23515, 768)


## 3. Combine title and text

In [13]:
distilBert_title_text = np.concatenate((distilBert_title, distilBert_text),1)
distilBert_title_text_p = np.concatenate((distilBert_title_p, distilBert_text_p),1)

In [16]:
distilBert_title_text.shape

(44271, 1536)

# Models

In [619]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import json

In [686]:
def modelEval(X_train, y_train, X_test, y_test, model, result_output_name, path = "../model_results/", printResults = True):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_perc, random_state=42)
    clf = model.fit(X_train, y_train)
    
    # predict
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    assert(y_pred_train.shape == y_train.shape)
    assert(y_pred_test.shape == y_test.shape)
    
    # save results in dictionary
    model_dict = {}
    model_dict['train accuracy'] = clf.score(X_train, y_train)
    model_dict['test accuracy'] = clf.score(X_test, y_test)
    model_dict['train f-score'] = f1_score(y_train, y_pred_train)
    model_dict['test f-score'] = f1_score(y_test, y_pred_test)
    
    # output the dictionary
    with open(path + result_output_name + ".json", "w") as outfile:  
        json.dump(model_dict, outfile) 
    
    # print
    if printResults == True:
        print('train accuracy:', model_dict['train accuracy'])
        print('test accuracy:', model_dict['test accuracy'])
        print('train f-score:', model_dict['train f-score'])
        print('test f-score:', model_dict['test f-score'])
    
    return clf, model_dict

In [631]:
for i in ['title','text','title_text']:
    exec("X_train_"+i+",X_test_"+i+",y_train_"+i+",y_test_"+i+
         "= train_test_split(distilBert_" +i + ",y, test_size=0.33, random_state=42)")
    exec("X_train_"+i+"_p ,X_test_"+i+"_p ,y_train_"+i+"_p ,y_test_"+i+
         "_p = train_test_split(distilBert_" +i + "_p ,y_p, test_size=0.33, random_state=42)")

## 1. Naive Bayes (need to update)

In [679]:
from sklearn.naive_bayes import GaussianNB

In [687]:
gnb = GaussianNB()

In [690]:
clf_nb_title = modelEval(X_train_title, y_train_title, X_test_title, y_test_title,
                         gnb, "results_nb_title")

train accuracy: 0.8516570580897475
test accuracy: 0.8499657768651608
train f-score: 0.8537914534458695
test f-score: 0.8528859060402684


In [691]:
clf_nb_text = modelEval(X_train_text, y_train_text, X_test_text, y_test_text, 
                        gnb, "results_nb_text")

train accuracy: 0.9221199554971174
test accuracy: 0.9243668720054757
train f-score: 0.924809582709459
test f-score: 0.9270193514298922


In [692]:
clf_nb_title_text = modelEval(X_train_title_text, y_train_title_text, X_test_title_text, y_test_title_text,
                              gnb, "results_nb_title_text")

train accuracy: 0.9204005259431577
test accuracy: 0.920123203285421
train f-score: 0.9226434258379477
test f-score: 0.9223191106969314


In [694]:
clf_nb_title_p = modelEval(X_train_title_p, y_train_title_p, X_test_title_p, y_test_title_p,
                           gnb, "results_nb_title_p")

train accuracy: 0.8703268803554427
test accuracy: 0.8743556701030928
train f-score: 0.8731607375675172
test f-score: 0.8778654641112363


In [696]:
clf_nb_text_p = modelEval(X_train_text_p, y_train_text_p, X_test_text_p, y_test_text_p,
                          gnb, "results_nb_text_p")

train accuracy: 0.9178673437004126
test accuracy: 0.9252577319587629
train f-score: 0.9189426208970183
test f-score: 0.9268046441191318


In [697]:
clf_nb_title_text_p = modelEval(X_train_title_text_p, y_train_title_text_p, X_test_title_text_p, y_test_title_text_p, 
                                gnb, "results_nb_title_text_p")

train accuracy: 0.9294827039035227
test accuracy: 0.9371134020618557
train f-score: 0.9311946491608348
test f-score: 0.938877755511022


## 2. Logistic Regression

In [621]:
from sklearn.linear_model import LogisticRegression

In [622]:
# cross validation
def crossValidationLR(X, y, hyperparameters):
    scores = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    for h in hyperparameters:
        model = LogisticRegression(random_state=0, max_iter=2000, C=h)
        scores.append(cross_val_score(estimator=model, X=X_train, y=y_train, cv=10).mean())
        
    return scores

In [665]:
# cross validation
scoresLR_title_text = crossValidationLR(X_train_title_text, y_train_title_text, range(1,10))
scoresLR_title_text_p = crossValidationLR(X_train_title_text_p, y_train_title_text_p, range(1,10))

In [667]:
scoresLR_title_text, scoresLR_title_text_p

([0.9940116795387322,
  0.9942129880440165,
  0.9942632898548815,
  0.9943135916657468,
  0.9943135916657468,
  0.9943639187920678,
  0.9943135916657468,
  0.9942129374131046,
  0.9942129374131046],
 [0.9966841698980323,
  0.9968736535975872,
  0.9966840801378716,
  0.9966839903777107,
  0.9967786873474077,
  0.9968733843171046,
  0.9968733843171046,
  0.9968733843171046,
  0.9967786873474077])

### create the lr that is the best
lr = LogisticRegression(random_state=0, max_iter = 2000, C=6)

In [698]:
clf_lr_title = modelEval(X_train_title, y_train_title, X_test_title, y_test_title, lr, "results_lr_title")

train accuracy: 0.9730285560163178
test accuracy: 0.9679671457905544
train f-score: 0.9738014147236049
test f-score: 0.9688664183076103


In [699]:
clf_lr_text = modelEval(X_train_text, y_train_text, X_test_text, y_test_text, lr, "results_lr_text")

train accuracy: 0.9963925693671825
test accuracy: 0.9907597535934292
train f-score: 0.9965045245173303
test f-score: 0.9910423993099331


In [701]:
clf_lr_title_text = modelEval(X_train_title_text, y_train_title_text, X_test_title_text, y_test_title_text, 
                              lr, "results_lr_title_text")

train accuracy: 0.9999662856950204
test accuracy: 0.9964407939767282
train f-score: 0.9999673512031081
test f-score: 0.9965508092332185


In [702]:
clf_lr_title_p = modelEval(X_train_title_p, y_train_title_p, X_test_title_p, y_test_title_p,
                           lr, "results_lr_title_p")

train accuracy: 0.9859727070771184
test accuracy: 0.9779639175257732
train f-score: 0.9864823536607744
test f-score: 0.9788654060066742


In [703]:
clf_lr_text_p = modelEval(X_train_text_p, y_train_text_p, X_test_text_p, y_test_text_p, 
                          lr, "results_lr_text_p")

train accuracy: 0.9998095842589654
test accuracy: 0.9949742268041237
train f-score: 0.9998170173833487
test f-score: 0.9951810206351168


In [704]:
clf_lr_title_text_p = modelEval(X_train_title_text_p, y_train_title_text_p, X_test_title_text_p, y_test_title_text_p, 
                                lr, "results_lr_title_text_p")

train accuracy: 1.0
test accuracy: 0.9983247422680412
train f-score: 1.0
test f-score: 0.9983932764800395


## 3. Linear Support Vector Machine

In [654]:
from sklearn.svm import LinearSVC #l2 penalty

In [661]:
# cross validation
def crossValidationLSVM(X, y, hyperparameters):
    scores = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    for h in hyperparameters:
        model = LinearSVC(max_iter = 50000, C=h)
        scores.append(cross_val_score(estimator=model, X=X_train, y=y_train, cv=10).mean())
        
    return scores

In [662]:
# cross validation
scoresLSVM_title_text = crossValidationLSVM(X_train_title_text, y_train_title_text, range(1,10))
scoresLSVM_title_text_p = crossValidationLSVM(X_train_title_text_p, y_train_title_text_p, range(1,10))

In [668]:
scoresLSVM_title_text, scoresLSVM_title_text_p

([0.9935587860327543,
  0.9933575281583817,
  0.9933575281583817,
  0.9932568992211953,
  0.9931562702840091,
  0.9931562702840091,
  0.9931562702840091,
  0.9931562702840091,
  0.9931059431576881],
 [0.9968735638374264,
  0.9968733843171048,
  0.9968733843171048,
  0.9968733843171048,
  0.9968733843171048,
  0.9969680812868017,
  0.9969680812868017,
  0.9969680812868017,
  0.9969680812868017])

In [669]:
lsvc = LinearSVC(max_iter = 10000, C=1)
lsvc_p = LinearSVC(max_iter = 50000, C=6)

In [705]:
clf_lsvm_title = modelEval(X_train_title, y_train_title, X_test_title, y_test_title, 
                           lsvc, "results_lsvm_title")

train accuracy: 0.9732982704561546
test accuracy: 0.9679671457905544
train f-score: 0.9740566037735848
test f-score: 0.9688581314878894


In [706]:
clf_lsvm_text = modelEval(X_train_text, y_train_text, X_test_text, y_test_text, 
                          lsvc, "results_lsvm_text")

train accuracy: 0.9976062843464482
test accuracy: 0.9909650924024641
train f-score: 0.9976811783533099
test f-score: 0.9912443618997081


In [707]:
clf_lsvm_title_text = modelEval(X_train_title_text, y_train_title_text, X_test_title_text, y_test_title_text, 
                                lsvc, "results_lsvm_title_text")

train accuracy: 1.0
test accuracy: 0.9960301163586585
train f-score: 1.0
test f-score: 0.9961553758451545


In [708]:
clf_lsvm_title_p = modelEval(X_train_title_p, y_train_title_p, X_test_title_p, y_test_title_p, 
                             lsvc_p, "results_lsvm_title_p")

train accuracy: 0.9902887972072358
test accuracy: 0.9740979381443299
train f-score: 0.9906530637180035
test f-score: 0.975255447494768


In [709]:
clf_lsvm_text_p = modelEval(X_train_text_p, y_train_text_p, X_test_text_p, y_test_text_p, 
                            lsvc_p, "results_lsvm_text_p")

train accuracy: 1.0
test accuracy: 0.9942010309278351
train f-score: 1.0
test f-score: 0.9944396391943655


In [710]:
clf_lsvm_title_text_p = modelEval(X_train_title_text_p, y_train_title_text_p, X_test_title_text_p, y_test_title_text_p,
                                  lsvc_p, "results_lsvm_title_text_p")

train accuracy: 1.0
test accuracy: 0.997680412371134
train f-score: 1.0
test f-score: 0.9977755808205635


## 4 Neural Network (rerun with dropout 0.2)

In [162]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
import torch.optim as optim
from datetime import datetime
import glob, os

In [587]:
class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, features, labels): #labels will be numpy (n,)
        'Initialization'
        self.features = torch.tensor(features, dtype=torch.float) #instance variables
        self.labels = torch.tensor(labels.reshape(-1,1), dtype=torch.float)

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.labels)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        X = self.features[index]
        y = self.labels[index]

        return X, y
    
    def getFeatures(self):
        return self.features

In [711]:
# create model class
class NeuralNetwork(nn.Module):
    """
    Simplr neural network to intake bert
    """
    
    def __init__(self, p):
        super(NeuralNetwork, self).__init__()
        self.hidden = 700
        self.l1 = nn.Linear(p, self.hidden, bias=True)  
        self.l2 = nn.Linear(self.hidden, 1, bias=True) 
        
    def forward(self, x):    
        model = torch.nn.Sequential(
            self.l1, 
            nn.Tanh(), 
            nn.BatchNorm1d()
            nn.Dropout(0.2), 
            self.l2, 
            nn.Sigmoid()
        )

        return model(x)

SyntaxError: invalid syntax (<ipython-input-711-d78502fc918a>, line 18)

In [636]:
# create function for training (similar to hw8)
def train(X, y, X_test, y_test, model, batch_size, num_epochs, criterion, optimizer):
    loss_curve = []
    accuracy_curve = []
    
    # create train test split
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    dataloader = DataLoader(Dataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    
    for epoch in range(num_epochs): # loop over each epoch
        epoch_loss = 0
        total = 0
        correct = 0
        
        for data in dataloader: # loop over each batch
            embeddings, labels = data
            outputs = model(embeddings.float())
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # save train performance
            epoch_loss+=loss.item()
            preds = 1*(outputs>0.5)
            correct += (preds.reshape(-1,1) == labels).sum().item()
            total += float(len(labels))
        
        # train loss and accuracy
        epoch_loss = epoch_loss / len(dataloader)
        epoch_accuracy = correct / total
        loss_curve.append(epoch_loss)
        accuracy_curve.append(epoch_accuracy)
        
        # validation accuracy 
        X_val_t = torch.tensor(X_val, dtype=torch.float) #instance variables
        y_val_t = torch.tensor(y_val.reshape(-1,1), dtype=torch.float)
        val_outputs = model(X_val_t.float())
        val_preds = 1*(val_outputs>0.5)
        val_accuracy = (val_preds.reshape(-1,1) == y_val_t).sum().item()/len(y_val)
        
        # test accuracy 
        X_test_t = torch.tensor(X_test, dtype=torch.float) #instance variables
        y_test_t = torch.tensor(y_test.reshape(-1,1), dtype=torch.float)
        test_outputs = model(X_test_t.float())
        test_preds = 1*(test_outputs>0.5)
        test_accuracy = (test_preds.reshape(-1,1) == y_test_t).sum().item()/len(y_test)
        
        print('epoch [{}/{}], mean epoch loss:{:.4f}, train acc:{:.4f}, val acc:{:.4f}, test acc:{:.4f}'.format(
            epoch + 1, num_epochs, epoch_loss, epoch_accuracy, val_accuracy, test_accuracy))
        
    return model, loss_curve 

In [637]:
# initiate other inputs
batch_size = 100
num_epochs = 20
learning_rate = 1e-3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.BCELoss() 

model = NeuralNetwork(X_train_text.shape[1])
model_title_text = NeuralNetwork(X_train_title_text.shape[1])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
optimizer_title_text = optim.Adam(model_title_text.parameters(), lr=learning_rate)

In [638]:
# train models - only using text
nn_text, loss_curve_text = train(X_train_text, y_train_text, X_test_text, y_test_text, 
                                 model, batch_size, num_epochs, criterion, optimizer)

epoch [1/20], mean epoch loss:0.1205, train acc:0.9600, val acc:0.9776, test acc:0.9738
epoch [2/20], mean epoch loss:0.0736, train acc:0.9753, val acc:0.9843, test acc:0.9802
epoch [3/20], mean epoch loss:0.0602, train acc:0.9787, val acc:0.9826, test acc:0.9810
epoch [4/20], mean epoch loss:0.0513, train acc:0.9820, val acc:0.9848, test acc:0.9839
epoch [5/20], mean epoch loss:0.0494, train acc:0.9818, val acc:0.9675, test acc:0.9679
epoch [6/20], mean epoch loss:0.0435, train acc:0.9845, val acc:0.9870, test acc:0.9853
epoch [7/20], mean epoch loss:0.0396, train acc:0.9855, val acc:0.9879, test acc:0.9851
epoch [8/20], mean epoch loss:0.0399, train acc:0.9854, val acc:0.9874, test acc:0.9845
epoch [9/20], mean epoch loss:0.0354, train acc:0.9878, val acc:0.9804, test acc:0.9779
epoch [10/20], mean epoch loss:0.0368, train acc:0.9864, val acc:0.9815, test acc:0.9808
epoch [11/20], mean epoch loss:0.0328, train acc:0.9885, val acc:0.9880, test acc:0.9862
epoch [12/20], mean epoch loss

In [639]:
# train models - using text and title
nn_title_text, loss_curve_title_text = train(X_train_title_text, y_train_title_text, X_test_title_text, y_test_title_text,
                                             model_title_text, batch_size, num_epochs, criterion, optimizer_title_text)

epoch [1/20], mean epoch loss:0.0886, train acc:0.9659, val acc:0.9884, test acc:0.9882
epoch [2/20], mean epoch loss:0.0379, train acc:0.9866, val acc:0.9884, test acc:0.9889
epoch [3/20], mean epoch loss:0.0337, train acc:0.9886, val acc:0.9907, test acc:0.9901
epoch [4/20], mean epoch loss:0.0260, train acc:0.9910, val acc:0.9926, test acc:0.9925
epoch [5/20], mean epoch loss:0.0303, train acc:0.9890, val acc:0.9826, test acc:0.9843
epoch [6/20], mean epoch loss:0.0253, train acc:0.9919, val acc:0.9916, test acc:0.9921
epoch [7/20], mean epoch loss:0.0210, train acc:0.9928, val acc:0.9917, test acc:0.9913
epoch [8/20], mean epoch loss:0.0181, train acc:0.9937, val acc:0.9934, test acc:0.9935
epoch [9/20], mean epoch loss:0.0182, train acc:0.9933, val acc:0.9953, test acc:0.9929
epoch [10/20], mean epoch loss:0.0174, train acc:0.9941, val acc:0.9938, test acc:0.9924
epoch [11/20], mean epoch loss:0.0203, train acc:0.9924, val acc:0.9922, test acc:0.9926
epoch [12/20], mean epoch loss

In [643]:
# train models - only using text
nn_text_p, loss_curve_text_p = train(X_train_text_p, y_train_text_p, X_test_text_p, y_test_text_p, 
                                 model, batch_size, num_epochs, criterion, optimizer)

epoch [1/20], mean epoch loss:0.0265, train acc:0.9910, val acc:0.9921, test acc:0.9923
epoch [2/20], mean epoch loss:0.0198, train acc:0.9925, val acc:0.9956, test acc:0.9932
epoch [3/20], mean epoch loss:0.0155, train acc:0.9943, val acc:0.9930, test acc:0.9936
epoch [4/20], mean epoch loss:0.0157, train acc:0.9940, val acc:0.9930, test acc:0.9946
epoch [5/20], mean epoch loss:0.0143, train acc:0.9952, val acc:0.9946, test acc:0.9938
epoch [6/20], mean epoch loss:0.0126, train acc:0.9962, val acc:0.9930, test acc:0.9938
epoch [7/20], mean epoch loss:0.0129, train acc:0.9954, val acc:0.9943, test acc:0.9932
epoch [8/20], mean epoch loss:0.0183, train acc:0.9929, val acc:0.9937, test acc:0.9934
epoch [9/20], mean epoch loss:0.0119, train acc:0.9954, val acc:0.9943, test acc:0.9942
epoch [10/20], mean epoch loss:0.0178, train acc:0.9933, val acc:0.9949, test acc:0.9937
epoch [11/20], mean epoch loss:0.0108, train acc:0.9964, val acc:0.9927, test acc:0.9928
epoch [12/20], mean epoch loss

In [None]:
# train models - using text and title
nn_title_text_p, loss_curve_title_text_p = train(X_train_title_text_p, y_train_title_text_p, X_test_title_text_p, y_test_title_text_p,
                                             model_title_text, batch_size, num_epochs, criterion, optimizer_title_text)

## 6. LIME on Logistic Regression Model