# Import Data and subset Data

## 1. Import

In [1]:
import numpy as np
import pandas as pd
import sklearn
import datetime

In [7]:
news = pd.read_csv("../data_intermed/news_bert.csv")
news = news[news['text']!=" "] # remove empty entries

In [440]:
distilBert_title = np.load('../data_intermed/distilBert_title.npy')
distilBert_text = np.load('../data_intermed/distilBert_text.npy')

In [6]:
def createLabels(data, col_name):
    labels = data[col_name].values
    y = np.zeros(labels.shape)
    y[labels == 'fake'] = 1
    return y

In [8]:
distilBert_title = distilBert_title[news.index]
distilBert_text = distilBert_text[news.index]
y = createLabels(news, 'label')

print(distilBert_text.shape)
# print(y.sum(), len(y))

(44271, 768)


## 2. Subset Data to only politics

In [9]:
news.subject.unique()

array(['politicsNews', 'worldnews', 'News', 'politics', 'Government News',
       'left-news', 'US_News', 'Middle-east'], dtype=object)

In [10]:
news_p = news[news['subject'].isin(['politicsNews','politics','Government News','left-news'])]

In [11]:
news_p.size
news_p.groupby('label').count().title

label
fake    12244
true    11271
Name: title, dtype: int64

In [12]:
distilBert_title_p = distilBert_title[news_p.index]
distilBert_text_p = distilBert_text[news_p.index]
y_p = createLabels(news_p, 'label')

print(distilBert_text_p.shape)
# print(y_p.sum(), len(y))

(23515, 768)


## 3. Combine title and text

In [13]:
distilBert_title_text = np.concatenate((distilBert_title, distilBert_text),1)
distilBert_title_text_p = np.concatenate((distilBert_title_p, distilBert_text_p),1)

In [16]:
distilBert_title_text.shape

(44271, 1536)

# Models

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import json

In [309]:
def modelEval(X, y, test_perc, model, result_output_name, path = "../model_results/", printResults = True):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_perc, random_state=42)
    clf = model.fit(X_train, y_train)
    
    # predict
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    assert(y_pred_train.shape == y_train.shape)
    assert(y_pred_test.shape == y_test.shape)
    
    # save results in dictionary
    model_dict = {}
    model_dict['train accuracy'] = clf.score(X_train, y_train)
    model_dict['test accuracy'] = clf.score(X_test, y_test)
    model_dict['train f-score'] = f1_score(y_train, y_pred_train)
    model_dict['test f-score'] = f1_score(y_test, y_pred_test)
    
    # output the dictionary
    with open(path + result_output_name + ".json", "w") as outfile:  
        json.dump(model_dict, outfile) 
    
    # print
    if printResults == True:
        print('train accuracy:', model_dict['train accuracy'])
        print('test accuracy:', model_dict['test accuracy'])
        print('train f-score:', model_dict['train f-score'])
        print('test f-score:', model_dict['test f-score'])
    
    return clf, model_dict

## 1. Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

In [28]:
# cross validation
def crossValidationLR(X, y, hyperparameters):
    scores = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    for h in hyperparameters:
        model = LogisticRegression(random_state=0, max_iter=2000, C=h)
        scores.append(cross_val_score(estimator=model, X=X_train, y=y_train, cv=10).mean())
        
    return scores

In [44]:
# cross validation
scores_title_text = crossValidationLR(distilBert_title_text, y, range(1,10))
scores_title_text_p = crossValidationLR(distilBert_title_text_p, y_p, range(1,10))

In [35]:
scores_title_text, scores_title_text_p

([0.9953136786058193, 0.9959879760757863],
 [0.9974612440576907, 0.9977785432277816])

In [184]:
# create the lr that is the best
lr = LogisticRegression(random_state=0, max_iter = 2000)

In [222]:
clf_lr_title = modelEval(distilBert_title, y, 0.33, lr, "results_lr_title")

train accuracy: 0.9698931256532146
test accuracy: 0.9671457905544147
train f-score: 0.970712669312256
test f-score: 0.9680468645985887


In [224]:
clf_lr_title_p = modelEval(distilBert_title_p, y_p, 0.33, lr, "results_lr_title_p")

train accuracy: 0.9794985718819422
test accuracy: 0.9775773195876288
train f-score: 0.9802022678516702
test f-score: 0.9784119106699752


In [225]:
clf_lr_text = modelEval(distilBert_text, y, 0.33, lr, "results_lr_text")

train accuracy: 0.9925154242945282
test accuracy: 0.9888432580424367
train f-score: 0.9927417772837246
test f-score: 0.9891658358258558


In [226]:
clf_lr_text_p = modelEval(distilBert_text_p, y_p, 0.33, lr, "results_lr_text_p")

train accuracy: 0.9964455728340209
test accuracy: 0.993298969072165
train f-score: 0.9965841161400513
test f-score: 0.993570722057369


In [228]:
clf_lr_title_text = modelEval(distilBert_title_text, y, 0.33, lr, "results_lr_title_text")

train accuracy: 0.998550284885877
test accuracy: 0.9958932238193019
train f-score: 0.998595918367347
test f-score: 0.9960180514998673


In [227]:
clf_lr_title_text_p = modelEval(distilBert_title_text_p, y_p, 0.33, lr, "results_lr_title_text_p")

train accuracy: 0.9997461123452872
test accuracy: 0.9978092783505155
train f-score: 0.9997560380580629
test f-score: 0.9978989000123594


## 2. Support Vector Machine

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [169]:
svc = SVC()

In [229]:
clf_svm_title = modelEval(distilBert_title, y, 0.33, svc, "results_svm_title")

train accuracy: 0.9607902633087219
test accuracy: 0.9605749486652977
train f-score: 0.9617169755423154
test f-score: 0.961456102783726


In [233]:
clf_svm_title_p = modelEval(distilBert_title_p, y_p, 0.33, svc, "results_svm_title_p")

train accuracy: 0.967185020628372
test accuracy: 0.9686855670103093
train f-score: 0.96811987420608
test f-score: 0.9696212026503312


In [234]:
clf_svm_text = modelEval(distilBert_text, y, 0.33, svc, "results_svm_text")

train accuracy: 0.9856377060786892
test accuracy: 0.9835728952772074
train f-score: 0.9860428543345785
test f-score: 0.9840234322992943


In [235]:
clf_svm_text_p = modelEval(distilBert_text_p, y_p, 0.33, svc, "results_svm_text_p")

train accuracy: 0.9871152015233259
test accuracy: 0.9867268041237114
train f-score: 0.9876045673810833
test f-score: 0.9872666584250217


In [236]:
clf_svm_title_text_p = modelEval(distilBert_title_text_p, y_p, 0.33, svc, "results_svm_title_text")

train accuracy: 0.9941605839416059
test accuracy: 0.9943298969072165
train f-score: 0.9943847656249999
test f-score: 0.9945558030190547


In [237]:
clf_svm_title_text = modelEval(distilBert_title_text, y, 0.33, svc, "results_svm_title_text_p")

train accuracy: 0.9926502815144466
test accuracy: 0.9921971252566735
train f-score: 0.99287441982088
test f-score: 0.9924302788844621


## 3. Linear Support Vector Machine

In [299]:
from sklearn.svm import LinearSVC #l2 penalty

In [303]:
# cross validation
scores = []
X_train, X_test, y_train, y_test = train_test_split(distilBert_title_text_p, y_p, test_size=0.33, random_state=42)
for c in range(1,5):
    lsvmCV = LinearSVC(max_iter = 20000, C=c)
    scores.append(cross_val_score(estimator=lsvmCV, X=X_train, y=y_train, cv=10).mean())

In [304]:
scores #choose c=2

[0.9979054467810812,
 0.9979689791314156,
 0.9979689791314156,
 0.9979689791314156]

In [315]:
lsvc = LinearSVC(max_iter = 10000)
lsvc2 = LinearSVC(max_iter = 20000, C=2)

In [267]:
clf_lsvm_title = modelEval(distilBert_title, y, 0.33, lsvc, "results_lsvm_title")

train accuracy: 0.9732982704561546
test accuracy: 0.9679671457905544
train f-score: 0.9740566037735848
test f-score: 0.9688581314878894


In [268]:
clf_lsvm_title_p = modelEval(distilBert_title_p, y_p, 0.33, lsvc, "results_lsvm_title_p")

train accuracy: 0.9876864487464297
test accuracy: 0.9774484536082474
train f-score: 0.9881360078277887
test f-score: 0.9784030605948414


In [269]:
clf_lsvm_text = modelEval(distilBert_text, y, 0.33, lsvc, "results_lsvm_text")

train accuracy: 0.9976062843464482
test accuracy: 0.9909650924024641
train f-score: 0.9976811783533099
test f-score: 0.9912443618997081


In [272]:
clf_lsvm_text_p = modelEval(distilBert_text_p, y_p, 0.33, lsvc, "results_lsvm_text_p")

train accuracy: 1.0
test accuracy: 0.9945876288659794
train f-score: 1.0
test f-score: 0.9948122529644269


In [275]:
clf_lsvm_title_text = modelEval(distilBert_title_text, y, 0.33, lsvc, "results_lsvm_title_text")

train accuracy: 1.0
test accuracy: 0.9960301163586585
train f-score: 1.0
test f-score: 0.9961553758451545


In [317]:
clf_lsvm_title_text_p = modelEval(distilBert_title_text_p, y_p, 0.33, lsvc, "results_lsvm_title_text_p")

train accuracy: 1.0
test accuracy: 0.997680412371134
train f-score: 1.0
test f-score: 0.9977755808205635


In [318]:
clf_lsvm_title_text_p_c2 = modelEval(distilBert_title_text_p, y_p, 0.33, lsvc2, "results_lsvm_title_text_p_c2")

train accuracy: 1.0
test accuracy: 0.997680412371134
train f-score: 1.0
test f-score: 0.9977755808205635


## 4. Naive Bayes

In [120]:
from sklearn.naive_bayes import GaussianNB

In [174]:
gnb = GaussianNB()

In [246]:
clf_nb_title = modelEval(distilBert_title, y, 0.33, gnb, "results_nb_title")

train accuracy: 0.8516570580897475
test accuracy: 0.8499657768651608
train f-score: 0.8537914534458695
test f-score: 0.8528859060402684


In [247]:
clf_nb_title_p = modelEval(distilBert_title_p, y_p, 0.33, gnb, "results_nb_title_p")

train accuracy: 0.8703268803554427
test accuracy: 0.8743556701030928
train f-score: 0.8731607375675172
test f-score: 0.8778654641112363


In [248]:
clf_nb_text = modelEval(distilBert_text, y, 0.33, gnb, "results_nb_text")

train accuracy: 0.9221199554971174
test accuracy: 0.9243668720054757
train f-score: 0.924809582709459
test f-score: 0.9270193514298922


In [249]:
clf_nb_text_p = modelEval(distilBert_text_p, y_p, 0.33, gnb, "results_nb_text_p")

train accuracy: 0.9178673437004126
test accuracy: 0.9252577319587629
train f-score: 0.9189426208970183
test f-score: 0.9268046441191318


In [250]:
clf_nb_title_text = modelEval(distilBert_title_text, y, 0.33, gnb, "results_nb_title_text")

train accuracy: 0.9204005259431577
test accuracy: 0.920123203285421
train f-score: 0.9226434258379477
test f-score: 0.9223191106969314


In [251]:
clf_nb_title_text_p = modelEval(distilBert_title_text_p, y_p, 0.33, gnb, "results_nb_title_text_p")

train accuracy: 0.9294827039035227
test accuracy: 0.9371134020618557
train f-score: 0.9311946491608348
test f-score: 0.938877755511022


## 5 Neural Network

In [162]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
import torch.optim as optim
from datetime import datetime
import glob, os

In [410]:
class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, features, labels): #labels will be numpy (n,)
        'Initialization'
        self.features = torch.tensor(features, dtype=torch.long) #instance variables
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.labels)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        X = self.features[index]
        y = self.labels[index]

        return X, y

In [441]:
# create model class
class NeuralNetwork(nn.Module):
    """
    Simplr neural network to intake bert
    """
    
    def __init__(self, p):
        super(NeuralNetwork, self).__init__()
        self.hidden = 700
        self.l1 = nn.Linear(p, self.hidden, bias=True)  
        self.l2 = nn.Linear(self.hidden, 2, bias=True) 
        
    def forward(self, x):    
        model = torch.nn.Sequential(
            self.l1, 
            nn.Tanh(), 
            nn.Dropout(), 
            self.l2, 
            nn.Tanh()
        )

        return model(x)

In [442]:
# create function for training (similar to hw8)
def train(num_epochs, dataloader, model, criterion, optimizer):
    loss_curve = []
    accuracy_curve = []
    
    for epoch in range(num_epochs): # loop over each epoch
        epoch_loss = 0
        total = 0
        correct = 0
        
        for data in dataloader: # loop over each batch
            embeddings, labels = data
            outputs = model(embeddings.float())
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # save performance
            epoch_loss+=loss.item()

            _, preds = outputs.max(dim=1)
            correct += preds.eq(labels.reshape(len(labels),)).sum() 
            total += float(len(labels))
        
        # calculate epoch stats and save epoch loss and accuracy
        epoch_loss = epoch_loss / len(dataloader)
        epoch_accuracy = correct / total
        loss_curve.append(epoch_loss)
        accuracy_curve.append(epoch_accuracy)
        print('epoch [{}/{}], mean epoch loss:{:.4f}, epoch accuracy:{:.4f}'.format(epoch + 1, num_epochs, epoch_loss, epoch_accuracy))

    return model, loss_curve 

In [443]:
# create datasets and dataloaders
batch_size = 100

dataloader_title = DataLoader(Dataset(distilBert_title, y), batch_size=batch_size, shuffle=True)
dataloader_text = DataLoader(Dataset(distilBert_text, y), batch_size=batch_size, shuffle=True)
dataloader_title_text = DataLoader(Dataset(distilBert_title_text, y), batch_size=batch_size, shuffle=True)

dataloader_title_p = DataLoader(Dataset(distilBert_title_p, y_p), batch_size=batch_size, shuffle=True)
dataloader_text_p = DataLoader(Dataset(distilBert_text_p, y_p), batch_size=batch_size, shuffle=True)
dataloader_title_text_p = DataLoader(Dataset(distilBert_title_text_p, y_p), batch_size=batch_size, shuffle=True)

In [444]:
# initiate other inputs
num_epochs = 10
learning_rate = 1e-3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuralNetwork(distilBert_text.shape[1])
model_title_text = NeuralNetwork(distilBert_title_text.shape[1])
criterion = nn.CrossEntropyLoss() 
# criterion = nn.BCEWithLogitsLoss() 
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# train models - only using text and text_title
trained_model, loss_curve = train(num_epochs,dataloader_text,model,criterion,optimizer)

epoch [1/10], mean epoch loss:0.5167, epoch accuracy:0.7509
epoch [2/10], mean epoch loss:0.4831, epoch accuracy:0.7745
epoch [3/10], mean epoch loss:0.4797, epoch accuracy:0.7744
epoch [4/10], mean epoch loss:0.4787, epoch accuracy:0.7751
epoch [5/10], mean epoch loss:0.4760, epoch accuracy:0.7754
epoch [6/10], mean epoch loss:0.4772, epoch accuracy:0.7770


## 6. Random Forest (Not working well, does not make sense to put in embeddings, discard)

In [323]:
from sklearn.ensemble import RandomForestClassifier

In [378]:
# cross validation
scoresRF = []
X_train, X_test, y_train, y_test = train_test_split(distilBert_title_text, y, test_size=0.33, random_state=42)
for c in range(0,20):
    rfCV = RandomForestClassifier()
    scoresRF.append(cross_val_score(estimator=rfCV, X=X_train, y=y_train, cv=10).mean())

In [367]:
# GridSearch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(distilBert_title_p, y_p, test_size=0.33, random_state=42)
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(distilBert_text_p, y_p, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(distilBert_title_text_p, y_p, test_size=0.33, random_state=42)

rf = RandomForestClassifier()
parameters = { 'max_depth':[15,16,17,18,19],'max_features':np.arange(8,10),
              'n_estimators':[500, 1000, 2000],'min_samples_leaf': [5, 8, 10]}
random_grid_title = GridSearchCV(rf, parameters, cv = 5)
random_grid_text = GridSearchCV(rf, parameters, cv = 5)
random_grid_text_title = GridSearchCV(rf, parameters, cv = 5)

In [None]:
rf_title_text = random_grid.fit(X_train, y_train)

In [None]:
rf_title = random_grid_title.fit(X_train_title, y_train_title)

In [None]:
rf_text = random_grid_text.fit(X_train_text, y_train_text)

In [None]:
rf_title_text.best_params_, rf_title.best_params_, rf_text.best_params_

In [374]:
rf = RandomForestClassifier(max_depth=17, max_features=9, min_samples_leaf=10, n_estimators=500)

In [359]:
clf_rf_title = modelEval(distilBert_title, y, 0.33, rf, "results_rf_title")

train accuracy: 0.9706011260577864
test accuracy: 0.9264887063655031
train f-score: 0.9713120147387815
test f-score: 0.9280160857908847


In [360]:
clf_rf_title_p = modelEval(distilBert_title_p, y_p, 0.33, rf, "results_rf_title_p")

train accuracy: 0.9729609647730879
test accuracy: 0.9418814432989691
train f-score: 0.9738457760314342
test f-score: 0.9438705662725576


In [361]:
clf_rf_text = modelEval(distilBert_text, y, 0.33, rf, "results_rf_text")

train accuracy: 0.9853679916388524
test accuracy: 0.9647501711156742
train f-score: 0.9857984293193718
test f-score: 0.9655587507523574


In [362]:
clf_rf_text_p = modelEval(distilBert_text_p, y_p, 0.33, rf, "results_rf_text_p")

train accuracy: 0.9831799428752777
test accuracy: 0.9641752577319588
train f-score: 0.9838404780779317
test f-score: 0.9655428854734756


In [363]:
clf_rf_title_text = modelEval(distilBert_title_text, y, 0.33, rf, "results_rf_title_text")

train accuracy: 0.9893125653214659
test accuracy: 0.9698151950718685
train f-score: 0.9896266239078504
test f-score: 0.9705862735943441


In [364]:
clf_rf_title_text_p = modelEval(distilBert_title_text_p, y_p, 0.33, rf, "results_rf_title_text_p")

train accuracy: 0.9890193589336719
test accuracy: 0.9698453608247423
train f-score: 0.9894157234628327
test f-score: 0.970953326713009


## 6. Ensemble
https://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_probas.html

## 7. Probability Calibration