# Import Data and subset Data

## 1. Import

In [88]:
import numpy as np
import pandas as pd
import sklearn

In [108]:
news = pd.read_csv("../data_intermed/news_bert.csv")
news = news[news['text']!=" "] # remove empty entries

In [148]:
distilBert_title = np.load('../data_intermed/distilBert_title.npy')
distilBert_text = np.load('../data_intermed/distilBert_text.npy')

In [109]:
def createLabels(data, col_name):
    labels = data[col_name].values
    y = np.zeros(labels.shape)
    y[labels == 'fake'] = 1
    return y

In [149]:
distilBert_title = distilBert_title[news.index]
distilBert_text = distilBert_text[news.index]
y = createLabels(news, 'label')

print(distilBert_text.shape)
print(y.sum(), len(y))

(44271, 768)
22855.0 44271


## 2. Subset Data to only politics

In [150]:
news.subject.unique()

array(['politicsNews', 'worldnews', 'News', 'politics', 'Government News',
       'left-news', 'US_News', 'Middle-east'], dtype=object)

In [151]:
news_p = news[news['subject'].isin(['politicsNews','politics','Government News','left-news'])]

In [153]:
news_p.size
news_p.groupby('label').count().title

label
fake    12244
true    11271
Name: title, dtype: int64

In [154]:
distilBert_title_p = distilBert_title[news_p.index]
distilBert_text_p = distilBert_text[news_p.index]
y_p = createLabels(news_p, 'label')

print(distilBert_text_p.shape)
print(y_p.sum(), len(y))

(23515, 768)
12244.0 44271


## 3. Combine title and text

In [155]:
distilBert_mean = (distilBert_title + distilBert_text)/2
distilBert_mean_p = (distilBert_title_p + distilBert_text_p)/2

In [156]:
distilBert_title_text = np.concatenate((distilBert_title, distilBert_text),1)
distilBert_title_text_p = np.concatenate((distilBert_title_p, distilBert_text_p),1)

# Models

In [196]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
import json

In [223]:
def modelEval(X, y, test_perc, model, result_output_name, path = "../model_results/"):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_perc, random_state=42)
    clf = model.fit(X_train, y_train)
    
    # predict
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    assert(y_pred_train.shape == y_train.shape)
    assert(y_pred_test.shape == y_test.shape)
    
    # save results in dictionary
    model_dict = {}
    model_dict['train accuracy'] = clf.score(X_train, y_train)
    model_dict['test accuracy'] = clf.score(X_test, y_test)
    model_dict['train f-score'] = f1_score(y_train, y_pred_train)
    model_dict['test f-score'] = f1_score(y_test, y_pred_test)
    
    # output the dictionary
    with open(path + result_output_name + ".json", "w") as outfile:  
        json.dump(model_dict, outfile) 
    
    # print
    print('train accuracy:', model_dict['train accuracy'])
    print('test accuracy:', model_dict['test accuracy'])
    print('train f-score:', model_dict['train f-score'])
    print('test f-score:', model_dict['test f-score'])
    
    return clf, model_dict

## 1. Logistic Regression

In [136]:
from sklearn.linear_model import LogisticRegression

In [184]:
lr = LogisticRegression(random_state=0, max_iter = 2000)

In [222]:
clf_lr_title = modelEval(distilBert_title, y, 0.33, lr, "results_lr_title")

train accuracy: 0.9698931256532146
test accuracy: 0.9671457905544147
train f-score: 0.970712669312256
test f-score: 0.9680468645985887


In [224]:
clf_lr_title_p = modelEval(distilBert_title_p, y_p, 0.33, lr, "results_lr_title_p")

train accuracy: 0.9794985718819422
test accuracy: 0.9775773195876288
train f-score: 0.9802022678516702
test f-score: 0.9784119106699752


In [225]:
clf_lr_text = modelEval(distilBert_text, y, 0.33, lr, "results_lr_text")

train accuracy: 0.9925154242945282
test accuracy: 0.9888432580424367
train f-score: 0.9927417772837246
test f-score: 0.9891658358258558


In [226]:
clf_lr_text_p = modelEval(distilBert_text_p, y_p, 0.33, lr, "results_lr_text_p")

train accuracy: 0.9964455728340209
test accuracy: 0.993298969072165
train f-score: 0.9965841161400513
test f-score: 0.993570722057369


In [228]:
clf_lr_title_text = modelEval(distilBert_title_text, y, 0.33, lr, "results_lr_title_text")

train accuracy: 0.998550284885877
test accuracy: 0.9958932238193019
train f-score: 0.998595918367347
test f-score: 0.9960180514998673


In [227]:
clf_lr_title_text_p = modelEval(distilBert_title_text_p, y_p, 0.33, lr, "results_lr_title_text_p")

train accuracy: 0.9997461123452872
test accuracy: 0.9978092783505155
train f-score: 0.9997560380580629
test f-score: 0.9978989000123594


## 2. Support Vector Machine

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [169]:
svc = SVC()

In [229]:
clf_svm_title = modelEval(distilBert_title, y, 0.33, svc, "results_svm_title")

train accuracy: 0.9607902633087219
test accuracy: 0.9605749486652977
train f-score: 0.9617169755423154
test f-score: 0.961456102783726


In [233]:
clf_svm_title_p = modelEval(distilBert_title_p, y_p, 0.33, svc, "results_svm_title_p")

train accuracy: 0.967185020628372
test accuracy: 0.9686855670103093
train f-score: 0.96811987420608
test f-score: 0.9696212026503312


In [234]:
clf_svm_text = modelEval(distilBert_text, y, 0.33, svc, "results_svm_text")

train accuracy: 0.9856377060786892
test accuracy: 0.9835728952772074
train f-score: 0.9860428543345785
test f-score: 0.9840234322992943


In [235]:
clf_svm_text_p = modelEval(distilBert_text_p, y_p, 0.33, svc, "results_svm_text_p")

train accuracy: 0.9871152015233259
test accuracy: 0.9867268041237114
train f-score: 0.9876045673810833
test f-score: 0.9872666584250217


In [236]:
clf_svm_title_text_p = modelEval(distilBert_title_text_p, y_p, 0.33, svc, "results_svm_title_text")

train accuracy: 0.9941605839416059
test accuracy: 0.9943298969072165
train f-score: 0.9943847656249999
test f-score: 0.9945558030190547


In [237]:
clf_svm_title_text = modelEval(distilBert_title_text, y, 0.33, svc, "results_svm_title_text_p")

train accuracy: 0.9926502815144466
test accuracy: 0.9921971252566735
train f-score: 0.99287441982088
test f-score: 0.9924302788844621


## 3. Linear Support Vector Machine

In [238]:
from sklearn.svm import LinearSVC #l2 penalty

In [266]:
lsvc = LinearSVC(max_iter = 10000)

In [267]:
clf_lsvm_title = modelEval(distilBert_title, y, 0.33, lsvc, "results_lsvm_title")

train accuracy: 0.9732982704561546
test accuracy: 0.9679671457905544
train f-score: 0.9740566037735848
test f-score: 0.9688581314878894


In [268]:
clf_lsvm_title_p = modelEval(distilBert_title_p, y_p, 0.33, lsvc, "results_lsvm_title_p")

train accuracy: 0.9876864487464297
test accuracy: 0.9774484536082474
train f-score: 0.9881360078277887
test f-score: 0.9784030605948414


In [269]:
clf_lsvm_text = modelEval(distilBert_text, y, 0.33, lsvc, "results_lsvm_text")

train accuracy: 0.9976062843464482
test accuracy: 0.9909650924024641
train f-score: 0.9976811783533099
test f-score: 0.9912443618997081


In [272]:
clf_lsvm_text_p = modelEval(distilBert_text_p, y_p, 0.33, lsvc, "results_lsvm_text_p")

train accuracy: 1.0
test accuracy: 0.9945876288659794
train f-score: 1.0
test f-score: 0.9948122529644269


In [273]:
clf_lsvm_title_text_p = modelEval(distilBert_title_text_p, y_p, 0.33, lsvc, "results_lsvm_title_text")

train accuracy: 1.0
test accuracy: 0.997680412371134
train f-score: 1.0
test f-score: 0.9977755808205635


In [274]:
clf_lsvm_title_text = modelEval(distilBert_title_text, y, 0.33, lsvc, "results_lsvm_title_text_p")

train accuracy: 1.0
test accuracy: 0.9960301163586585
train f-score: 1.0
test f-score: 0.9961553758451545


## 4. Naive Bayes

In [120]:
from sklearn.naive_bayes import GaussianNB

In [174]:
gnb = GaussianNB()

In [246]:
clf_nb_title = modelEval(distilBert_title, y, 0.33, gnb, "results_nb_title")

train accuracy: 0.8516570580897475
test accuracy: 0.8499657768651608
train f-score: 0.8537914534458695
test f-score: 0.8528859060402684


In [247]:
clf_nb_title_p = modelEval(distilBert_title_p, y_p, 0.33, gnb, "results_nb_title_p")

train accuracy: 0.8703268803554427
test accuracy: 0.8743556701030928
train f-score: 0.8731607375675172
test f-score: 0.8778654641112363


In [248]:
clf_nb_text = modelEval(distilBert_text, y, 0.33, gnb, "results_nb_text")

train accuracy: 0.9221199554971174
test accuracy: 0.9243668720054757
train f-score: 0.924809582709459
test f-score: 0.9270193514298922


In [249]:
clf_nb_text_p = modelEval(distilBert_text_p, y_p, 0.33, gnb, "results_nb_text_p")

train accuracy: 0.9178673437004126
test accuracy: 0.9252577319587629
train f-score: 0.9189426208970183
test f-score: 0.9268046441191318


In [250]:
clf_nb_title_text = modelEval(distilBert_title_text, y, 0.33, gnb, "results_nb_title_text")

train accuracy: 0.9204005259431577
test accuracy: 0.920123203285421
train f-score: 0.9226434258379477
test f-score: 0.9223191106969314


In [251]:
clf_nb_title_text_p = modelEval(distilBert_title_text_p, y_p, 0.33, gnb, "results_nb_title_text_p")

train accuracy: 0.9294827039035227
test accuracy: 0.9371134020618557
train f-score: 0.9311946491608348
test f-score: 0.938877755511022


## 5. Neural Network

## 6. Ensemble
https://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_probas.html

## 7. Probability Calibration