# Import Data and subset Data

## 1. Import

In [88]:
import numpy as np
import pandas as pd
import sklearn

In [108]:
news = pd.read_csv("../data_intermed/news_bert.csv")
news = news[news['text']!=" "] # remove empty entries

In [148]:
distilBert_title = np.load('../data_intermed/distilBert_title.npy')
distilBert_text = np.load('../data_intermed/distilBert_text.npy')

In [109]:
def createLabels(data, col_name):
    labels = data[col_name].values
    y = np.zeros(labels.shape)
    y[labels == 'fake'] = 1
    return y

In [149]:
distilBert_title = distilBert_title[news.index]
distilBert_text = distilBert_text[news.index]
y = createLabels(news, 'label')

print(distilBert_text.shape)
print(y.sum(), len(y))

(44271, 768)
22855.0 44271


## 2. Subset Data to only politics

In [150]:
news.subject.unique()

array(['politicsNews', 'worldnews', 'News', 'politics', 'Government News',
       'left-news', 'US_News', 'Middle-east'], dtype=object)

In [151]:
news_p = news[news['subject'].isin(['politicsNews','politics','Government News','left-news'])]

In [153]:
news_p.size
news_p.groupby('label').count().title

label
fake    12244
true    11271
Name: title, dtype: int64

In [154]:
distilBert_title_p = distilBert_title[news_p.index]
distilBert_text_p = distilBert_text[news_p.index]
y_p = createLabels(news_p, 'label')

print(distilBert_text_p.shape)
print(y_p.sum(), len(y))

(23515, 768)
12244.0 44271


## 3. Combine title and text

In [155]:
distilBert_mean = (distilBert_title + distilBert_text)/2
distilBert_mean_p = (distilBert_title_p + distilBert_text_p)/2

In [156]:
distilBert_title_text = np.concatenate((distilBert_title, distilBert_text),1)
distilBert_title_text_p = np.concatenate((distilBert_title_p, distilBert_text_p),1)

# Logistic Regression

In [136]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [116]:
def lrModelEval(X, y, test_perc):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_perc, random_state=42)
    clf = LogisticRegression(random_state=0, max_iter = 2000).fit(X_train, y_train)
    
    # predict
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    assert(y_pred_train.shape == y_train.shape)
    assert(y_pred_test.shape == y_test.shape)
    
    # evaluate
    print('training accuracy:', clf.score(X_train, y_train))
    print('test accuracy:', clf.score(X_test, y_test))
    print('train f-score:', f1_score(y_train, y_pred_train))
    print('test f-score:', f1_score(y_test, y_pred_test))
    
    return clf

In [117]:
clf_title = lrModelEval(distilBert_title, y, 0.33)

training accuracy: 0.9698931256532146
test accuracy: 0.9671457905544147
train f-score: 0.970712669312256
test f-score: 0.9680468645985887


In [118]:
clf_title_p = lrModelEval(distilBert_title_p, y_p, 0.33)

training accuracy: 0.9794985718819422
test accuracy: 0.9775773195876288
train f-score: 0.9802022678516702
test f-score: 0.9784119106699752


In [133]:
clf_text = lrModelEval(distilBert_text, y, 0.33)

training accuracy: 0.9925154242945282
test accuracy: 0.9888432580424367
train f-score: 0.9927417772837246
test f-score: 0.9891658358258558


In [132]:
clf_text_p = lrModelEval(distilBert_text_p, y_p, 0.33)

training accuracy: 0.9964455728340209
test accuracy: 0.993298969072165
train f-score: 0.9965841161400513
test f-score: 0.993570722057369


In [157]:
clf_title_text = lrModelEval(distilBert_title_text, y, 0.33)

training accuracy: 0.998550284885877
test accuracy: 0.9958932238193019
train f-score: 0.998595918367347
test f-score: 0.9960180514998673


In [158]:
clf_title_text_p = lrModelEval(distilBert_title_text_p, y_p, 0.33)

training accuracy: 0.9997461123452872
test accuracy: 0.9978092783505155
train f-score: 0.9997560380580629
test f-score: 0.9978989000123594


# Support Vector Machine

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [126]:
def svcModelEval(X, y, test_perc):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_perc, random_state=42)
    clf = SVC().fit(X_train, y_train)
    
    # predict
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    assert(y_pred_train.shape == y_train.shape)
    assert(y_pred_test.shape == y_test.shape)
    
    # evaluate
    print('training accuracy:', clf.score(X_train, y_train))
    print('test accuracy:', clf.score(X_test, y_test))
    print('train f-score:', f1_score(y_train, y_pred_train))
    print('test f-score:', f1_score(y_test, y_pred_test))
    
    return clf

In [121]:
clf_svm_title = svcModelEval(distilBert_title, y, 0.33)

training accuracy: 0.9607902633087219
test accuracy: 0.9605749486652977
train f-score: 0.9617169755423154
test f-score: 0.961456102783726


In [122]:
clf_svm_title = svcModelEval(distilBert_title_p, y_p, 0.33)

training accuracy: 0.967185020628372
test accuracy: 0.9686855670103093
train f-score: 0.96811987420608
test f-score: 0.9696212026503312


In [123]:
clf_svm_text = svcModelEval(distilBert_text, y, 0.33)

training accuracy: 0.9856377060786892
test accuracy: 0.9835728952772074
train f-score: 0.9860428543345785
test f-score: 0.9840234322992943


In [124]:
clf_svm_text_p = svcModelEval(distilBert_text_p, y_p, 0.33)

training accuracy: 0.9871152015233259
test accuracy: 0.9867268041237114
train f-score: 0.9876045673810833
test f-score: 0.9872666584250217


# Naive Bayes

In [120]:
from sklearn.naive_bayes import GaussianNB

In [127]:
def nbModelEval(X, y, test_perc):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_perc, random_state=42)
    clf = GaussianNB().fit(X_train, y_train)
    
    # predict
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    assert(y_pred_train.shape == y_train.shape)
    assert(y_pred_test.shape == y_test.shape)
    
    # evaluate
    print('training accuracy:', clf.score(X_train, y_train))
    print('test accuracy:', clf.score(X_test, y_test))
    print('train f-score:', f1_score(y_train, y_pred_train))
    print('test f-score:', f1_score(y_test, y_pred_test))
    
    return clf

In [128]:
clf_nb_title = nbModelEval(distilBert_title, y, 0.33)

training accuracy: 0.8516570580897475
test accuracy: 0.8499657768651608
train f-score: 0.8537914534458695
test f-score: 0.8528859060402684


In [129]:
clf_nb_title_p = nbModelEval(distilBert_title_p, y_p, 0.33)

training accuracy: 0.8703268803554427
test accuracy: 0.8743556701030928
train f-score: 0.8731607375675172
test f-score: 0.8778654641112363


In [130]:
clf_nb_text = nbModelEval(distilBert_text, y, 0.33)

training accuracy: 0.9221199554971174
test accuracy: 0.9243668720054757
train f-score: 0.924809582709459
test f-score: 0.9270193514298922


In [131]:
clf_nb_text_p = nbModelEval(distilBert_text_p, y_p, 0.33)

training accuracy: 0.9178673437004126
test accuracy: 0.9252577319587629
train f-score: 0.9189426208970183
test f-score: 0.9268046441191318
