# Load the precomputed embeddings (NumPy arrays)

In [10]:
import numpy as np
import pandas as pd

In [7]:
# Load the saved .npy arrays from ./intermed. Presumably these are DistilBERT
# embeddings of each article's body text and title, one row per row of
# news.csv -- TODO confirm against the notebook that produced them.
distilBert_text = np.load('./intermed/distilBert_text.npy')
distilBert_title = np.load('./intermed/distilBert_title.npy')

In [12]:
# Read the news table; its `label` column is what the target vector is built from.
news = pd.read_csv("./intermed/news.csv")

In [81]:
# Binary target: 1.0 where the label is exactly 'fake', 0.0 for every other value.
labels = news.label.values
y = np.where(labels == 'fake', 1.0, 0.0)
y

array([0., 0., 0., ..., 1., 1., 1.])

# Logistic Regression

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [133]:
def lrModelEval(X, y, test_perc, random_state=42, max_iter=1000):
    """Fit logistic regression on a random train/test split and print its scores.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels aligned with the rows of ``X``. ``f1_score`` is called with its
        default arguments, so the labels are expected to be binary.
    test_perc : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Seed for the train/test shuffle.
    max_iter : int, default 1000
        Iteration cap forwarded to ``LogisticRegression``.

    Returns
    -------
    LogisticRegression
        The classifier fitted on the training portion of the split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_perc, random_state=random_state
    )
    clf = LogisticRegression(random_state=0, max_iter=max_iter).fit(X_train, y_train)

    # Predict once per split and reuse the result for every metric below;
    # clf.score() would run predict() a second time on the same data.
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    assert y_pred_train.shape == y_train.shape
    assert y_pred_test.shape == y_test.shape

    # Accuracy is the fraction of exact label matches.
    print('training accuracy:', np.mean(y_pred_train == y_train))
    print('test accuracy:', np.mean(y_pred_test == y_test))
    print('train f-score:', f1_score(y_train, y_pred_train))
    print('test f-score:', f1_score(y_test, y_pred_test))

    return clf

## 1.1. using only title as feature

In [135]:
# Title embeddings as the only features; 0.33 is forwarded as the test-split size.
clf_title = lrModelEval(distilBert_title, y, 0.33)

training accuracy: 0.973039460124331
test accuracy: 0.9695619896065331
train f-score: 0.9741629233170856
test f-score: 0.970674296118083


## 1.2. using only text body as feature

In [136]:
# Body-text embeddings as the only features; same 0.33 test-split size as the title run.
clf_text = lrModelEval(distilBert_text, y, 0.33)

training accuracy: 0.992221003291114
test accuracy: 0.9904164135789971
train f-score: 0.992566709021601
test f-score: 0.9907911802853437
