In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Read the news-article CSV into a DataFrame, then show its (rows, columns) size.
df = pd.read_csv(filepath_or_buffer="/fake news detector/news.csv")
df.shape

(6335, 4)

In [3]:
# Dimensions of the loaded DataFrame as (rows, columns).
# NOTE(review): the previous cell already ends with this same check.
df.shape

(6335, 4)

In [4]:
# Preview the first rows to see which columns the dataset provides.
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
# Pull out the target column (values such as FAKE / REAL) and preview its first rows.
labels = df["label"]
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [6]:
# Hold out 20% of the articles for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    labels,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Size of the training portion of the text column after the split.
X_train.shape

(5068,)

In [8]:
# Build a TF-IDF vectorizer.
# stop_words='english' drops common English function words using scikit-learn's built-in list.
# max_df=0.7 ignores any term that occurs in more than 70% of the documents.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words="english")
# fit_transform learns the vocabulary and the inverse-document-frequency (IDF) weights
# from the training texts and returns their TF-IDF matrix; no mean or standard
# deviation is computed (that would be a StandardScaler, not a TfidfVectorizer).
X_train = vectorizer.fit_transform(X_train)
# transform reuses the vocabulary and IDF weights learned from the training set, so
# the test set is encoded in the same feature space without leaking information from it.
# New, unseen data (even a single document) would be encoded the same way.
X_test = vectorizer.transform(X_test)
# NOTE: X_train / X_test are rebound from raw text to sparse matrices here, so this
# cell should not be re-run on its own; re-run the split cell first.

In [9]:
# Each row is one training text and each column is one term of the learned vocabulary.
# An entry is the TF-IDF weight of that term in that text (zero if the term is absent),
# not a yes/no flag for whether the word occurs.
X_train.shape

(5068, 61359)

In [10]:
# Number of terms in the fitted vocabulary; expected to equal the column count of X_train.
vectorizer.get_feature_names_out().shape

(61359,)

In [11]:
# Train a Passive-Aggressive classifier, an online linear model: it looks at one
# sample at a time, leaves the weights alone when the sample is classified with
# enough margin ("passive") and corrects them otherwise ("aggressive").
# That per-sample update is why it suits data too large to train on in one go.
# max_iter is the maximum number of passes over the training data (aka epochs);
# e.g. with 200 samples, one epoch visits all 200 and 50 epochs visit them 50 times.
# The training data is shuffled between epochs by default, so random_state is fixed
# to make the fitted model, and the metrics computed from it, repeatable across runs.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(X_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [18]:
# Predict labels for the held-out articles and report the share predicted correctly.
y_pred = pac.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.53%


In [22]:
# Confusion matrix of actual (y_test) versus predicted (y_pred) labels.
# With labels=['FAKE', 'REAL'], rows are the actual class and columns the predicted
# class, both in that order. Treating FAKE as the positive class:
#   [[TP, FN],
#    [FP, TN]]
# For the recorded output below: TP = 590, FN = 38, FP = 44, TN = 595.
# Accuracy = (TP + TN) / (TP + FN + FP + TN) = 1185 / 1267 ≈ 93.53%.
# TP means the article is predicted FAKE and is actually FAKE.
confusion_matrix(y_test, y_pred, labels=["FAKE", "REAL"])

array([[590,  38],
       [ 44, 595]])