<a href="https://colab.research.google.com/github/tdraths/fake_news_detection/blob/main/fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import itertools

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Load the news dataset from the working directory. The first CSV column is
# used as the row index rather than kept as a data column.
df = pd.read_csv(
    'news.csv',
    index_col=0,
    engine='python',
)

# Preview the first few rows.
df.head()

Unnamed: 0,title,text,label
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(6335, 3)

In [None]:
# Column labels of the frame (the first CSV column was consumed as the index on load).
df.columns

Index(['title', 'text', 'label'], dtype='object')

In [None]:
# Per-column dtypes and non-null counts; a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6335 entries, 8476 to 4330
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   6335 non-null   object
 1   text    6335 non-null   object
 2   label   6335 non-null   object
dtypes: object(3)
memory usage: 198.0+ KB


In [None]:
# Class balance: number of articles per label value.
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [None]:
# Features are the article bodies; targets are the corresponding labels.
X, y = df['text'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# TF-IDF features: drop English stop words and any term that appears in more
# than 80% of the documents.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    stop_words='english',
)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse that fitted state to transform the held-out split.
X_train_vector = tfidf_vectorizer.fit_transform(X_train)
X_test_vector = tfidf_vectorizer.transform(X_test)

In [None]:
# Train a passive-aggressive linear classifier on the TF-IDF features.
# random_state pins the per-epoch shuffling of the training samples so the
# fitted model, and therefore the accuracy printed by this cell, is the same
# on every fresh run (the train/test split is already seeded the same way).
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(X_train_vector, y_train)

# Evaluate on the held-out split.
y_pred = pac.predict(X_test_vector)
score = accuracy_score(y_test, y_pred)

print(f'Accuracy: {round(score*100, 2)}%')

Accuracy: 93.76%


In [None]:
# Confusion matrix on the held-out split. Rows are true labels and columns are
# predicted labels, both in the order given by `labels` (FAKE, then REAL).
confusion_matrix(
    y_test,
    y_pred,
    labels=['FAKE', 'REAL'],
)

array([[589,  39],
       [ 40, 599]])

In [None]:
import pickle

filename = 'finalized_model_fake_news_class.sav'

# Persist the trained classifier to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises part-way through.
# NOTE(review): only `pac` is saved here. It was trained on the output of
# `tfidf_vectorizer`, so scoring new text from this file alone is not possible
# without the fitted vectorizer; consider persisting it as well.
with open(filename, 'wb') as model_file:
    pickle.dump(pac, model_file)