In [30]:
# Import required libraries
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from pathlib import Path
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression


In [31]:
# Load the news dataset from its CSV file into a pandas DataFrame
news_csv_path = Path("data/news.csv")
news_df = pd.read_csv(news_csv_path)

# Preview the first five rows of the DataFrame
news_df.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [32]:
# Pull out the prediction target (label column) and the model input (text column)
labels = news_df["label"]
text = news_df["text"]

# Preview the first five entries of each series
display(labels.head(5), text.head(5))

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

0    Daniel Greenfield, a Shillman Journalism Fello...
1    Google Pinterest Digg Linkedin Reddit Stumbleu...
2    U.S. Secretary of State John F. Kerry said Mon...
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...
4    It's primary day in New York and front-runners...
Name: text, dtype: object

In [33]:
# Hold out 20% of the articles for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    text,
    labels,
    test_size=0.2,
    random_state=1,
)

In [34]:
# Build a TF-IDF vectorizer that drops English stop words and ignores
# terms appearing in more than 70% of the documents
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words="english")

# Fit the vectorizer on the training text and transform it, then transform
# the testing text with that same fitted vectorizer
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [35]:
# Initialize the PassiveAggressiveClassifier model.
# random_state is fixed (same seed as the train/test split) because the
# classifier shuffles the training data between epochs by default; without
# a seed the fitted model, and every metric computed from it, changes on
# each Restart & Run All.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=1)

# Fit the model on the TF-IDF features of the training text
pac.fit(tfidf_train, y_train)

# Predict labels for the held-out test set
y_predictions = pac.predict(tfidf_test)



In [36]:
# Accuracy: the fraction of test articles whose predicted label matches the true label
acc_score = accuracy_score(y_true=y_test, y_pred=y_predictions)
print(f"The accuracy score for this model is: {acc_score:0.2f}")


The accuracy score for this model is: 0.94


In [37]:
# Confusion matrix: rows are the true classes and columns the predicted classes,
# both ordered FAKE then REAL
confusion_matrix(
    y_true=y_test,
    y_pred=y_predictions,
    labels=["FAKE", "REAL"],
)

array([[612,  39],
       [ 34, 582]])

In [38]:
# Summarize per-class precision, recall, F1-score and support as a text report
report = classification_report(y_true=y_test, y_pred=y_predictions)
print(report)

              precision    recall  f1-score   support

        FAKE       0.95      0.94      0.94       651
        REAL       0.94      0.94      0.94       616

    accuracy                           0.94      1267
   macro avg       0.94      0.94      0.94      1267
weighted avg       0.94      0.94      0.94      1267

