In [11]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [12]:
## Reading in the cleaned news article csv file, setting index to the Unnamed index column
news_df = pd.read_csv('../Resources/clean_output.csv', index_col='Unnamed: 0')

In [13]:
news_df
#0 = real  1= fake

Unnamed: 0,cleaned_title,cleaned_text,target
0,smell hillarys fear,daniel greenfield shillman journalism fellow f...,1
1,watch exact moment paul ryan committed politic...,google pinterest digg linkedin reddit stumbleu...,1
2,kerry go paris gesture sympathy,yous secretary state john f kerry said monday ...,0
3,bernie supporters twitter erupt anger dnc trie...,kaydee king kaydeeking november 9 2016 lesson ...,1
4,battle new york primary matters,primary day new york frontrunners hillary clin...,0
...,...,...,...
51228,fully committed nato backs new yous approach a...,brussels reuters nato allies tuesday welcomed ...,0
51229,lexisnexis withdrew two products chinese market,london reuters lexisnexis provider legal regul...,0
51230,minsk cultural hub becomes authorities,minsk reuters shadow disused sovietera factori...,0
51231,vatican upbeat possibility pope francis visiti...,moscow reuters vatican secretary state cardina...,0


### Testing our dataset on the Linear Support Vector Classification model using the article's body text

In [14]:
## Setting our feature (Article Text) and target (Fake = 1 or Real = 0 ) for the model
X = news_df['cleaned_text'].astype('U')
y= news_df['target']

In [15]:
## Splitting the preprocessed data into a training and testing dataset

X_train, X_test, y_train, y_test = train_test_split(X,y)

In [16]:
# Size of our training data vs testing data
print(len(X_train))
print(len(X_test))

38424
12809


In [17]:
## Creating vectorizing instance (It considers overall document weightage of a word. It helps us in dealing with most frequent words. 
# Using it we can penalize them. TfidfVectorizer weights the word counts by a measure of how often they appear in the documents.)
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


In [18]:
#Creating an instance of Linear SVC model and fitting it to vectorized training data
clf = LinearSVC()
clf.fit(X_train_vectorized,y_train)

In [19]:
## Linear Support Vector Classifier Accuracy score
clf.score(X_test_vectorized,y_test)

0.9772815988757905

In [20]:
## Creating a confusion matrix and printing its results based on the predictions made using vectorized data
X_test_vectorized_predict = clf.predict(X_test_vectorized)
cm = confusion_matrix(y_test, X_test_vectorized_predict)
cm_df = pd.DataFrame(
    cm, index=["Real", "Fake"], columns=["Predicted Real", "Predicted Fake"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, X_test_vectorized_predict)

In [21]:
# Displaying results of our model
print("Confusion Matrix")
display(cm_df)
print(f"\nAccuracy Score : {acc_score}")
print("\nClassification Report\n")
print(classification_report(y_test, X_test_vectorized_predict,target_names=['real','fake']))

Confusion Matrix


Unnamed: 0,Predicted Real,Predicted Fake
Real,5932,193
Fake,98,6586



Accuracy Score : 0.9772815988757905

Classification Report

              precision    recall  f1-score   support

        real       0.98      0.97      0.98      6125
        fake       0.97      0.99      0.98      6684

    accuracy                           0.98     12809
   macro avg       0.98      0.98      0.98     12809
weighted avg       0.98      0.98      0.98     12809



### Testing our dataset on the Linear Support Vector Classification model using the article's titles

In [22]:
## Setting our feature (Article Title) and target (Fake = 1 or Real = 0 ) for the model
X2 = news_df['cleaned_title'].astype('U')
y= news_df['target']

In [23]:
## Splitting the preprocessed data into a training and testing dataset

X2_train, X2_test, y_train, y_test = train_test_split(X2,y)

In [24]:
# Size of our training data vs testing data
print(len(X2_train))
print(len(X2_test))

38424
12809


In [25]:
## Creating vectorizing instance (It considers overall document weightage of a word. It helps us in dealing with most frequent words. 
# Using it we can penalize them. TfidfVectorizer weights the word counts by a measure of how often they appear in the documents.)
vectorizer = TfidfVectorizer(stop_words='english')
X2_train_vectorized = vectorizer.fit_transform(X2_train)
X2_test_vectorized = vectorizer.transform(X2_test)

In [26]:
X2_train_vectorized.shape

(38424, 25597)

In [27]:
#Creating an instance of Linear SVC model and fitting it to vectorized training data
clf = LinearSVC()
clf.fit(X2_train_vectorized,y_train)

In [28]:
## Linear Support Vector Classifier Accuracy score
clf.score(X2_test_vectorized,y_test)

0.9138106019205247

In [29]:
# Creating and saving the testing classification report based on the predictions made using vectorized data
X2_test_vectorized_predict = clf.predict(X2_test_vectorized)

              precision    recall  f1-score   support

        real       0.91      0.91      0.91      6092
        fake       0.92      0.91      0.92      6717

    accuracy                           0.91     12809
   macro avg       0.91      0.91      0.91     12809
weighted avg       0.91      0.91      0.91     12809



In [30]:
## Creating a confusion matrix and printing its results based on the predictions made using vectorized data
cm = confusion_matrix(y_test, X2_test_vectorized_predict)
cm_df = pd.DataFrame(
    cm, index=["Real", "Fake"], columns=["Predicted Real", "Predicted Fake"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, X2_test_vectorized_predict)

In [31]:
# Displaying results of our model
print("Confusion Matrix")
display(cm_df)
print(f"\nAccuracy Score : {acc_score}")
print("\nClassification Report\n")
print(classification_report(y_test, X2_test_vectorized_predict,target_names=['real','fake']))

Confusion Matrix


Unnamed: 0,Predicted Real,Predicted Fake
Real,5561,531
Fake,573,6144



Accuracy Score : 0.9138106019205247

Classification Report

              precision    recall  f1-score   support

        real       0.91      0.91      0.91      6092
        fake       0.92      0.91      0.92      6717

    accuracy                           0.91     12809
   macro avg       0.91      0.91      0.91     12809
weighted avg       0.91      0.91      0.91     12809

