In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
## Reading in the cleaned news article csv file, setting index to the Unnamed index column
news_df = pd.read_csv('../Ivan/clean_output.csv', index_col='Unnamed: 0')

In [3]:
news_df
#0 = real  1= fake

Unnamed: 0,cleaned_title,cleaned_text,target
0,smell hillarys fear,daniel greenfield shillman journalism fellow f...,1
1,watch exact moment paul ryan committed politic...,google pinterest digg linkedin reddit stumbleu...,1
2,kerry go paris gesture sympathy,yous secretary state john f kerry said monday ...,0
3,bernie supporters twitter erupt anger dnc trie...,kaydee king kaydeeking november 9 2016 lesson ...,1
4,battle new york primary matters,primary day new york frontrunners hillary clin...,0
...,...,...,...
6330,state department says cannot find emails clint...,state department told republican national comm...,0
6331,p pbs stand plutocratic pentagon,p pbs stand plutocratic pentagon posted oct 27...,1
6332,antitrump protesters tools oligarchy information,antitrump protesters tools oligarchy reform al...,1
6333,ethiopia obama seeks progress peace security e...,addis ababa ethiopia president obama convened ...,0


Testing our dataset on the Linear Support Vector Classification model using the article's body text

In [4]:
## Setting our feature (Article Text) and target (Fake = 1 or Real = 0 ) for the model
X = news_df['cleaned_text'].astype('U')
y= news_df['target']

In [5]:
## Split the preprocessed data into a training and testing dataset

X_train, X_test, y_train, y_test = train_test_split(X,y)

In [6]:
# Size of our training data vs testing data
print(len(X_train))
print(len(X_test))

4751
1584


In [7]:
## Creating vectorizing instance (It considers overall document weightage of a word. It helps us in dealing with most frequent words. 
# Using it we can penalize them. TfidfVectorizer weights the word counts by a measure of how often they appear in the documents.)
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


In [8]:
#Creating an instance of Linear SVC model and fitting it to vectorized training data
clf = LinearSVC()
clf.fit(X_train_vectorized,y_train)

In [9]:
## Linear Support Vector Classifier Accuracy score
clf.score(X_test_vectorized,y_test)

0.9438131313131313

In [10]:
## Creating a confusion matrix and printing its results based on the predictions made using vectorized data
X_test_vectorized_predict = clf.predict(X_test_vectorized)
cm = confusion_matrix(y_test, X_test_vectorized_predict)
print(cm)

[[724  54]
 [ 35 771]]


In [11]:
# Creating and saving the testing classification report based on the predictions made using vectorized data
testing_report = classification_report(y_test, X_test_vectorized_predict)

# Printing the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       778
           1       0.93      0.96      0.95       806

    accuracy                           0.94      1584
   macro avg       0.94      0.94      0.94      1584
weighted avg       0.94      0.94      0.94      1584



Testing our dataset on the Linear Support Vector Classification model using the article's titles

In [20]:
## Setting our feature (Article Title) and target (Fake = 1 or Real = 0 ) for the model
X2 = news_df['cleaned_title'].astype('U')
y= news_df['target']

In [21]:
## Split the preprocessed data into a training and testing dataset

X2_train, X2_test, y_train, y_test = train_test_split(X,y)

In [22]:
# Size of our training data vs testing data
print(len(X_train))
print(len(X_test))

4751
1584


In [23]:
## Creating vectorizing instance (It considers overall document weightage of a word. It helps us in dealing with most frequent words. 
# Using it we can penalize them. TfidfVectorizer weights the word counts by a measure of how often they appear in the documents.)
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [24]:
#Creating an instance of Linear SVC model and fitting it to vectorized training data
clf = LinearSVC()
clf.fit(X_train_vectorized,y_train)

In [25]:
## Linear Support Vector Classifier Accuracy score
clf.score(X_test_vectorized,y_test)

0.514520202020202

In [26]:
# Creating and saving the testing classification report based on the predictions made using vectorized data
testing_report = classification_report(y_test, X_test_vectorized_predict)

# Printing the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.50      0.49      0.50       772
           1       0.53      0.54      0.53       812

    accuracy                           0.52      1584
   macro avg       0.52      0.52      0.52      1584
weighted avg       0.52      0.52      0.52      1584

