In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import nltk

### Load Data

In [2]:
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')
true['label'] = 1
fake['label'] = 0
frames = [true.loc[:][:], fake.loc[:][:]]  #Edit this to control the size of dataset
df = pd.concat(frames)
df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [3]:
X = df.drop('label', axis=1) 
y = df['label']
df = df.dropna()
df2 = df.copy()
df2 = df2.reset_index(drop = True)

### Tokenization

In [4]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(df2)):
    review = re.sub('[^a-zA-Z]', ' ', df2['text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

### Vectorization

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_v = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
X = tfidf_v.fit_transform(corpus).toarray()
y = df2['label']

### Gaussian Naive Bayes Model training

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import numpy as np
import itertools

GNBclassifier = GaussianNB()
GNBclassifier.fit(X_train, y_train)
pred = GNBclassifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("Gaussian Naive Bayes accuracy:   %0.3f" % score)

Gaussian Naive Bayes accuracy:   0.973


### SVC Model Training

In [44]:
SVCclassifier = SVC()
SVCclassifier.fit(X_train, y_train)
pred = SVCclassifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("SVC accuracy:   %0.3f" % score)

SVC accuracy:   1.000


### Passive Aggressive Classifier Model Training

In [45]:
PACclassifier = PassiveAggressiveClassifier(max_iter=1000)
PACclassifier.fit(X_train, y_train)
pred = PACclassifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("Passive Aggressive Classifier accuracy:   %0.3f" % score)

Passive Aggressive Classifier accuracy:   0.998


### Ensemble Learning
Applying bootstrap aggregation to the ML models


In [46]:
def bootstrap_sample(x_data, y_data):
    indices = np.random.choice(len(x_data), size=len(x_data)//2, replace=True)
    X_sample = X[indices]
    y_sample = y[indices]
    return X_sample, y_sample

X_train_1, y_train_1 = bootstrap_sample(X_train, y_train)
X_train_2, y_train_2 = bootstrap_sample(X_train, y_train)
X_train_3, y_train_3 = bootstrap_sample(X_train, y_train)


In [52]:
from scipy.stats import mode

def majority_voting(predictions):
    majority_vote = mode(predictions)[0]
    return majority_vote[0]

GNBclassifier = GaussianNB()
GNBclassifier.fit(X_train_1, y_train_1)
pred1 = GNBclassifier.predict(X_test)

SVCclassifier = SVC()
SVCclassifier.fit(X_train_2, y_train_2)
pred2 = SVCclassifier.predict(X_test)

PACclassifier = PassiveAggressiveClassifier(max_iter=1000)
PACclassifier.fit(X_train_3, y_train_3)
pred3 = PACclassifier.predict(X_test)

all_predictions = [pred1, pred2, pred3]
final_vote = majority_voting(all_predictions)

ensemble_score = metrics.accuracy_score(y_test, final_vote)
print("Ensemble accuracy: %.3f" % ensemble_score)


Ensemble accuracy: 0.980
