### Importing libraries 

In [1]:
# Ignore warning messages
# def warn(*args, **kwargs):
#     pass
# import warnings
# warnings.warn = warn

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Computations
import itertools

# saving models
import joblib

# Modelling Algorithms
from sklearn.naive_bayes import MultinomialNB

# Modelling Helpers
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.pipeline import Pipeline

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Loading the Data

source: https://www.kaggle.com/c/fake-news/data

train.csv: A full training dataset with the following attributes:

+ id: unique id for a news article
+ title: the title of a news article
+ author: author of the news article
+ text: the text of the article; could be incomplete
+ label: a label that marks the article as potentially unreliable
    - 1: unreliable
    - 0: reliable

test.csv: A testing training dataset with all the same attributes at train.csv without the label.
submit.csv: A sample submission that you can

In [2]:
train_data = pd.read_csv('data/train.csv.zip') # This is the data we will be using to train/bulid our models
test_data = pd.read_csv('data/test.csv.zip') # This is the data we will use to evalute our model at the end.
test_labels = pd.read_csv('data/submit.csv') # This is the actual/correct label(y) we will be using to do our comparism

In [3]:
train_data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
test_data.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [5]:
test_labels.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1


In [6]:
train_data.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [7]:
test_data.columns

Index(['id', 'title', 'author', 'text'], dtype='object')

In [8]:
test_labels.columns

Index(['id', 'label'], dtype='object')

In [9]:
train_data.shape

(20800, 5)

In [10]:
test_data.shape

(5200, 4)

In [11]:
test_labels.shape

(5200, 2)

## Data Preprocessing

In [12]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [13]:
train_data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [14]:
test_data.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

### Narrative: 
+ This shows that in the train_data, we have some missing values. we have 558 rows with missing news titles, 1957 rows with missing authors and 39 rows with missing news text/content

+ In the test_data, we have 112 rows with missing news titles, 503 rows with missing authors and 7 rows with missing news text/content 

Handling missing data
    
    This we will do by replacing/filling the missing datas with space ie. (' ')

In [15]:
train_data.fillna(' ', inplace=True,)
test_data.fillna(' ', inplace=True,)

In [16]:
train_data.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [17]:
test_data.isnull().sum()

id        0
title     0
author    0
text      0
dtype: int64

In [18]:
y =  train_data.label # class label/ target

In [19]:
X = train_data.drop(columns = 'label') # Features

We will be creating a new feature "news" which will be the concatenation of the already existing features (ie. "title", "author" and "text")

In [20]:
#Concatenating/combining title,author and main text to form a new feature called news
X['news'] = X['author'] + ' '+ X['title']+ ' ' + X['text']
test_data['news'] = test_data['author'] + ' '+ test_data['title'] + ' ' +test_data['text']

In [21]:
X.columns

Index(['id', 'title', 'author', 'text', 'news'], dtype='object')

In [22]:
X.head()

Unnamed: 0,id,title,author,text,news
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Howard Portnoy Iranian woman jailed for fictio...


In [23]:
#spliting the data into train and test data we only using the news feature which is a combination of the other features.
X_train, X_test, y_train, y_test = train_test_split(X['news'],
                                                    y, test_size=0.20, random_state=0)

## Machine Learning Models with Count Vectorizer and TF-IDF Vectorizer

+ The vectorization is to convect the text data to vectors after which our different machine learning models can be applied.

+ For the vectorizing, we will be using two vectorizing algorithms. we will build our models with both after which we compare their accuracy score to see which performed better. and the one with the best perrfomance will be our final model. 

+ The two vectorizing algorithm we will be using are:
    - Count Vectorizer also known as Bag of Words, and
    - Term Frequency–Inverse Document Frequency also known as TF-IDF

#### Using Naive Bayes ML model with Count Vectorizer

In [24]:
count_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english') 
# Fit and transform the training data.
count_train = count_vectorizer.fit_transform(X_train)
# Transform the test set 
count_test = count_vectorizer.transform(X_test)

In [25]:
# using MultinomialNB with the count vectorized data
nb_classifier = MultinomialNB(alpha = 0.15)
nb_classifier.fit(count_train, y_train)

MultinomialNB(alpha=0.15)

In [26]:
# checking the accurracy with the model
pred_count = nb_classifier.predict(count_test)
acc_count = metrics.accuracy_score(y_test, pred_count)
print(acc_count)

0.9415865384615385


#### Using Naive Bayes ML model with TF-IDF Vectorizer

In [27]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
#Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
#Transform the test set 
tfidf_test = tfidf_vectorizer.transform(X_test)

In [28]:
# using MultinomialNB with the count vectorized data
nb_classifier = MultinomialNB(alpha = 0.01)
nb_classifier.fit(tfidf_train, y_train)

MultinomialNB(alpha=0.01)

In [29]:
pred_tfidf = nb_classifier.predict(tfidf_test)
acc_tfidf = metrics.accuracy_score(y_test, pred_tfidf)
print(acc_tfidf)

0.9432692307692307


#### Narative:
+ from the above, using tf-idf vectorizer performed slightly better than count vectorizer
+ hence we will be using it for our final build
+ For the final build, we will be using the Pipeline function from sklearn to pass both the vectorizer and build our model all in one line
    

In [30]:
NB_with_countvectorizer = Pipeline([('TF-IDF vectorizer', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
                 ('MultinomialNB', MultinomialNB(alpha=0.15))])
# Fit the training data to the model
NB_with_countvectorizer.fit(X_train, y_train)

Pipeline(steps=[('TF-IDF vectorizer',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('MultinomialNB', MultinomialNB(alpha=0.15))])

In [31]:
# Saving our models
with open('models/fake_news_nb_model', 'wb') as f:
    joblib.dump(NB_with_countvectorizer, f)

### Conclusson:
+ using the data we got from kaggle which contains both fake and real news, we have successfully built a model of 94.43% accuracy and this is done using multinomial Naive Bayes Algorithm.