In [1]:
import re
import string
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score as acs
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

### Importing Data 

In [2]:
df = pd.read_csv('F:\\Projects\\Spam_Classifier\\Fake_News_Detection/news.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
df = df.drop(['Unnamed: 0'],axis=1)

In [5]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [6]:
len(df)

6335

### Removing Punctuation using RegEx

In [7]:
pc = string.punctuation
for idx,s in enumerate(df['title']):
    df['title'][idx] = re.sub(f'[{pc}]','',fr"{s}")

In [8]:
for idx,s in enumerate(df['text']):
    df['text'][idx] = re.sub(f'[{pc}]','',fr"{s}")

In [9]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,Daniel Greenfield a Shillman Journalism Fellow...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,US Secretary of State John F Kerry said Monday...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,— Kaydee King KaydeeKing November 9 2016 The l...,FAKE
4,The Battle of New York Why This Primary Matters,Its primary day in New York and frontrunners H...,REAL


### Removing Empty or Null strings

In [10]:
df.isnull().sum()

title    0
text     0
label    0
dtype: int64

In [11]:
empty = []
for idx,s in enumerate(df['title']):
    if s.isspace():
        empty.append(idx)
empty

[]

In [12]:
empty = []
for idx,s in enumerate(df['text']):
    if s.isspace():
        empty.append(idx)
empty

[106,
 710,
 806,
 919,
 940,
 1664,
 1736,
 1851,
 1883,
 1941,
 2244,
 2426,
 2576,
 2662,
 2788,
 2832,
 3073,
 3350,
 3511,
 3641,
 3642,
 4014,
 4142,
 4253,
 4713,
 4744,
 5017,
 5088,
 5213,
 5581,
 5639,
 5699,
 5772,
 6064,
 6175,
 6328]

In [13]:
for idx in empty:
    df.drop([idx],axis=0,inplace=True)

In [14]:
empty = []
for idx,s in enumerate(df['text']):
    if s.isspace():
        empty.append(idx)
empty

[]

In [15]:
len(df)

6299

### LowerCasing each string --> Not necessary as automatically done during Vectorizing

In [16]:
for idx,s in enumerate(df['title']):
    df['title'][idx] = s.lower()

In [17]:
for idx,s in enumerate(df['text']):
    df['text'][idx] = s.lower()

In [18]:
df.head()

Unnamed: 0,title,text,label
0,you can smell hillary’s fear,daniel greenfield a shillman journalism fellow...,FAKE
1,watch the exact moment paul ryan committed pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,kerry to go to paris in gesture of sympathy,us secretary of state john f kerry said monday...,REAL
3,bernie supporters on twitter erupt in anger ag...,— kaydee king kaydeeking november 9 2016 the l...,FAKE
4,the battle of new york why this primary matters,its primary day in new york and frontrunners h...,REAL


### Separating Labels as y

In [19]:
le = LabelEncoder()
y = le.fit_transform(df['label'])
df = df.drop(['label'],axis=1)

In [20]:
len(y)

6299

In [21]:
df.head()

Unnamed: 0,title,text
0,you can smell hillary’s fear,daniel greenfield a shillman journalism fellow...
1,watch the exact moment paul ryan committed pol...,google pinterest digg linkedin reddit stumbleu...
2,kerry to go to paris in gesture of sympathy,us secretary of state john f kerry said monday...
3,bernie supporters on twitter erupt in anger ag...,— kaydee king kaydeeking november 9 2016 the l...
4,the battle of new york why this primary matters,its primary day in new york and frontrunners h...


### Splitting data into train and test data
#### Separating title and text from x(feed) data

In [22]:
x_train, x_test, y_train, y_test = train_test_split(df,y,test_size=0.2)

In [23]:
x_train_title = x_train['title']
x_test_title = x_test['title']

In [24]:
x_train_text = x_train['text']
x_test_text = x_test['text']

### Creating Pipeline and fitting data

In [25]:
pipe = Pipeline([('tfidf',TfidfVectorizer()),('multinb',MultinomialNB())])

#### On Title 

In [26]:
pipe.fit(x_train_title,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('multinb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [27]:
x_title_predict = pipe.predict(x_test_title)

In [28]:
pipe.score(x_test_title,y_test)

0.8142857142857143

In [29]:
print(acs(x_title_predict,y_test))

0.8142857142857143


#### On Text

In [30]:
pipe.fit(x_train_text,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('multinb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [31]:
x_text_predict = pipe.predict(x_test_text)

In [32]:
pipe.score(x_test_text,y_test)

0.8111111111111111

### Anding Predictions of both title and text data

In [33]:
x_predict = x_title_predict & x_text_predict

In [34]:
x_predict

array([1, 0, 0, ..., 0, 1, 1], dtype=int32)

In [35]:
acs(x_predict,y_test)

0.8539682539682539

## Final Score: 85%