In [1]:
import pandas as pd
import re

In [2]:
# Load the training CSV, using its "id" column as the row index.
df = pd.read_csv(
    "data/fake_news/train.csv",
    index_col="id",
)

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
df.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(20800, 4)

In [5]:
# Keep only rows that have both a title and a label; other columns may still be missing.
df = df.dropna(subset=["title", "label"])

In [6]:
# (rows, columns) after removing rows that lack a title or a label.
df.shape

(20242, 4)

In [7]:
# Every column except the last is a feature; the last column (label) is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [8]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [9]:
# Report the shape of each split, one per line, in train/test order for X and then y.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(13562, 3)
(6680, 3)
(13562,)
(6680,)


In [10]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [11]:
# One shared Porter stemmer instance, reused for every title that gets cleaned below.
stemmer = PorterStemmer()

In [12]:
# Replace the shuffled "id" index left by the split with a fresh 0..n-1 index.
X_train = X_train.reset_index(drop=True)

In [13]:
# Turn every training title into a space-joined string of stemmed, lower-cased tokens.
# The stop-word list is materialised once as a set: evaluating
# stopwords.words('english') inside the comprehension rebuilt the whole list for
# every token and then scanned it linearly, which dominated the runtime of this cell.
english_stopwords = set(stopwords.words('english'))

corpus_train = []
for raw_title in X_train['title']:
    # Replace everything that is not an ASCII letter with a space, then tokenise.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_title).lower().split()
    # Drop stop words and reduce the remaining tokens to their Porter stems.
    stems = [stemmer.stem(token) for token in tokens if token not in english_stopwords]
    corpus_train.append(' '.join(stems))

In [14]:
# Number of cleaned training titles; should equal the number of rows in the training split.
len(corpus_train)

13562

In [15]:
# Bag-of-words model: count unigrams, bigrams and trigrams, keeping at most
# 10000 features in the vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
)

In [16]:
# Learn the vocabulary from the training corpus, then densify the count matrix.
train_counts = cv.fit_transform(corpus_train)
X_train = train_counts.toarray()
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# (documents, vocabulary terms); the second axis is capped by max_features.
X_train.shape

(13562, 10000)

In [18]:
# First 20 vocabulary entries, in the vectorizer's column order.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2, so prefer get_feature_names_out() and fall back only on older releases.
vocabulary_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
# list() keeps the display a plain Python list whichever API supplied the names.
list(vocabulary_terms[:20])

['aaron',
 'ab',
 'abandon',
 'abandon new',
 'abba',
 'abc',
 'abc news',
 'abduct',
 'abduct ring',
 'abduct ring expos',
 'abe',
 'abedin',
 'abl',
 'abort',
 'abort law',
 'abort law new',
 'abort new',
 'abort new york',
 'abram',
 'abroad']

In [19]:
# Show the training count matrix with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
pd.DataFrame(
    X_train,
    columns=(
        cv.get_feature_names_out()
        if hasattr(cv, "get_feature_names_out")
        else cv.get_feature_names()
    ),
)

Unnamed: 0,aaron,ab,abandon,abandon new,abba,abc,abc news,abduct,abduct ring,abduct ring expos,...,zone,zone declar,zone declar militar,zone new,zone new york,zoo,zu,zuckerberg,zuess,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13557,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13558,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13559,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13560,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Multinomial naive Bayes classifier, constructed with its default hyper-parameters.
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [21]:
# Replace the shuffled "id" index left by the split with a fresh 0..n-1 index.
y_train = y_train.reset_index(drop=True)

In [22]:
# Fit the classifier on the training count matrix and the matching labels.
nb.fit(X_train, y_train)

MultinomialNB()

In [23]:
# Replace the shuffled "id" index left by the split with a fresh 0..n-1 index.
X_test = X_test.reset_index(drop=True)

In [24]:
# Turn every test title into a space-joined string of stemmed, lower-cased tokens,
# using the same cleaning steps as the training titles.
# The stop-word list is materialised once as a set: evaluating
# stopwords.words('english') inside the comprehension rebuilt the whole list for
# every token and then scanned it linearly, which dominated the runtime of this cell.
english_stopwords = set(stopwords.words('english'))

corpus_test = []
for raw_title in X_test['title']:
    # Replace everything that is not an ASCII letter with a space, then tokenise.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_title).lower().split()
    # Drop stop words and reduce the remaining tokens to their Porter stems.
    stems = [stemmer.stem(token) for token in tokens if token not in english_stopwords]
    corpus_test.append(' '.join(stems))

In [25]:
# Encode the test titles with the vocabulary already fitted on the training corpus.
test_counts = cv.transform(corpus_test)
X_test = test_counts.toarray()
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
# Show the test count matrix with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
pd.DataFrame(
    X_test,
    columns=(
        cv.get_feature_names_out()
        if hasattr(cv, "get_feature_names_out")
        else cv.get_feature_names()
    ),
)

Unnamed: 0,aaron,ab,abandon,abandon new,abba,abc,abc news,abduct,abduct ring,abduct ring expos,...,zone,zone declar,zone declar militar,zone new,zone new york,zoo,zu,zuckerberg,zuess,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6675,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6676,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6677,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6678,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Predicted label for every row of the held-out test matrix.
pred = nb.predict(X_test)

In [28]:
# Replace the shuffled "id" index left by the split with a fresh 0..n-1 index.
y_test = y_test.reset_index(drop=True)

In [29]:
# Import libraries to check performance
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [30]:
# Fraction of test titles whose predicted label matches the true label.
accuracy_score(y_test, pred)

0.9032934131736527

In [31]:
# Label values known to the fitted model, in the order used by its per-class attributes.
nb.classes_

array([0, 1], dtype=int64)

In [32]:
# Rows are true labels and columns are predicted labels (scikit-learn convention).
confusion_matrix(y_test, pred)

array([[3156,  288],
       [ 358, 2878]], dtype=int64)

In [33]:
# classification_report returns one pre-formatted multi-line string. Printing it
# renders the table; leaving it as the bare cell value shows the escaped repr
# with literal "\n" sequences instead.
print(classification_report(y_test, pred))

'              precision    recall  f1-score   support\n\n           0       0.90      0.92      0.91      3444\n           1       0.91      0.89      0.90      3236\n\n    accuracy                           0.90      6680\n   macro avg       0.90      0.90      0.90      6680\nweighted avg       0.90      0.90      0.90      6680\n'

In [34]:
# Vocabulary terms in the vectorizer's column order, as a plain list of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
feature_names = list(
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

In [39]:
# Per-token log P(token | label 1), i.e. the row for nb.classes_[1].
# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1; for a
# two-class model coef_[0] was exactly feature_log_prob_[1], so the values are unchanged.
nb.feature_log_prob_[1]

array([-11.12552618,  -9.73923182,  -9.33376671, ..., -10.02691389,
        -9.73923182,  -9.73923182])

In [40]:
# The 20 tokens with the highest log P(token | label 1), i.e. the tokens that are
# most frequent in label-1 titles.
# NOTE(review): this cell was headed "Most real", but a high class-conditional
# probability only says a token is common within label 1; it does not compare the
# two classes. Which label denotes fake news is not stated in this notebook, so
# confirm against the dataset description before interpreting the list.
# feature_log_prob_[1] replaces coef_[0] (deprecated in scikit-learn 0.24, removed
# in 1.1); the values are identical for a two-class model.
sorted(zip(nb.feature_log_prob_[1], feature_names), reverse=True)[:20]

[(-4.231869828071055, 'trump'),
 (-4.554643220334106, 'hillari'),
 (-4.631772342822004, 'clinton'),
 (-5.094840922412427, 'elect'),
 (-5.469534371853838, 'new'),
 (-5.473037002405039, 'us'),
 (-5.5533721504959255, 'video'),
 (-5.628357957380488, 'hillari clinton'),
 (-5.632464739333142, 'war'),
 (-5.640729249183035, 'comment'),
 (-5.653255509002215, 'russia'),
 (-5.66169437764808, 'news'),
 (-5.687446873750495, 'fbi'),
 (-5.832221357949198, 'vote'),
 (-5.842322453935702, 'america'),
 (-5.852526624109943, 'email'),
 (-5.878502110513204, 'donald'),
 (-5.889084219843741, 'obama'),
 (-5.927029151407864, 'donald trump'),
 (-5.93256933178348, 'world')]

In [43]:
# The 20 tokens with the lowest log P(token | label 1); ties are broken alphabetically
# because the tuples are sorted by (log-probability, token).
# NOTE(review): this cell was headed "Most fake", but the lowest values belong to
# tokens that are rarest in label-1 titles. The single shared minimum in the output
# looks like the smoothing floor for tokens never seen with label 1 — confirm.
# Which label denotes fake news is not stated in this notebook.
# feature_log_prob_[1] replaces coef_[0] (deprecated in scikit-learn 0.24, removed
# in 1.1); the values are identical for a two-class model.
sorted(zip(nb.feature_log_prob_[1], feature_names))[:20]

[(-11.12552618267369, 'aaron'),
 (-11.12552618267369, 'abandon new'),
 (-11.12552618267369, 'abba'),
 (-11.12552618267369, 'abort law'),
 (-11.12552618267369, 'abort law new'),
 (-11.12552618267369, 'abort new'),
 (-11.12552618267369, 'abort new york'),
 (-11.12552618267369, 'abroad'),
 (-11.12552618267369, 'abroad new'),
 (-11.12552618267369, 'abroad new york'),
 (-11.12552618267369, 'abus new'),
 (-11.12552618267369, 'abus new york'),
 (-11.12552618267369, 'academ'),
 (-11.12552618267369, 'accomplic'),
 (-11.12552618267369, 'accus rape'),
 (-11.12552618267369, 'acquisit'),
 (-11.12552618267369, 'act breitbart'),
 (-11.12552618267369, 'act new'),
 (-11.12552618267369, 'act new york'),
 (-11.12552618267369, 'ad new')]