In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv("data/fake_news/train.csv", index_col="id")

In [3]:
df.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df.shape

(20800, 4)

In [5]:
df.dropna(subset = ["text", "label"], inplace=True)

In [6]:
df.shape

(20761, 4)

In [7]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [8]:
## Divide the dataset into Train and Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [9]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(13909, 3)
(6852, 3)
(13909,)
(6852,)


In [10]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [11]:
stemmer = PorterStemmer()

In [12]:
X_train.reset_index(drop=True, inplace=True)

In [13]:
corpus_train = []
for i in range(0, len(X_train)):
    title = re.sub('[^a-zA-Z]', ' ', X_train.loc[i, 'text'])
    title = title.lower()
    title = title.split()
    title = [stemmer.stem(word) for word in title if not word in stopwords.words('english')]
    title = ' '.join(title)
    corpus_train.append(title)

In [14]:
len(corpus_train)

13909

In [15]:
## Applying TFIDFVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=10000,ngram_range=(1,3))

In [17]:
X_train = tfidf.fit_transform(corpus_train).toarray()
X_train

array([[0.08723604, 0.10332821, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [18]:
X_train.shape

(13909, 10000)

In [20]:
tfidf.get_feature_names()[0:20]

['aaron',
 'aaron klein',
 'abandon',
 'abba',
 'abbott',
 'abc',
 'abc news',
 'abdel',
 'abduct',
 'abdullah',
 'abe',
 'abedin',
 'abid',
 'abil',
 'abl',
 'abl get',
 'abnorm',
 'aboard',
 'abolish',
 'aborigin']

In [22]:
pd.DataFrame(X_train, columns=tfidf.get_feature_names())

Unnamed: 0,aaron,aaron klein,abandon,abba,abbott,abc,abc news,abdel,abduct,abdullah,...,zealand,zero,zero hedg,zika,zika viru,zionist,zone,zoo,zu,zuckerberg
0,0.087236,0.103328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13904,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
13905,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
13906,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
13907,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [23]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [24]:
y_train.reset_index(drop=True, inplace=True)

In [25]:
nb.fit(X_train, y_train)

MultinomialNB()

In [26]:
X_test.reset_index(drop=True, inplace=True)

In [27]:
corpus_test = []
for i in range(0, len(X_test)):
    title = re.sub('[^a-zA-Z]', ' ', X_test.loc[i, 'text'])
    title = title.lower()
    title = title.split()
    title = [stemmer.stem(word) for word in title if not word in stopwords.words('english')]
    title = ' '.join(title)
    corpus_test.append(title)

In [28]:
X_test = tfidf.transform(corpus_test).toarray()
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
pd.DataFrame(X_test, columns=tfidf.get_feature_names())

Unnamed: 0,aaron,aaron klein,abandon,abba,abbott,abc,abc news,abdel,abduct,abdullah,...,zealand,zero,zero hedg,zika,zika viru,zionist,zone,zoo,zu,zuckerberg
0,0.0,0.0,0.0,0.089667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6847,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6848,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6849,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6850,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
pred = nb.predict(X_test)

In [31]:
y_test.reset_index(drop=True, inplace=True)

In [32]:
# Import libraries to check performance
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [33]:
accuracy_score(y_test, pred)

0.9153531815528313

In [34]:
nb.classes_

array([0, 1], dtype=int64)

In [35]:
confusion_matrix(y_test, pred)

array([[3220,  238],
       [ 342, 3052]], dtype=int64)

In [36]:
classification_report(y_test, pred)

'              precision    recall  f1-score   support\n\n           0       0.90      0.93      0.92      3458\n           1       0.93      0.90      0.91      3394\n\n    accuracy                           0.92      6852\n   macro avg       0.92      0.92      0.92      6852\nweighted avg       0.92      0.92      0.92      6852\n'

In [37]:
# Get Features names
feature_names = tfidf.get_feature_names()

In [39]:
nb.coef_[0]

array([ -9.74691091, -10.63959578,  -9.19452966, ..., -10.32251355,
        -8.8828781 , -10.145091  ])

In [40]:
# Most real
sorted(zip(nb.coef_[0], feature_names), reverse=True)[:20]

[(-5.638049973849305, 'clinton'),
 (-5.733712511104222, 'trump'),
 (-5.941180330283757, 'hillari'),
 (-6.138467668604752, 'elect'),
 (-6.166970478553644, 'us'),
 (-6.33085227977948, 'peopl'),
 (-6.372002543622038, 'state'),
 (-6.404457515228456, 'vote'),
 (-6.427236649956161, 'one'),
 (-6.436742701828359, 'email'),
 (-6.532981168372791, 'would'),
 (-6.5347756792837055, 'fbi'),
 (-6.543461069714046, 'like'),
 (-6.5620946911778635, 'time'),
 (-6.574940450262842, 'american'),
 (-6.577072128497617, 'hillari clinton'),
 (-6.621638799252267, 'presid'),
 (-6.626728879907309, 'russia'),
 (-6.630169210113875, 'world'),
 (-6.630432331521318, 'war')]

In [41]:
# Most fake
sorted(zip(nb.coef_[0], feature_names))[:20]

[(-11.231284324292918, 'advis mr'),
 (-11.231284324292918, 'airbag'),
 (-11.231284324292918, 'antidop'),
 (-11.231284324292918, 'attorney gener jeff'),
 (-11.231284324292918, 'author igcolonel'),
 (-11.231284324292918, 'author igcolonel hotmail'),
 (-11.231284324292918, 'automak'),
 (-11.231284324292918, 'awr'),
 (-11.231284324292918, 'awr hawkin'),
 (-11.231284324292918, 'awrhawkin'),
 (-11.231284324292918, 'back last night'),
 (-11.231284324292918, 'ben kew'),
 (-11.231284324292918, 'breitbart tech'),
 (-11.231284324292918, 'breitbart texa team'),
 (-11.231284324292918, 'brief email'),
 (-11.231284324292918, 'brief email good'),
 (-11.231284324292918, 'brief like'),
 (-11.231284324292918, 'brief like want'),
 (-11.231284324292918, 'brief nytim'),
 (-11.231284324292918, 'brief nytim com')]