In [5]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from tensorflow.keras.layers import Embedding,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM,Bidirectional,GRU
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

[nltk_data] Downloading package stopwords to /Users/Umang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Read the training and test CSVs into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/fake-news/train.csv", "data/fake-news/test.csv")
)
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
# Report the (rows, columns) shape of each frame, one message per line.
print(
    "Shape of training data {}".format(train.shape),
    "Shape of test data {}".format(test.shape),
    sep="\n",
)


Shape of training data (20800, 5)
Shape of test data (5200, 4)


### Looking at the count of nulls in the data and filling them with blank spaces. This is done to make sure the later data pre-processing and modelling steps don't break on missing values.

In [8]:
# Per-column count of missing values in the training frame.
train.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
# Replace every missing value with a single space so that the string
# concatenation and regex steps further down never encounter NaN.
train, test = (frame.fillna(" ") for frame in (train, test))

### We create a synthetic feature called `merged` by combining the title and author features. This is done for two reasons:
1. The classification of a news item depends on both its author and its content (represented here by the title), so a single combined feature should be easier to learn from.
2. The algorithm then has only a single feature to learn on.
 

In [10]:
# Add the "merged" text feature (title, one space, author) to both frames.
for frame in (train, test):
    frame["merged"] = frame["title"] + " " + frame["author"]

In [11]:
# Separate the target column from the predictor columns.
y = train["label"]
X = train.drop(columns=["label"])

In [12]:
# Work on independent copies so pre-processing leaves X and test untouched.
# reset_index() without drop=True keeps the previous index as an extra
# "index" column and renumbers the rows from 0.
messages = X.copy().reset_index()
messages_test = test.copy().reset_index()

### Data Pre-Processing. We perform the following pre-processing steps:
1. All punctuation and other non-letter characters are removed; they don't really help the model learn.
2. To prevent confusion and to ease the encoding process, we convert everything to lower case.
3. We then turn the text into a continuous stream of tokens rather than sentences, dropping English stop words along the way.
4. Then we do stemming, a common NLP pre-processing step that reduces each word to its root form, e.g. "campus" becomes "campu" and "hillary" becomes "hillari" (stems are not always dictionary words).
5. Lastly, we create a one-hot (hashed integer index) representation of the text. ML algorithms only understand numbers, so we need to convert the text into numbers, and this is one of the simplest and most efficient conversions.
    

In [13]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()


def perform_preprocess(data):
    '''Clean and stem the ``merged`` text column of a DataFrame.

    For every row the text has each non-letter character replaced by a
    space, is lower-cased, split on whitespace, stripped of English stop
    words, stemmed with the Porter stemmer and re-joined with single spaces.

    Input: Data to be processed (DataFrame with a string column ``merged``)
    Output: Preprocessed data (list of cleaned strings, one per row, in row order)
    '''
    # Build the stop-word set once. Calling stopwords.words() for every token
    # is very slow, and it returns a list, so each membership test is linear.
    english_stopwords = set(stopwords.words('english'))

    corpus = []
    # Iterate over the column values directly so the function does not rely on
    # the frame having a 0..n-1 index.
    for text in data['merged']:
        tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
        corpus.append(' '.join(stems))
    return corpus


train_corpus = perform_preprocess(messages)
test_corpus = perform_preprocess(messages_test)
train_corpus[1]

'flynn hillari clinton big woman campu breitbart daniel j flynn'

In [14]:
# Encode every cleaned document as a list of integers with Keras' one_hot,
# using a vocabulary size of 5000.
# NOTE(review): one_hot reportedly relies on Python's built-in hash by default,
# which may not be stable across interpreter sessions - confirm before reusing
# a trained model in a different kernel.
vocab_size = 5000
one_hot_train, one_hot_test = (
    [one_hot(document, vocab_size) for document in corpus]
    for corpus in (train_corpus, test_corpus)
)

### Padding is an important step before we build the models. Padding makes the data uniform by ensuring every feature vector has the same length, adding zeros where needed. This is crucial because most algorithms (e.g. neural networks) require uniformly sized input.

In [15]:
# Bring every encoded document to a common length of sent_length, padding at
# the front ('pre'); the same options are applied to both splits.
sent_length = 20
pad_options = {"padding": 'pre', "maxlen": sent_length}
embedd_docs_train = pad_sequences(one_hot_train, **pad_options)
embedd_docs_test = pad_sequences(one_hot_test, **pad_options)

In [16]:
## Materialise the padded documents and the labels as NumPy arrays, the input format used by the models below
final_X, x_test_final = (
    np.array(docs) for docs in (embedd_docs_train, embedd_docs_test)
)
final_y = np.array(y)

In [24]:
# Sanity check: shapes of (training features, training labels, test features).
tuple(array.shape for array in (final_X, final_y, x_test_final))

((20800, 20), (20800,), (5200, 20))

#### Train / validation / test split: 10% of the data is held out for testing, then 10% of the remaining 90% for validation (roughly 81% / 9% / 10% overall)

In [27]:

from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled data as a test set, then 10% of the remaining
# 90% as a validation set (about 81% / 9% / 10% overall). Both splits are
# stratified on the label and use a fixed seed.
# NOTE: x_train / y_train (lower case) are the intermediate 90% split, while
# X_train / Y_train (upper case) are the final training set fed to the models.
x_train, x_test, y_train, y_test = train_test_split(
    final_X, final_y, test_size=0.1, random_state=42, stratify=final_y
)
X_train, x_valid, Y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

### We first try XGBoost, a gradient-boosted decision-tree classifier. This ML algorithm uses the power of boosting and delivers great results at very high speed.

In [32]:
# Fit an XGBoost classifier with default hyper-parameters on the training
# split and report precision/recall/F1 on the held-out test split.
xgb = XGBClassifier()
xgb.fit(X_train, Y_train)
pred_xgb = xgb.predict(x_test)
# Score the predictions that were just computed (pred_xgb).
XGBmetrics = classification_report(y_test, pred_xgb)
print(XGBmetrics)



              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1039
           1       0.98      0.99      0.98      1041

    accuracy                           0.98      2080
   macro avg       0.98      0.98      0.98      2080
weighted avg       0.98      0.98      0.98      2080



### Secondly, we try a deep learning model as well. The thought behind this is that LSTMs have worked very well on such text classification tasks in the past. We also use an Embedding layer as the first layer of our neural net; it learns a word embedding of the text input and has proven to work very well.

In [33]:
# Binary classifier: Embedding -> Dropout -> LSTM(100) -> Dropout -> sigmoid.
embedding_feature_vector = 40  # size of each learned word-embedding vector
model = Sequential([
    Embedding(vocab_size, embedding_feature_vector, input_length=sent_length),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" line after the table).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
dropout (Dropout)            (None, 20, 40)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [34]:
# Train for 10 epochs in mini-batches of 64, evaluating on the validation
# split during training.
model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_valid, y_valid),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1409a78d0>

In [35]:
# Sequential.predict_classes is deprecated; for a single sigmoid output the
# recommended replacement is to threshold the predicted probability at 0.5.
# ravel() flattens the (n, 1) model output to the 1-D shape of y_test.
predictions = (model.predict(x_test) > 0.5).astype("int32").ravel()
DLmetrics = classification_report(y_test, predictions)
print(DLmetrics)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1039
           1       0.99      0.98      0.99      1041

    accuracy                           0.99      2080
   macro avg       0.99      0.99      0.99      2080
weighted avg       0.99      0.99      0.99      2080



## Comparing the metrics of the ML and DL approaches used above, one can see that the LSTM-based deep learning model slightly outperforms the XGBoost classifier (0.99 vs. 0.98 accuracy on the held-out test split). Hence we move ahead and use the LSTM model for submission purposes.

In [36]:
# Sequential.predict_classes is deprecated; threshold the sigmoid output at
# 0.5 to obtain the 0/1 class labels for the unlabelled test set.
predicted_labels = (model.predict(x_test_final) > 0.5).astype("int32").ravel()
predictions_test = pd.DataFrame(predicted_labels)
test_id = pd.DataFrame(test["id"])
# The column-wise concat below pairs rows by index, so a length mismatch would
# silently produce NaNs; fail loudly instead.
assert len(test_id) == len(predictions_test), "one prediction per test row expected"
submission = pd.concat([test_id, predictions_test], axis=1)
submission.columns = ["id", "label"]
submission.to_csv("Submission.csv", index=False)