In [112]:
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import hashing_trick, one_hot

In [28]:
# Load the training set; the first CSV row supplies the column names.
df = pd.read_csv(filepath_or_buffer = 'train.csv', sep = ',', header = 0)

In [29]:
# Preview the first two rows to check the columns that were read.
df.head(n = 2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0


In [30]:
# Only the headline and the label are used below; discard the other columns.
df = df.drop(columns = ['id', 'author', 'text'])

In [31]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum()

558

In [36]:
# Remove rows with any missing value and renumber the rows 0..n-1.
# drop = True discards the old index instead of inserting it as an extra
# 'index' column, which also keeps this cell safe to run more than once.
df = df.dropna().reset_index(drop = True)

In [37]:
# Confirm that no missing cells remain after the clean-up.
df.isna().sum().sum()

0

In [38]:
# Feature column (headline text) and target column (label).
X, y = df['title'], df['label']

In [73]:
def text_preprocessing(text):
    """Turn raw strings into lower-cased, stemmed, stop-word-free strings.

    Each entry is lower-cased, every character outside a-z is replaced by a
    space, whitespace runs are collapsed, the result is tokenised with NLTK,
    English stop words are removed and the remaining tokens are Porter-stemmed.

    Parameters
    ----------
    text : iterable of str
        The strings to clean, e.g. a list or a pandas Series. Values are
        visited in iteration order, so a Series does not need a 0..n-1 index.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input entry, in the
        same order as the input.
    """
    ps = PorterStemmer()
    # Build the stop-word collection once; the per-token lookup is then a set
    # membership test instead of rebuilding and scanning a list for every word.
    stop_words = set(stopwords.words('english'))

    corpus = []
    for entry in text:
        # Lower-casing first means the character class only ever keeps a-z.
        review = re.sub(r'[^a-zA-Z]', ' ', entry.lower())
        review = re.sub(r'\s+', ' ', review)
        tokens = nltk.word_tokenize(review, language = 'english')
        corpus.append(' '.join(ps.stem(word) for word in tokens if word not in stop_words))
    return corpus

# Clean every headline; the result is a plain list aligned with X.
corpus = text_preprocessing(text = X)

In [74]:
# Raw headline stored under index label 0, for comparison with the cleaned one.
X.loc[0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [75]:
# First cleaned headline produced by text_preprocessing.
corpus[0]

'hous dem aid even see comey letter jason chaffetz tweet'

In [86]:
def word_embedding(corpus, vocabulary_size = 5000, vector_len = 25):
    """Encode sentences as fixed-length sequences of hashed word ids.

    Parameters
    ----------
    corpus : iterable of str
        Sentences to encode, one string per sample.
    vocabulary_size : int, default 5000
        Size of the hashing space passed to the hashing function as ``n``.
    vector_len : int, default 25
        Length every encoded sequence is padded (or truncated) to.

    Returns
    -------
    numpy.ndarray
        The array returned by ``pad_sequences``, one row per sentence and
        ``vector_len`` columns, padded at the front ('pre').
    """
    # Use the md5 hash rather than Python's built-in hash(): the built-in string
    # hash is salted per interpreter process, so ids produced by it differ
    # between kernel restarts and a trained model could not be reused.
    hashed = [hashing_trick(text = sent, n = vocabulary_size, hash_function = 'md5')
              for sent in corpus]
    return pad_sequences(sequences = hashed, maxlen = vector_len, padding = 'pre')

# Encode the cleaned headlines with the default vocabulary size and length.
embedded = word_embedding(corpus = corpus)

In [87]:
# (number of headlines, padded sequence length)
np.shape(embedded)

(20242, 25)

In [97]:
# Materialise features and labels as NumPy arrays for scikit-learn / Keras.
X_final, y_final = np.array(embedded), np.array(y)

In [105]:
# Hold out 20% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size = 0.2,
    random_state = 0,
)

In [109]:
# Hyper-parameters. vocabulary_size and vector_len must agree with the values
# used when the headlines were encoded; features_num is the embedding width.
vocabulary_size, vector_len, features_num = 5000, 25, 20

# Embedding -> Dropout -> LSTM -> Dropout -> single sigmoid unit (binary label).
model = Sequential([
    Embedding(input_dim = vocabulary_size, output_dim = features_num,
              input_length = vector_len),
    Dropout(rate = 0.2),
    LSTM(units = 100),
    Dropout(rate = 0.2),
    Dense(units = 1, activation = 'sigmoid'),
])

model.compile(loss = 'binary_crossentropy',
              optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001),
              metrics = ['accuracy'])

In [110]:
# Train for 10 epochs in mini-batches of 64.
# NOTE(review): the held-out test split is also used as validation data here.
model.fit(
    x = X_train,
    y = y_train,
    batch_size = 64,
    epochs = 10,
    validation_data = (X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1ae8ba9b580>

In [115]:
# Sequential.predict_classes is deprecated and absent from later TensorFlow
# releases. For a single sigmoid output the documented equivalent is to
# threshold the predicted probability at 0.5 and cast to int32.
y_pred = (model.predict(X_test) > 0.5).astype('int32')



In [117]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true = y_test, y_pred = y_pred))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91      2067
           1       0.90      0.92      0.91      1982

    accuracy                           0.91      4049
   macro avg       0.91      0.91      0.91      4049
weighted avg       0.91      0.91      0.91      4049

