In [130]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import load_model
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.normalization import BatchNormalization
from keras.layers import Flatten
from sklearn.metrics import confusion_matrix,classification_report

In [131]:
df1 = pd.read_csv('clickbait_data_1.txt',sep='\x01',header=None)
df2 = pd.read_csv('non_clickbait_data_1.txt',sep='\x01',header=None)

In [132]:
# Now we will add new column to dataframe wikth value = 0. It will act as class variable for clickbait data
df1['Class']=1
df2['Class']=0


df1['length']= df1[0].apply(len)
df1=df1.rename(index=str,columns={0:'Text'})

df2=df2.rename(index=str,columns={0:'Text'})
df2['length']= df2['Text'].apply(len)


In [133]:
df3 = pd.concat([df1,df2],ignore_index= True)

In [134]:
from keras.preprocessing.text import Tokenizer
vocabulary_size = 8000
tokenizer = Tokenizer(num_words= vocabulary_size)

In [135]:
tokenizer.fit_on_texts(df3['Text'])

In [136]:
sequences = tokenizer.texts_to_sequences(df3['Text'])

In [137]:
sequences

[[43, 92, 93, 4700],
 [31, 119, 2302, 946, 182, 35, 5, 537, 1],
 [3, 17, 120, 405, 3, 689, 1232, 753, 12, 277, 2, 278, 5, 4701],
 [13, 1480, 4, 17, 352, 8, 296, 279, 1790, 12, 844, 263],
 [6,
  690,
  353,
  6,
  1077,
  354,
  1481,
  14,
  40,
  434,
  29,
  4702,
  381,
  136,
  52,
  4703,
  1791,
  4704],
 [20, 2, 4705, 14, 1482, 251, 435, 635, 6, 2303, 4706, 4, 691],
 [947, 436, 28, 2, 4707, 18, 10, 845, 406],
 [71, 226, 1792, 91, 18, 578],
 [297, 6, 2304, 55, 3056, 15, 60, 3057, 10, 692],
 [2305,
  2306,
  9,
  4708,
  4709,
  579,
  6,
  87,
  2,
  183,
  20,
  4710,
  45,
  207,
  1,
  3,
  1233,
  407],
 [98, 263, 3058, 2, 3, 316, 3059],
 [44, 53, 92, 1234, 126, 3, 3060, 355, 1078, 214, 3061],
 [39, 53, 4711, 846, 636, 143, 95, 693, 408],
 [948, 1483, 37, 5, 4712],
 [23, 12, 10, 296, 538, 46, 8, 10, 121],
 [39, 3062, 332, 83, 409, 215, 1079, 2, 32, 539],
 [11, 5, 57, 4713, 494, 55, 4714],
 [3, 67, 165, 3063, 84, 1080, 61, 437, 2, 4715, 54, 61, 3064, 1, 6, 2307, 465],
 [72, 59

In [138]:
mxlen =40
from keras.preprocessing.sequence import pad_sequences
data = pad_sequences(sequences, maxlen=mxlen,padding = 'post',truncating='post')

In [139]:
data

array([[  43,   92,   93, ...,    0,    0,    0],
       [  31,  119, 2302, ...,    0,    0,    0],
       [   3,   17,  120, ...,    0,    0,    0],
       ...,
       [2701,   16,   16, ...,    0,    0,    0],
       [ 932, 2719,  347, ...,    0,    0,    0],
       [ 113, 1054,  569, ...,    0,    0,    0]])

In [140]:
matrix_df_op = df3.iloc[:,1].values

In [141]:
X_train,X_test,y_train,y_test = train_test_split(data,matrix_df_op, test_size =0.25, stratify=matrix_df_op)
sc= StandardScaler()
X_train = sc.fit_transform(X_train.astype(float))
X_test = sc.fit_transform(X_test.astype(float))

In [142]:
X_train[1]

array([-0.17356328, -0.55805544, -0.49604957, -0.64799065, -0.52917041,
       -0.45950033, -0.35892215, -0.47713407, -0.4063556 , -0.35020012,
       -0.280458  , -0.22543308, -0.16984966, -0.12524899, -0.0911228 ,
       -0.0762653 , -0.05671124, -0.04394857, -0.02555322, -0.01752284,
       -0.01708441,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

In [143]:
X_test.shape

(1415, 40)

In [144]:
y_train.shape

(4243,)

In [145]:
y_test.shape

(1415,)

In [146]:
##Working with 75% accuracy##
LSTMmodel = Sequential()
LSTMmodel.add(Embedding(vocabulary_size, 200, input_length=mxlen))
LSTMmodel.add(LSTM(100, dropout=0.2,return_sequences=True))
LSTMmodel.add(LSTM(50, dropout=0.2,return_sequences=True))
LSTMmodel.add(LSTM(24, dropout=0.2,return_sequences=True))
LSTMmodel.add(LSTM(10, dropout=0.2,return_sequences=True))
LSTMmodel.add(Flatten())
LSTMmodel.add(Dense(1, activation='sigmoid'))
LSTMmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
LSTMmodel.save('LSTMmodel.h5')
##End##

##Working with 75% accuracy##
ANNmodel = Sequential()
ANNmodel.add(Embedding(vocabulary_size, 100, input_length=mxlen))
ANNmodel.add(Flatten())
ANNmodel.add(Dense(activation="relu", input_dim=100, units=50, kernel_initializer="uniform"))
ANNmodel.add(Dense(activation="relu", units=24, kernel_initializer="uniform"))
ANNmodel.add(Dense(activation="relu", units=8, kernel_initializer="uniform"))
ANNmodel.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))
ANNmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
##End##

In [147]:
#model.fit(data, np.array(labels), validation_split=0.4, epochs=3)
LSTMmodel.fit(X_train,y_train,batch_size=50,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f669b1d400>

In [148]:
LSTMmodel.save('LSTMmodel.h5')

In [149]:
ANNmodel.fit(X_train,y_train,batch_size=100,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f65b29ecf8>

In [150]:
ANNmodel.save('ANNmodel.h5')

In [151]:
LSTMmodel = load_model('LSTMmodel.h5')
y_pred = LSTMmodel.predict(X_test)

y_pred = y_pred >0.5

cm_LSTM= confusion_matrix(y_test,y_pred)

In [152]:
print(classification_report(y_test,y_pred))
cm_LSTM

             precision    recall  f1-score   support

          0       0.69      0.88      0.77       790
          1       0.76      0.50      0.61       625

avg / total       0.72      0.71      0.70      1415



array([[692,  98],
       [310, 315]], dtype=int64)

In [153]:
ANNmodel = load_model('ANNmodel.h5')
y_pred = ANNmodel.predict(X_test)

y_pred = y_pred >0.5

cm_ANN= confusion_matrix(y_test,y_pred)

In [154]:
print(classification_report(y_test,y_pred))
cm_ANN

             precision    recall  f1-score   support

          0       0.69      0.92      0.79       790
          1       0.83      0.49      0.61       625

avg / total       0.75      0.73      0.71      1415



array([[728,  62],
       [320, 305]], dtype=int64)