In [1]:
import pandas as pd

In [2]:
import numpy as np

In [4]:
# Load the headline dataset from a CSV file in the working directory.
df = pd.read_csv("clickbait_data.csv")

In [5]:
# Build a word -> vector lookup from the GloVe text file. Each line is parsed
# as a token followed by whitespace-separated float components.
embeddings_index = {}
# `with` guarantees the file handle is released even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [6]:
# Preview the loaded frame (rich display of the last expression).
df

Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1
...,...,...
31995,"To Make Female Hearts Flutter in Iraq, Throw a...",0
31996,"British Liberal Democrat Patsy Calton, 56, die...",0
31997,Drone smartphone app to help heart attack vict...,0
31998,"Netanyahu Urges Pope Benedict, in Israel, to D...",0


In [7]:
# Model input: the raw text of the `headline` column.
data = df["headline"]

In [8]:
# Target values: the `clickbait` column.
labels = df.clickbait

In [9]:
# Number of headlines available.
data.shape

(32000,)

In [10]:
# Scratch arithmetic: 20% of 32000 rows.
# NOTE(review): the result is not referenced by any later cell.
0.2*32000

6400.0

In [13]:
# Scratch arithmetic: rows left for training when 3000 are held out; the
# result (29000) is the split index hard-coded in the slicing cells.
32000-3000

29000

In [14]:
# Training inputs: the first 29000 headlines, taken by slice without shuffling.
# NOTE(review): the frame preview shows label 1 in the first rows and label 0
# in the last rows. If the file is ordered by label, this positional split
# (and any tail-based hold-out) will be class-skewed — confirm the ordering
# and consider shuffling before splitting.
x_train = data[0:29000]

In [15]:
# Test inputs: every headline from position 29000 onward.
x_test = data[29000:]

In [16]:
# Training labels, sliced with the same bounds as x_train so rows stay aligned.
y_train = labels[0:29000] 

In [17]:
# Test labels, sliced with the same bounds as x_test so rows stay aligned.
y_test = labels[29000:]

In [18]:
import pandas as pd
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt

In [19]:
# Fit the vocabulary on the training headlines only, then turn those headlines
# into integer id sequences of fixed length 200.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train.values)
sequences = sequence.pad_sequences(
    tokenizer.texts_to_sequences(x_train.values),
    maxlen=200,
)

In [20]:
# Sanity check: one row per training headline, 200 columns after padding.
sequences.shape

(29000, 200)

In [21]:
# Number of distinct words the tokenizer learned from the training headlines.
len(tokenizer.word_index)

22735

In [56]:
# Initial weights for the Embedding layer: row i holds the GloVe vector of the
# word whose tokenizer index is i.
# The row count is derived from the fitted tokenizer rather than copied by hand
# from a previous cell's output. It is vocabulary size + 1 because Keras
# Tokenizer indices start at 1 (index 0 is the padding id). This value must
# equal the Embedding layer's input_dim.
max_words = len(tokenizer.word_index) + 1
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words absent from GloVe keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [52]:
# Network trunk: integer token ids -> embedding lookup -> two stacked LSTMs.
# The Embedding dimensions (22736 x 100) have to match the shape of
# `embedding_matrix`, which is loaded into this layer in a later cell.
input_layer = Input(shape=(None,), dtype='int32', name='headline_input')
x = layers.Embedding(22736, 100, input_length=200)(input_layer)
# First LSTM emits the full sequence so the second LSTM can consume it.
x = layers.LSTM(
    32,
    return_sequences=True,
    dropout=0.1,
    recurrent_dropout=0.5,
)(x)
# Second LSTM emits only its final state, giving one vector per headline.
x = layers.LSTM(
    32,
    return_sequences=False,
    dropout=0.1,
    recurrent_dropout=0.5,
)(x)


In [53]:
# Classification head on top of the LSTM output: a 100-unit ReLU layer, then a
# single sigmoid unit producing a score in (0, 1).
# NOTE(review): `x` is rebound from its previous value, so re-running this
# cell alone stacks another Dense layer on top of the previous one.
x = layers.Dense(100, activation='relu')(x)
output = layers.Dense(1, activation='sigmoid')(x)

In [54]:
# Tie the input tensor and the sigmoid output together into a Keras model.
model = Model(input_layer,output)

In [55]:
# Print the layer stack and parameter counts.
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
headline_input (InputLayer)  [(None, None)]            0         
_________________________________________________________________
embedding_4 (Embedding)      (None, None, 100)         2273600   
_________________________________________________________________
lstm_8 (LSTM)                (None, None, 32)          17024     
_________________________________________________________________
lstm_9 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_8 (Dense)              (None, 100)               3300      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 2,302,345
Trainable params: 2,302,345
Non-trainable params: 0
_________________________________________________

In [57]:
# model.layers[1] is the Embedding layer (layer 0 is the input layer).
# Load the GloVe-initialised matrix into it and freeze it so training leaves
# the pretrained vectors untouched.
embedding_layer = model.layers[1]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [58]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked under the metric name 'acc'.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [59]:
from tensorflow.keras.callbacks import EarlyStopping

In [60]:
# Stop training based on validation loss (lower is better).
# No `patience` is passed, so the callback's default applies.
es = EarlyStopping(monitor='val_loss', mode='min')

In [61]:
# Train for at most 20 epochs; 20% of the training arrays are reserved for
# validation and the early-stopping callback may end training sooner.
# `history` keeps the per-epoch metrics returned by fit().
history = model.fit(sequences, y_train.values, epochs=20, validation_split=0.2, callbacks = [es])

Train on 23200 samples, validate on 5800 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20


In [62]:
# Persist the trained model to clickbait.h5 in the working directory.
model.save("clickbait.h5")

In [None]:
# Encode the held-out headlines with the tokenizer fitted on the training set
# and pad them to the same length of 200. This rebinds `sequences`, which
# held the training matrix until now.
sequences = sequence.pad_sequences(
    tokenizer.texts_to_sequences(x_test.values),
    maxlen=200,
)

In [70]:
# Rebind x_test from the raw headline Series to the padded id matrix.
# NOTE(review): once this has run, the preceding cell (which reads
# `x_test.values`) will likely fail if re-run unless x_test is re-created
# from `data` first.
x_test = sequences

In [73]:
# Evaluate on the held-out test matrix in batches of 200; the result holds the
# loss followed by the compiled 'acc' metric.
score = model.evaluate(x_test, y_test.values, batch_size=200, verbose=2)

3000/1 - 5s - loss: 0.2057 - acc: 0.9233


In [74]:
# Show the [loss, accuracy] pair returned by evaluate().
score

[0.1826317250728607, 0.92333335]

In [90]:
def encoder(text):
    """Convert one raw string into the padded id matrix the model consumes.

    The string is wrapped in a single-item list, mapped to token ids with the
    notebook-level ``tokenizer``, and padded to length 200, so the result
    contains exactly one row.
    """
    token_ids = tokenizer.texts_to_sequences([text])
    return sequence.pad_sequences(token_ids, maxlen=200)

In [98]:
def predict(text):
    """Classify a single headline string with the trained model.

    Prints the raw model output for inspection, then rounds it and returns
    "Clickbait" when the rounded value equals 1, otherwise "Not Clickbait".
    """
    prediction = model.predict(encoder(text))
    print(prediction)
    rounded = np.round(prediction)
    return "Clickbait" if rounded == 1 else "Not Clickbait"

In [99]:
# Manual spot check on a hand-written phrase.
predict("Click here")

[[0.57901615]]


'Clickbait'

In [100]:
# Manual spot check on a hand-written phrase.
predict("This is amazing")

[[0.8975421]]


'Clickbait'

In [101]:
# Manual spot check on a short news-style phrase.
predict("Trump finds out")

[[0.41652465]]


'Not Clickbait'

In [102]:
# Manual spot check on a hand-written phrase.
predict("This will make your life easier")

[[0.98155046]]


'Clickbait'

In [103]:
# Manual spot check on a hand-written phrase.
predict("What the heck")

[[0.9191387]]


'Clickbait'

In [105]:
# Manual spot check on a hand-written phrase.
predict("You will never feel the same")

[[0.99736315]]


'Clickbait'

In [None]:
# Edge case: empty string input (this cell has no recorded execution).
predict("")