In [63]:
import json
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense,Embedding

In [64]:
VOCAB_SIZE = 10000
OOV_TOKEN = '<OOV>'
MAX_LEN = 50
EMB_DIMENSION = 12

In [13]:
df = pd.DataFrame(columns= ['headline', 'is_sarcastic'])
df

Unnamed: 0,headline,is_sarcastic


In [98]:

row_list = []
with open('./app/data/Sarcasm_Headlines_Dataset_v2.json', 'r') as file :
    for jsonText in file.readlines():
        row_list.append(json.loads(jsonText))

In [99]:
df = pd.DataFrame(row_list)

In [28]:
df.drop('article_link', axis=1 , inplace=True)

In [29]:
df

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...
...,...,...
28614,1,jews to celebrate rosh hashasha or something
28615,1,internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...
28617,1,mars probe destroyed by orbiting spielberg-gat...


In [38]:
df.loc[:5, ['headline']]

Unnamed: 0,headline
0,thirtysomething scientists unveil doomsday clo...
1,dem rep. totally nails why congress is falling...
2,eat your veggies: 9 deliciously different recipes
3,inclement weather prevents liar from getting t...
4,mother comes pretty close to using word 'strea...
5,my white inheritance


In [48]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)

In [82]:
tokenizer.fit_on_texts(df['headline'])

In [50]:
tokenizer.word_index

{'<OOV>': 1,
 'to': 2,
 'of': 3,
 'the': 4,
 'in': 5,
 'for': 6,
 'a': 7,
 'on': 8,
 'and': 9,
 'with': 10,
 'is': 11,
 'new': 12,
 'trump': 13,
 'man': 14,
 'at': 15,
 'from': 16,
 'about': 17,
 'by': 18,
 'after': 19,
 'you': 20,
 'this': 21,
 'out': 22,
 'up': 23,
 'be': 24,
 'as': 25,
 'that': 26,
 'it': 27,
 'how': 28,
 'not': 29,
 'he': 30,
 'his': 31,
 'are': 32,
 'your': 33,
 'just': 34,
 'what': 35,
 'all': 36,
 'who': 37,
 'has': 38,
 'will': 39,
 'report': 40,
 'into': 41,
 'more': 42,
 'one': 43,
 'have': 44,
 'year': 45,
 'over': 46,
 'why': 47,
 'day': 48,
 'u': 49,
 'area': 50,
 'woman': 51,
 'can': 52,
 's': 53,
 'says': 54,
 'donald': 55,
 'time': 56,
 'first': 57,
 'like': 58,
 'no': 59,
 'her': 60,
 'get': 61,
 'off': 62,
 'old': 63,
 "trump's": 64,
 'life': 65,
 'now': 66,
 'people': 67,
 "'": 68,
 'an': 69,
 'house': 70,
 'still': 71,
 'obama': 72,
 'white': 73,
 'back': 74,
 'make': 75,
 'was': 76,
 'than': 77,
 'women': 78,
 'if': 79,
 'down': 80,
 'when': 81,
 '

In [51]:
sequence =  tokenizer.texts_to_sequences(df['headline'])

In [55]:
padded_sequence = pad_sequences(sequence, maxlen=MAX_LEN
                                )

In [57]:
padded_sequence.shape

(28619, 50)

In [58]:
label = df['is_sarcastic']

In [60]:
label.shape

(28619,)

In [62]:
# split the data


X_train, X_test, y_train, y_test = train_test_split(padded_sequence, label , test_size=0.2)

In [71]:
model = Sequential([
    Embedding(VOCAB_SIZE, EMB_DIMENSION , input_length = MAX_LEN),
    SimpleRNN(128, activation='relu'),
    Dense(1, activation= 'sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [72]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test,y_test))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [101]:
def predict(text):
    sequence =  tokenizer.texts_to_sequences(text)
    print(sequence)
    padded =  pad_sequences(sequence, maxlen=MAX_LEN)
    return model.predict(padded)

In [102]:
predict(['Fantastic idea to wait until the last minute'])

[[8345, 451, 2, 1016, 413, 4, 114, 621]]


array([[0.9999927]], dtype=float32)

In [97]:
model.save('app/model/sarcasm.h5')

  saving_api.save_model(
