In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Dropout,Embedding,LSTM,Bidirectional,Dense,Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import re
import json
import pandas as pd
from pandas import DataFrame

#Read the json file   
data = []
for line in open('Sarcasm_Headlines_Dataset.json', 'r'):
    data.append(json.loads(line)) 
  
# 1st headline
print(data[0])

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}


In [2]:
#convert list to dataframe

df = DataFrame(data,columns=['is_sarcastic','headline','article_link'])
print("shape of data is ",df.shape)
#drop the column article_link
df.drop(columns=['article_link'],inplace=True)
print(df.head(1))

print("columns after dropping article_link",df.columns)

shape of data is  (26709, 3)
   is_sarcastic                                           headline
0             0  former versace store clerk sues over secret 'b...
columns after dropping article_link Index(['is_sarcastic', 'headline'], dtype='object')


In [4]:
#Get length of each headline and add a column for that
df['col_len']=df['headline'].str.len()
#1st 2 elements of new dataframe with added column
print(df.head(3))

   is_sarcastic                                           headline  col_len
0             0  former versace store clerk sues over secret 'b...       78
1             0  the 'roseanne' revival catches up to our thorn...       84
2             1  mom starting to fear son's web series closest ...       79


In [7]:
#Initialize parameter values
max_features = 10000
maxlen = 25
embedding_size = 200

from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer=Tokenizer(num_words=max_features)


tokenizer.fit_on_texts(df.headline)
sequences=tokenizer.texts_to_sequences(df.headline)
#word indices
word_index=tokenizer.word_index
#padding
X=tf.keras.preprocessing.sequence.pad_sequences(sequences,maxlen=maxlen,padding='post')

#convert target column to numpy array
y=df['is_sarcastic']

#num_classes=len(y.values())
y=tf.keras.utils.to_categorical(y,dtype ="uint8")
#print shape
print('Shape of X is',X.shape)
print('Shape of y is',y.shape)

#number of words
num_words = len(word_index) + 1
print("length of vocabulary is ",num_words)

Shape of X is (26709, 25)
Shape of y is (26709, 2)
length of vocabulary is  29657


In [8]:
#Create embedding matrix
EMBEDDING_FILE = 'glove.6B.200d.txt'

embedding_dim = 200


embeddings = {}
for o in open(EMBEDDING_FILE, encoding="utf-8", mode="r"):
    word = o.split(" ")[0]
    # print(word)
    embd = o.split(" ")[1:]
    embd = np.asarray(embd, dtype='float32')
    # print(embd)
    embeddings[word] = embd

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((num_words, 200))




for word, i in tokenizer.word_index.items():
	embedding_vector = embeddings.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector
print(embedding_vector.shape)

(200,)


In [9]:
#Define model
embedding_layer = Embedding(len(word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)

model=tf.keras.Sequential()
model.add(embedding_layer)
model.add(Bidirectional(LSTM(units=128 , recurrent_dropout = 0.3 , dropout = 0.3,return_sequences = True)))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary() 

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 200)           5931400   
_________________________________________________________________
bidirectional (Bidirectional (No

In [10]:
    
#fit  the model
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3 , random_state = 0) 
history = model.fit(x_train, y_train, batch_size = 128 , validation_data = (x_test,y_test) , epochs = 3)

scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Train on 18696 samples, validate on 8013 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 83.93%
