**PROJECT OBJECTIVE**: Build a sequential NLP classifier which can use input text parameters to determine the customer sentiments.

### 1. Read and explore the data

In [4]:
import json
import pandas as pd
import numpy as np

In [32]:
# reading JSON file
df = pd.read_json('Sarcasm_Headlines_Dataset.json',lines=True)
# displaying sample output
df.head(5)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_link  26709 non-null  object
 1   headline      26709 non-null  object
 2   is_sarcastic  26709 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 626.1+ KB


- There are no null values and the relevant columns are "headline" and "is_sarcastic"

### 2. Retain relevant columns

In [33]:
# Keeping only the relevant columns "headline" and "is_sarcastic"
df.drop(['article_link'], axis =1, inplace= True)
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


### 3. Get length for each sentence

In [None]:
# Length of each character in a sentence including spaces between words
df['headline'].apply(len)

0        78
1        84
2        79
3        84
4        64
         ..
26704    36
26705    23
26706    21
26707    60
26708    33
Name: headline, Length: 26709, dtype: int64

In [None]:
# Length of only words in a sentence excluding spaces
df['len'] = df['headline'].str.split(" ").str.len()
df['len']

0        12
1        14
2        14
3        13
4        11
         ..
26704     5
26705     4
26706     3
26707     8
26708     6
Name: len, Length: 26709, dtype: int64

### 4. Define parameters

In [34]:
max_features = 30000
maxlen = 30
embedding_size = 50

### 5. Get indices for words

In [35]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(df['headline']))
tokenizer.word_index

{'to': 1,
 'of': 2,
 'the': 3,
 'in': 4,
 'for': 5,
 'a': 6,
 'on': 7,
 'and': 8,
 'with': 9,
 'is': 10,
 'new': 11,
 'trump': 12,
 'man': 13,
 'from': 14,
 'at': 15,
 'about': 16,
 'you': 17,
 'this': 18,
 'by': 19,
 'after': 20,
 'up': 21,
 'out': 22,
 'be': 23,
 'how': 24,
 'as': 25,
 'it': 26,
 'that': 27,
 'not': 28,
 'are': 29,
 'your': 30,
 'his': 31,
 'what': 32,
 'he': 33,
 'all': 34,
 'just': 35,
 'who': 36,
 'has': 37,
 'will': 38,
 'more': 39,
 'one': 40,
 'into': 41,
 'report': 42,
 'year': 43,
 'why': 44,
 'have': 45,
 'area': 46,
 'over': 47,
 'donald': 48,
 'u': 49,
 'day': 50,
 'says': 51,
 's': 52,
 'can': 53,
 'first': 54,
 'woman': 55,
 'time': 56,
 'like': 57,
 'her': 58,
 "trump's": 59,
 'old': 60,
 'no': 61,
 'get': 62,
 'off': 63,
 'an': 64,
 'life': 65,
 'people': 66,
 'obama': 67,
 'now': 68,
 'house': 69,
 'still': 70,
 "'": 71,
 'women': 72,
 'make': 73,
 'was': 74,
 'than': 75,
 'white': 76,
 'back': 77,
 'my': 78,
 'i': 79,
 'clinton': 80,
 'down': 81,
 'i

In [36]:
#text_to_sequence converts the generated tokens in the sequence as per the sentence that we had feed to the tokenizer class.
X = tokenizer.texts_to_sequences(df['headline'])

print("Number of Samples:", len(X))       
print(X[0]) #tokens of the first sentence in the sequence of the words

Number of Samples: 26709
[307, 15114, 678, 3336, 2297, 47, 381, 2575, 15115, 5, 2576, 8433]


### 6. Create features and labels

In [37]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

#Padding is added to each sequence to have equal length of sentences
X = pad_sequences(X, maxlen = maxlen)
y = np.asarray(df['is_sarcastic'])

print("Shape of input X:", X.shape)
print("Number of Labels: ", len(y))   
print(y)

Shape of input X: (26709, 30)
Number of Labels:  26709
[0 0 1 ... 0 0 0]


### 7. Get vocabulary size

In [38]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

29657

### 8. Create a weight matrix using GloVe embeddings

In [15]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2022-11-11 15:18:23--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-11-11 15:18:23--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-11-11 15:18:23--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [39]:
EMBEDDING_FILE = 'glove.6B.50d.txt'
embeddings = {}
for o in open(EMBEDDING_FILE,  'r', encoding='utf-8'):
    word = o.split(" ")[0]
    # print(word)
    embd = o.split(" ")[1:]
    embd = np.asarray(embd, dtype='float32')
    # print(embd)
    embeddings[word] = embd

In [40]:
# Create Weight Matrix
embedding_matrix = np.zeros((vocab_size, 50))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [18]:
print (len(embeddings))

400000


In [21]:
print(embedding_matrix.shape)

(29657, 50)


In [41]:
from keras.layers import Embedding

embedding_layer = Embedding(vocab_size,
                            50,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)

### 9. Define and compile a Bidirectional LSTM model.

In [42]:
#Dividing training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 50, stratify=y, test_size=0.1)

In [43]:
print(X_train.shape, X_test.shape)

(24038, 30) (2671, 30)


In [44]:
#Defining a Bidirectional LSTM model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Flatten, Dense, TimeDistributed
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Activation, Bidirectional, LSTM, Dense, Dropout, Flatten, Input
from functools import partial
from tensorflow.keras.layers import BatchNormalization
from keras.models import Model

model = Sequential()
model.add(embedding_layer)
model.add(Bidirectional(LSTM(64, activation = 'relu', return_sequences=True)))

model.add(TimeDistributed(Dense(100)))

model.add(Flatten())
model.add(Dense(1024, activation = 'relu'))
model.add(Dense(512, activation = 'relu'))
model.add(Dense(256, activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))

In [45]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [46]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 50)            1482850   
                                                                 
 bidirectional_1 (Bidirectio  (None, 30, 128)          58880     
 nal)                                                            
                                                                 
 time_distributed_1 (TimeDis  (None, 30, 100)          12900     
 tributed)                                                       
                                                                 
 flatten_1 (Flatten)         (None, 3000)              0         
                                                                 
 dense_6 (Dense)             (None, 1024)              3073024   
                                                                 
 dense_7 (Dense)             (None, 512)              

### 10. Fit the model and check the validation accuracy

In [47]:
#Training the model
model.fit(X_train, y_train, validation_split=0.1, epochs=4, batch_size=1000)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f6fdaa844d0>

In [48]:
pred = model.predict(X_test[0].reshape((1, 30)))



In [49]:
y_test[0] #Actual output

1

In [50]:
#Predicted value
print(pred[0]) # Sigmoid value 

[0.9373475]


In [51]:
if pred[0] >= 0.5:
  print('Sarcastic')
else:
  print('Not Sarcastic')

Sarcastic


In [52]:
# Check the validation accuracy
score = model.evaluate(X_test, y_test)
print("Loss: {}, Accuracy:{}".format(score[0], score[1]))

Loss: 0.395680695772171, Accuracy:0.8274054527282715


The model gives an accuracy of 0.8274