# Sarcasm Detection

https://drive.google.com/drive/folders/1xUnF35naPGU63xwRDVGc-DkZ3M8V5mMk

## Install `Tensorflow2.0` 

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
%tensorflow_version 2.x
import tensorflow as tf

## Read Data  from "Sarcasm_Headlines_Dataset.json". Explore the data and get  some insights about the data. 

In [4]:
data = pd.read_json(project_path+'Sarcasm_Headlines_Dataset.json', lines=True)

In [5]:
data.sample(10)

Unnamed: 0,article_link,headline,is_sarcastic
22896,https://www.huffingtonpost.com/entry/khloe-kar...,khloe kardashian channels priscilla presley in...,0
29,https://www.huffingtonpost.com/entry/remembran...,remembrance is the beginning of the task,0
26450,https://www.huffingtonpost.com/entry/stranger-...,filmmaker says 'stranger things' creators stol...,0
26505,https://local.theonion.com/exhausted-florida-r...,exhausted florida resident returns home after ...,1
21345,https://www.theonion.com/2012-marvel-handbook-...,2012 marvel handbook casually reveals peter pa...,1
24133,https://politics.theonion.com/pentagon-report-...,pentagon report concludes too many soldiers ha...,1
4476,https://www.huffingtonpost.com/entry/is-music-...,is music dead? (thoughts on the music industry...,0
12107,https://www.huffingtonpost.com/entry/chris-chr...,chris christie's strange justice,0
2341,https://www.huffingtonpost.com/entry/how-not-t...,how not to look like a tourist in berlin,0
11718,https://www.theonion.com/depressed-crab-stays-...,depressed crab stays buried under sand until 2...,1


In [6]:
data["headline"].value_counts()

the 20 funniest tweets from women this week                                             10
sunday roundup                                                                          10
'no way to prevent this,' says only nation where this regularly happens                  8
the funniest tweets from parents this week                                               6
the funniest tweets from women this week                                                 4
                                                                                        ..
study finds eating doctor after birth can provide essential nutrients to new mothers     1
new report finds moving to isolated seaside cottage greatly increases productivity       1
iraqi forces open fire on protesters storming green zone                                 1
report: revolving door gave goldman access to fed secrets                                1
blackberry still exists, and it's doing alright                                          1

In [7]:
data["is_sarcastic"].value_counts()

0    14985
1    11724
Name: is_sarcastic, dtype: int64

## Drop `article_link` from dataset

In [8]:
data = data.drop(columns="article_link", axis=1)

In [9]:
data.sample(5)

Unnamed: 0,headline,is_sarcastic
5014,use of organic peanut butter adds two minutes ...,1
482,dentist offers to buy back halloween candy,0
20354,what my parents' divorce taught me about heart...,0
23031,florida man gets arrested with 'go directly to...,0
7657,democrats agree to compromise on superdelegate...,0


## Get the Length of each line and find the maximum length.

In [10]:
line_length = []
for i in range(len(data["headline"])):
  List = (data["headline"].iloc[i].split())
  line_length.append(len(List))
print(line_length)

[12, 14, 14, 13, 11, 4, 7, 14, 7, 9, 10, 8, 12, 10, 8, 8, 16, 7, 9, 3, 7, 16, 11, 9, 13, 7, 8, 4, 7, 7, 8, 10, 12, 9, 8, 8, 5, 15, 11, 10, 10, 15, 8, 8, 8, 6, 15, 11, 7, 4, 4, 11, 9, 13, 14, 11, 11, 9, 10, 12, 4, 5, 6, 13, 12, 9, 12, 17, 7, 7, 8, 9, 9, 8, 13, 11, 9, 10, 5, 8, 10, 8, 10, 13, 10, 11, 14, 11, 17, 13, 4, 19, 12, 8, 11, 9, 10, 10, 11, 9, 7, 12, 9, 10, 6, 9, 7, 10, 6, 11, 12, 12, 7, 22, 9, 11, 10, 19, 12, 13, 8, 3, 10, 10, 9, 9, 11, 8, 4, 14, 11, 8, 6, 11, 8, 7, 11, 9, 11, 9, 11, 11, 8, 6, 10, 8, 10, 9, 11, 15, 9, 8, 6, 11, 7, 9, 8, 11, 7, 18, 9, 7, 9, 14, 14, 13, 9, 15, 11, 8, 8, 10, 14, 9, 9, 6, 12, 5, 10, 7, 13, 14, 8, 7, 10, 9, 15, 10, 11, 11, 11, 12, 6, 8, 9, 12, 6, 8, 12, 16, 7, 14, 12, 9, 7, 13, 6, 7, 12, 9, 10, 6, 13, 15, 6, 22, 9, 11, 15, 9, 12, 10, 12, 10, 5, 8, 10, 9, 12, 5, 9, 8, 10, 11, 7, 9, 12, 12, 9, 8, 6, 14, 10, 11, 6, 7, 5, 13, 5, 8, 14, 9, 8, 9, 4, 5, 17, 13, 10, 7, 10, 12, 11, 8, 4, 7, 8, 10, 7, 11, 12, 12, 8, 16, 7, 8, 6, 8, 4, 9, 8, 6, 14, 8, 11, 14, 8

In [11]:
print("Maximum Length of sentense is ", max(line_length))

Maximum Length of sentense is  39


#**## Modelling**

## Import required modules required for modelling.

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Reshape, InputLayer, LSTM, Embedding, Dropout, Activation, Bidirectional
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.backend import clear_session

# Set Different Parameters for the model.

In [13]:
max_features = 1000
maxlen = 39 
embedding_size = 200

## Apply Keras Tokenizer of headline column of your data.

In [14]:
# Pre processing - Tokenization
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data["headline"])

# Define X and y for your model.

In [15]:
X = tokenizer.texts_to_sequences(data['headline'])
X = pad_sequences(X, maxlen = maxlen)
y = np.asarray(data['is_sarcastic'])
print("Number of Samples:", len(X))
print(X[0])
print("Number of Labels: ", len(y))
print(y[0])

Number of Samples: 26709
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 307 678
  47 381   5]
Number of Labels:  26709
0


## Get the Vocabulary size

In [16]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 29656 unique tokens.


## Get Glove Word Embeddings

In [None]:
glove_file = project_path + "glove.6B.zip"

In [None]:
#Extract Glove embedding zip file
from zipfile import ZipFile
with ZipFile(glove_file, 'r') as z:
  z.extractall()

# Get the Word Embeddings using Embedding file as given below.

In [17]:
EMBEDDING_FILE = project_path +'/glove.6B.200d.txt'

embeddings = {}
for o in open(EMBEDDING_FILE):
    word = o.split(" ")[0]
    # print(word)
    embd = o.split(" ")[1:]
    embd = np.asarray(embd, dtype='float32')
    # print(embd)
    embeddings[word] = embd

# Create a weight matrix for words in training docs

In [18]:
num_words=len(word_index)+1
embedding_matrix = np.zeros((num_words, 200))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

len(embeddings.values())

400000

## Create and Compile your Model

In [19]:
### Embedding layer for hint 
## model.add(Embedding(num_words, embedding_size, weights = [embedding_matrix]))
### Bidirectional LSTM layer for hint 
## model.add(Bidirectional(LSTM(128, return_sequences = True)))
tf.keras.backend.clear_session()
model = Sequential()
model.add(Embedding(num_words, embedding_size, weights = [embedding_matrix], input_length=200, trainable=True))
# RNN
model.add(Bidirectional(LSTM(128, recurrent_dropout=0.5, dropout=0.5, return_sequences = True)))
# FCNN
model.add(Dense(units=200, activation='relu'))
model.add(Dense(units=1,activation='sigmoid')) # Classification layer
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 200)          5931400   
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 256)          336896    
_________________________________________________________________
dense (Dense)                (None, 200, 200)          51400     
_________________________________________________________________
dense_1 (Dense)              (None, 200, 1)            201       
Total params: 6,319,897
Trainable params: 6,319,897
Non-trainable params: 0
_________________________________________________________________


# Fit your model with a batch size of 100 and validation_split = 0.2. and state the validation accuracy

In [21]:
batch_size = 100
epochs = 5

# Model - compiling the model
model.fit(X, y, epochs=5, batch_size=100, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f37561373c8>