In [1]:
import json
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords

In [2]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data=pd.read_json("Sarcasm_Headlines_dataset.json",lines=True)

In [4]:
data.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
data.drop(['article_link'],axis=1)

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
26704,american politics in moral free-fall,0
26705,america's best 20 hikes,0
26706,reparations and obama,0
26707,israeli ban targeting boycott supporters raise...,0


In [6]:
sentence=data['headline'].tolist()

In [7]:
label=data['is_sarcastic'].tolist()

In [8]:
sentence[:10]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages"]

In [9]:
nltk.download('punkt')
stop_words=set(stopwords.words("english"))

#Function to remove stopwords from a single sentence
def remove_stopwords(sentence):
    words=nltk.word_tokenize(sentence)
    filtered_words=[word for word in words if word.lower()not in stop_words]
    return"".join(filtered_words)

#Remove stopwords from each sentence in the list
sentence=[remove_stopwords(sentence) for sentence in sentence]

#Print the sentences without stopwords
sentence[:10]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


["formerversacestoreclerksuessecret'blackcode'minorityshoppers",
 "'roseanne'revivalcatchesthornypoliticalmood,betterworse",
 "momstartingfearson'swebseriesclosestthinggrandchild",
 'boehnerwantswifelisten,comealternativedebt-reductionideas',
 'j.k.rowlingwishessnapehappybirthdaymagicalway',
 "advancingworld'swomen",
 'fascinatingcaseeatinglab-grownmeat',
 'ceosendkidsschool,workcompany',
 'topsnakehandlerleavessinkinghuckabeecampaign',
 "friday'smorningemail:insidetrump'spresserages"]

In [10]:
import string

nltk.download('punkt')

# Define a set of punctuation characters
punctuations = set(string.punctuation)

# Function to remove punctuation from a single sentence
def remove_punctuation(sentence):
    words = nltk.word_tokenize(sentence)
    filtered_words = [word for word in words if word.lower() not in punctuations]
    return " ".join(filtered_words)

#Remove punctuations from each sentence in the list
sentence=[remove_punctuation(sentence) for sentence in sentence]

#Print the sentences without punctuations
sentence[:10]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


["formerversacestoreclerksuessecret'blackcode'minorityshoppers",
 "'roseanne'revivalcatchesthornypoliticalmood betterworse",
 "momstartingfearson'swebseriesclosestthinggrandchild",
 'boehnerwantswifelisten comealternativedebt-reductionideas',
 'j.k.rowlingwishessnapehappybirthdaymagicalway',
 "advancingworld'swomen",
 'fascinatingcaseeatinglab-grownmeat',
 'ceosendkidsschool workcompany',
 'topsnakehandlerleavessinkinghuckabeecampaign',
 "friday'smorningemail insidetrump'spresserages"]

In [11]:
train_size=round(len(sentence)*0.75)
train_sen=sentence[0:train_size]
test_sen=sentence[train_size:]
train_label=label[0:train_size]
test_label=label[train_size:]

In [12]:
vocab_size=10000
oov_tok="oov"

In [13]:
tokenizer=Tokenizer(num_words=vocab_size,oov_token=oov_tok)
tokenizer.fit_on_texts(train_sen)
word_index=tokenizer.word_index

In [14]:
word_index

{'oov': 1,
 's': 2,
 'report': 3,
 'year': 4,
 'u': 5,
 "''": 6,
 'study': 7,
 "'s": 8,
 'a': 9,
 'k': 10,
 'watch': 11,
 'c': 12,
 'to': 13,
 '000': 14,
 'video': 15,
 'historicalarchives': 16,
 'j': 17,
 'trump': 18,
 'breaking': 19,
 'of': 20,
 'huffpollster': 21,
 '5': 22,
 'poll': 23,
 'n': 24,
 'huffpostrise': 25,
 '11': 26,
 "n't": 27,
 '1': 28,
 'in': 29,
 '2': 30,
 'd': 31,
 'the': 32,
 'ex': 33,
 'm': 34,
 '9': 35,
 'anti': 36,
 'yes': 37,
 'obama': 38,
 'policesay': 39,
 '7': 40,
 't': 41,
 'update': 42,
 'self': 43,
 'on': 44,
 '20': 45,
 '10': 46,
 '3': 47,
 '2017': 48,
 'and': 49,
 'huffposthill': 50,
 'f': 51,
 '8': 52,
 "friday'smorningemail": 53,
 'look': 54,
 'old': 55,
 'police': 56,
 'sundayroundup': 57,
 "thursday'smorningemail": 58,
 '4': 59,
 'watchlive': 60,
 'real': 61,
 'middle': 62,
 'non': 63,
 '20funniesttweetswomenweek': 64,
 'i': 65,
 '6': 66,
 'first': 67,
 'for': 68,
 'all': 69,
 'time': 70,
 'out': 71,
 'e': 72,
 'l': 73,
 'by': 74,
 'well': 75,
 'sen'

# Padding

In [15]:
max_length=100 #max length of a sentence can be 100,if not specified,
#then the length of the longest sentence is sent 
trunc_type='post'
padding_type='post'

In [16]:
training_sequences=tokenizer.texts_to_sequences(train_sen)
#assigning sequences,numbers to tokens as model can only train on numbers
training_padded=pad_sequences(training_sequences,maxlen=max_length,padding=padding_type,truncating=trunc_type)
#in any raw text data,naturally there will be sentences of different lengths,all nn require the same input
testing_sequences=tokenizer.texts_to_sequences(test_sen)

testing_padded=pad_sequences(testing_sequences,maxlen=max_length,padding=padding_type,truncating=trunc_type)

In [17]:
embedding_dim=16
model=tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),

    tf.keras.layers.GlobalAveragePooling1D(),

    tf.keras.layers.Dense(24,activation='relu'),

    tf.keras.layers.Dense(1,activation='sigmoid')])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160433 (626.69 KB)
Trainable params: 160433 (626.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
training_padded=np.array(training_padded)
training_labels=np.array(train_label)
testing_padded=np.array(testing_padded)
testing_labels=np.array(test_label)
#converting the list to array for tensorflow

In [19]:
#training the model ... finally ghar ja sakte hai

num_epochs=30
history=model.fit(training_padded,training_labels,epochs=num_epochs,validation_data=(testing_padded,testing_labels),verbose=2)

Epoch 1/30
626/626 - 3s - loss: 0.6863 - accuracy: 0.5600 - val_loss: 0.6856 - val_accuracy: 0.5636 - 3s/epoch - 4ms/step
Epoch 2/30
626/626 - 2s - loss: 0.6858 - accuracy: 0.5602 - val_loss: 0.6847 - val_accuracy: 0.5636 - 2s/epoch - 3ms/step
Epoch 3/30
626/626 - 2s - loss: 0.6852 - accuracy: 0.5602 - val_loss: 0.6844 - val_accuracy: 0.5636 - 2s/epoch - 4ms/step
Epoch 4/30
626/626 - 2s - loss: 0.6836 - accuracy: 0.5602 - val_loss: 0.6840 - val_accuracy: 0.5636 - 2s/epoch - 3ms/step
Epoch 5/30
626/626 - 2s - loss: 0.6801 - accuracy: 0.5677 - val_loss: 0.6795 - val_accuracy: 0.5780 - 2s/epoch - 3ms/step
Epoch 6/30
626/626 - 2s - loss: 0.6709 - accuracy: 0.5986 - val_loss: 0.6765 - val_accuracy: 0.5809 - 2s/epoch - 3ms/step
Epoch 7/30
626/626 - 2s - loss: 0.6536 - accuracy: 0.6450 - val_loss: 0.6776 - val_accuracy: 0.5832 - 2s/epoch - 3ms/step
Epoch 8/30
626/626 - 2s - loss: 0.6301 - accuracy: 0.6783 - val_loss: 0.6676 - val_accuracy: 0.5880 - 2s/epoch - 3ms/step
Epoch 9/30
626/626 - 2s 

In [20]:
sen=["Coworkers at bathroom sink locked in tense standoff over who is going to wash hands longer",
    "The covid cases are rising"]
seq=tokenizer.texts_to_sequences(sen)
padded=pad_sequences(seq,maxlen=max_length,padding=padding_type,truncating=trunc_type)
print(model.predict(padded))

[[0.98818344]
 [0.05000471]]
