In [1]:
#importing library


#tabular data manipulation
import numpy as np
import pandas as pd


#visualization library
import matplotlib.pyplot as plt
import seaborn as sns

#neural network machine learning library
import tensorflow as tf

#accuracy checking 

from sklearn.metrics import confusion_matrix,classification_report

#library to split the dataset into train and test set
from sklearn.model_selection import train_test_split


In [2]:
#loading the dataset
df=pd.read_json('/kaggle/input/sarcasm-detection-through-nlp/Sarcasm_Headlines_Dataset.json',lines=True)


#showing the dataset
df

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
...,...,...,...
26704,https://www.huffingtonpost.com/entry/american-...,american politics in moral free-fall,0
26705,https://www.huffingtonpost.com/entry/americas-...,america's best 20 hikes,0
26706,https://www.huffingtonpost.com/entry/reparatio...,reparations and obama,0
26707,https://www.huffingtonpost.com/entry/israeli-b...,israeli ban targeting boycott supporters raise...,0


In [3]:
#checking the basic informatin 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_link  26709 non-null  object
 1   headline      26709 non-null  object
 2   is_sarcastic  26709 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 626.1+ KB


In [4]:
#creating preprocessing 
def preprocess_input(df):
    df=df.copy()
    #droping the article_link column
    df=df.drop('article_link',axis=1)
    #splitting the data into x and y
    x=df['headline']
    y=df['is_sarcastic']
    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=1)
    
    return x_train,x_test,y_train,y_test
    

In [5]:
x_train,x_test,y_train,y_test=preprocess_input(df)

In [6]:
x_train

21906       don king enjoys grandilomentitudinous sandwich
24781    vital info on iraqi chemical weapons provided ...
22552    hacker releases new 'orange is the new black' ...
17857                      how boredom can lead to failure
20032                                mind the (gender) gap
                               ...                        
10955    5 ways to outsmart the supermarket and lose we...
17289    reflecting on the aids epidemic this gay men's...
5192     arianna joins payoff to 'reshape' financial se...
12172    rashida jones pays homage to the '90s with 'fl...
235      how to rebuild your credit after bankruptcy --...
Name: headline, Length: 18696, dtype: object

In [7]:
#creating a instance of tokenizer class
tokenizer=tf.keras.preprocessing.text.Tokenizer()

#assigning the attribute to a class

tokenizer.fit_on_texts(x_train)

In [8]:
x_train=tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(x_train),maxlen=40,padding='post')

In [9]:
x_test=tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(x_test),maxlen=40,padding='post')

In [10]:
x_train

array([[ 3579,   481,  1858, ...,     0,     0,     0],
       [ 6496,  4001,     7, ...,     0,     0,     0],
       [ 8490,   343,    11, ...,     0,     0,     0],
       ...,
       [ 4518,  2106, 24841, ...,     0,     0,     0],
       [24843,  1042,  1441, ...,     0,     0,     0],
       [   24,     1,  5280, ...,     0,     0,     0]], dtype=int32)

# Training the Model

In [11]:
inputs=tf.keras.Input(shape=(40,))
x=tf.keras.layers.Embedding(input_dim=24846,output_dim=64)(inputs)

x=tf.keras.layers.Flatten()(x)
x=tf.keras.layers.Dense(128,activation='relu')(x)
x=tf.keras.layers.Dense(128,activation='relu')(x)
outputs=tf.keras.layers.Dense(1,activation='sigmoid')(x)

2022-12-28 03:44:40.859069: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [12]:
model=tf.keras.Model(inputs=inputs,outputs=outputs)

In [13]:
#comiling the model

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy',
                      tf.keras.metrics.AUC(name='auc')])

In [14]:
print(model.summary())

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 40)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 40, 64)            1590144   
_________________________________________________________________
flatten (Flatten)            (None, 2560)              0         
_________________________________________________________________
dense (Dense)                (None, 128)               327808    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 1,934,593
Trainable params: 1,934,593
Non-trainable params: 0
___________________________________________________

In [17]:
history=model.fit(x_train,y_train,validation_split=0.2,
                batch_size=32,epochs=100,callbacks=[
                    tf.keras.callbacks.EarlyStopping(
                    monitor='val_loss',
                    patience=3,
                    restore_best_weights=True)
                ])

2022-12-28 03:49:17.205469: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


# Results

In [20]:
results=model.evaluate(x_test,y_test,verbose=0)
print('Acuracy:{:2f}%'.format(results[1])*100)


Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.861225%Acuracy:0.8612

In [15]:
inputs

<KerasTensor: shape=(None, 40) dtype=float32 (created by layer 'input_1')>