**Importing Libraries**
---

In [1]:
import pandas as pd
import numpy as np
import os
import re
import seaborn as sns

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet
nltk.download("punkt")
nltk.download("stopwords")
from nltk.corpus import stopwords

import string
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from plotly import tools
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.figure_factory as ff

import matplotlib.pyplot as plt
from PIL import Image

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Importing Data from JSON file**
---



In [2]:
import pandas as pd
import json

try:
    with open('Sarcasm_Headlines_Dataset.json') as f:
        data = json.load(f)
except json.JSONDecodeError as e:
    print(f"JSONDecodeError: {e.msg}, line: {e.lineno}, column: {e.colno}")

df = pd.DataFrame(data)
print(df.head())


                                        article_link  \
0  https://www.huffingtonpost.com/entry/versace-b...   
1  https://www.huffingtonpost.com/entry/roseanne-...   
2  https://local.theonion.com/mom-starting-to-fea...   
3  https://politics.theonion.com/boehner-just-wan...   
4  https://www.huffingtonpost.com/entry/jk-rowlin...   

                                            headline  is_sarcastic  
0  former versace store clerk sues over secret 'b...             0  
1  the 'roseanne' revival catches up to our thorn...             0  
2  mom starting to fear son's web series closest ...             1  
3  boehner just wants wife to listen, not come up...             1  
4  j.k. rowling wishes snape happy birthday in th...             0  


**Count of Given Values with BAR PLOT**
---

In [3]:
print(df['is_sarcastic'].value_counts())

0    14985
1    11724
Name: is_sarcastic, dtype: int64


In [4]:
labels = ['Sarcasm', 'No Sarcasm']
values = [df['is_sarcastic'].sum(), len(df) - df['is_sarcastic'].sum()]

trace0 = go.Bar(x = [labels[1]], y = [values[1]], name = 'No Sarcasm')
trace1 = go.Bar(x = [labels[0]], y = [values[0]], marker= {'color': '#00cc66'} , name = 'Sarcasm')


data = [trace0, trace1]

layout = go.Layout(title = 'Number of Sarcastic Articles',
                   width = 800,
                   height = 500,
                  yaxis= dict(title = 'Number of articles'),)

fig = go.Figure(data, layout)

pyo.offline.iplot(fig)
     

**DATA CLEANING - PreProcessing**
---


In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

def preprocessHeadline(headline):
    headline = headline.lower()

    # Remove URLs
    headline = re.sub(r'http\S+|www\S+', '', headline)

    # Remove numbers
    headline = re.sub(r'\d+', '', headline)

    # Remove punctuation and special characters
    headline = re.sub(r'[^\w\s]', '', headline)

    # Tokenize the text
    tokens = word_tokenize(headline)

    # Lemmatize tokens and remove stop words
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords.words('english')]

    # Join tokens back into a string
    preprocessedHeadline = ' '.join(tokens)

    return preprocessedHeadline




**CLEANED DATA UPDATED**
---


In [6]:
df['lemmatizedHeadline'] = df['headline'].apply(preprocessHeadline)

**Original Data**

In [7]:
print("Original Data: ")
print(df.headline.head(5))

Original Data: 
0    former versace store clerk sues over secret 'b...
1    the 'roseanne' revival catches up to our thorn...
2    mom starting to fear son's web series closest ...
3    boehner just wants wife to listen, not come up...
4    j.k. rowling wishes snape happy birthday in th...
Name: headline, dtype: object


**Processed Data**

In [8]:
print("Processed Data: ")
print(df.lemmatizedHeadline.head(5))

Processed Data: 
0    former versace store clerk sue secret black co...
1    roseanne revival catch thorny political mood b...
2    mom starting fear son web series closest thing...
3    boehner want wife listen come alternative debt...
4     jk rowling wish snape happy birthday magical way
Name: lemmatizedHeadline, dtype: object


**Feature Extraction**
--

**BASELINE-1 : Contradiction in sentiment score**

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
import senticnet

ModuleNotFoundError: No module named 'senticnet'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm = SVC()
svm.fit(X_train, y_train)


**BASELINE-2 : n-gram Feature set**

**Uni-gram**

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
svm = SVC()

preProcessedHeadline = df['lemmatizedHeadline']
y = df.is_sarcastic

ngram1_vectorizer = CountVectorizer(ngram_range=(1, 1))  
X = ngram1_vectorizer.fit_transform(preProcessedHeadline)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

svm.fit(X_train, y_train)
accuracy1 = svm.score(X_test, y_test)
print("Uni-gram Accuracy : "+str(accuracy1))



Uni-gram Accuracy : 0.787220766254836


**Bi-gram**

In [19]:
ngram2_vectorizer = CountVectorizer(ngram_range=(1, 2))  
X = ngram2_vectorizer.fit_transform(preProcessedHeadline)

y = df.is_sarcastic
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

svm = SVC()
svm.fit(X_train, y_train)
accuracy2 = svm.score(X_test, y_test)
print("Bi-gram Accuracy : "+str(accuracy2))

Bi-gram Accuracy : 0.7814800948458754


**Tri-gram**

In [20]:
ngram3_vectorizer = CountVectorizer(ngram_range=(1, 3))  
X = ngram3_vectorizer.fit_transform(preProcessedHeadline)

y = df.is_sarcastic
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

svm = SVC()
svm.fit(X_train, y_train)
accuracy3 = svm.score(X_test, y_test)
print("Tri-gram Accuracy : "+str(accuracy3))

Tri-gram Accuracy : 0.7738795593111563


0.7872499732591721


0.7823296609263023


0.7738795593111563


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Assuming you have already trained your model and obtained predictions
predictions = model.predict(X_test)

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

# Print classification report
print("Classification Report:")
print(classification_report(y_test, predictions))
