In [246]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [141]:
nlp=spacy.load('en_core_web_lg')
stop_words = set(stopwords.words('english'))

In [177]:
# loading in the data
df=pd.read_json(r'C:\Users\MUKU\Desktop\Python\NLP\nlp datasets\Sarcasm_Headlines_Dataset_v2.json',lines=True)

# Checking if the data is balanced
df['is_sarcastic'].value_counts()

0    14985
1    13634
Name: is_sarcastic, dtype: int64

In [218]:
def cleanData(doc):
    doc=doc.lower()
    tokens=[tokens for tokens in nlp(doc)]
    tokens=[tokens for tokens in tokens if tokens.is_stop==False]
    tokens=[tokens for tokens in tokens if tokens.is_punct==False]
    tokens=' '.join(tokens.lemma_ for tokens in tokens)
    return tokens

In [219]:
df['headline_cleaned'] = df['headline'].apply(lambda x: cleanData(x))

In [235]:
# Separating X and y
X=df['headline']
y=df['is_sarcastic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [236]:
# Counting and Tfidf
tfidf_vect=TfidfVectorizer()
X_train_tfidf=tfidf_vect.fit_transform(X_train)

In [237]:
# constructing the pipeline
svc_pipeline=Pipeline([('tfidf',TfidfVectorizer()),('svc',LinearSVC())])

In [238]:
# fitting and predicting
svc_pipeline.fit(X_train,y_train)
predictions=svc_pipeline.predict(X_test)

In [239]:
# Checking scores and metrics
def algorithm(algorithm):
    pipeline=Pipeline([('tfidf',TfidfVectorizer()),('algo',algorithm)])
    pipeline.fit(X_train,y_train)
    predictions=pipeline.predict(X_test)
    print(f'Accuracy Score is {round(accuracy_score(y_test,predictions),2)}')
    print('\n')
    print(f'{confusion_matrix(y_test,predictions)}')
    print('\n')
    print(f'{classification_report(y_test,predictions)}')

In [266]:
sgd=SGDClassifier()
svc=LinearSVC()
rfc=RandomForestClassifier()
clf=AdaBoostClassifier()
list_of_algos=[sgd,svc,rfc,clf]

In [248]:
algorithm(sgd)

Accuracy Score is 0.83


[[4057  859]
 [ 703 3826]]


              precision    recall  f1-score   support

           0       0.85      0.83      0.84      4916
           1       0.82      0.84      0.83      4529

    accuracy                           0.83      9445
   macro avg       0.83      0.84      0.83      9445
weighted avg       0.84      0.83      0.83      9445



In [267]:
# Trying out new sentences
X_test=pd.Series(['Light travels faster than sound. This is why some people appear bright until you hear them speak','It was then the lovable show met the leading inspection.','Its okay if you dont like me. Not everyone has good taste.','It was then the interesting appearance met the outlying lack.','You look good when your eyes are closed, but you look the best when my eyes are closed',"The finished chemistry can't carry the club.",'Mirrors cant talk, lucky for you they cant laugh either','What if the abundant permission ate the maximum?','If i had a dollar for every smart thing you say. i would be poor','The jagged character jails into the cloudy exercise.','Are you always so stupid or is today a special ocassion?','It was then the cooked boot met the wet shoe.','I feel so miserable without you, its almost like having you here','Did the scattered police really deserve the hurry?','If you find me offensive. Then i suggest you quit finding me'])
y_test=[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
X_test=X_test.apply(lambda x: cleanData(x))

In [269]:
[algorithm(algo) for algo in list_of_algos]

Accuracy Score is 0.73


[[4 3]
 [1 7]]


              precision    recall  f1-score   support

           0       0.80      0.57      0.67         7
           1       0.70      0.88      0.78         8

    accuracy                           0.73        15
   macro avg       0.75      0.72      0.72        15
weighted avg       0.75      0.73      0.73        15

Accuracy Score is 0.6


[[4 3]
 [3 5]]


              precision    recall  f1-score   support

           0       0.57      0.57      0.57         7
           1       0.62      0.62      0.62         8

    accuracy                           0.60        15
   macro avg       0.60      0.60      0.60        15
weighted avg       0.60      0.60      0.60        15





Accuracy Score is 0.53


[[1 6]
 [1 7]]


              precision    recall  f1-score   support

           0       0.50      0.14      0.22         7
           1       0.54      0.88      0.67         8

    accuracy                           0.53        15
   macro avg       0.52      0.51      0.44        15
weighted avg       0.52      0.53      0.46        15

Accuracy Score is 0.6


[[1 6]
 [0 8]]


              precision    recall  f1-score   support

           0       1.00      0.14      0.25         7
           1       0.57      1.00      0.73         8

    accuracy                           0.60        15
   macro avg       0.79      0.57      0.49        15
weighted avg       0.77      0.60      0.50        15



[None, None, None, None]