# Sarcasm Classifier

In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
df_sarcasm = pd.read_csv('reddit_sarcasm.csv')
df_non_sarcasm = pd.read_csv('reddit_serious.csv')

In [3]:
df_sarcasm.head()

Unnamed: 0,S.No,Text
0,0,"""Having sex with my girlfriend at least 5 time..."
1,1,"""Awesome case, plus those blue LEDs make your ..."
2,2,"""I don't know man. [This](http://www.reddit.co..."
3,3,"""because he is famous Edit: oh yeah /s"""
4,4,"""&gt; My deputies did their job to the fullest..."


In [4]:
df_non_sarcasm.head()

Unnamed: 0,S.No,Text
0,0,"""くそ 読みたいが買ったら負けな気がする 図書館に出ねーかな"""
1,1,"""gg this one's over. off to watch the NFL draf..."
2,2,"""Are you really implying we return to those ti..."
3,3,"""No one has a European accent either because ..."
4,4,"""That the kid ""..reminds me of Kevin."" so sa..."


In [5]:
del df_sarcasm['S.No']
del df_non_sarcasm['S.No']

In [6]:
X_sarcasm=df_sarcasm['Text']
X_non_sarcasm=df_non_sarcasm['Text']

# Text Preprocessing

In [7]:
print(X_sarcasm[0])
print("\n")
print(X_non_sarcasm[0])

"Having sex with my girlfriend at least 5 times a day is my main escape. I am FA because I would like a second girlfriend for regular threesomes but she doesn't want to. /s"


"くそ 読みたいが買ったら負けな気がする 図書館に出ねーかな"


In [8]:
# Removing quotes at the beginning and end of text

In [9]:
X_sarcasm=X_sarcasm.apply(lambda s:s.strip('\'"'))
X_non_sarcasm=X_non_sarcasm.apply(lambda s:s.strip('\'"'))

In [10]:
# Removing ' /s' at the end of sentences

In [11]:
X_sarcasm=X_sarcasm.apply(lambda s:s.replace(' /s',''))

In [12]:
# Removing URL's from the text

In [17]:
import re as re
X_sarcasm=X_sarcasm.apply(lambda s:re.sub(r'http\S+', '', s))
X_non_sarcasm=X_non_sarcasm.apply(lambda s:re.sub(r'http\S+', '', s))

In [18]:
# Removing words starting with '@' and '/u/' to remove tagged people

In [19]:
X_sarcasm=X_sarcasm.apply(lambda s:" ".join(filter(lambda s:s[0]!='@', s.split())))
X_sarcasm=X_sarcasm.apply(lambda s:" ".join(filter(lambda s:(s[0]!='/'), s.split())))
X_sarcasm=X_sarcasm.apply(lambda s:" ".join(s.split()))

X_non_sarcasm=X_non_sarcasm.apply(lambda s:" ".join(filter(lambda s:s[0]!='@', s.split())))
X_non_sarcasm=X_non_sarcasm.apply(lambda s:" ".join(filter(lambda s:(s[0]!='/'), s.split())))
X_non_sarcasm=X_non_sarcasm.apply(lambda s:" ".join(s.split()))

In [20]:
# Removing texts having less than 2 words

In [21]:
def length_greater_than_two(text):
    if(len(text.split()) > 2):
        return True
    else:
        return False

In [22]:
X_sarcasm=[x for x in X_sarcasm if(length_greater_than_two(x))]
X_non_sarcasm=[x for x in X_non_sarcasm if(length_greater_than_two(x))]

In [23]:
# Removing bad symbols and special characters

In [24]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-zA-Z !"]')
REMOVE_NON_ASCII=re.compile(r'[^\x00-\x7F]+')

def text_prepare(text):
    text=re.sub(REPLACE_BY_SPACE_RE,' ',text)
    text=re.sub(BAD_SYMBOLS_RE,'',text)
    text=re.sub(REMOVE_NON_ASCII,'',text)
    text=' '.join(text.split())
    return text

In [25]:
X_sarcasm = [text_prepare(x) for x in X_sarcasm]
X_non_sarcasm= [text_prepare(x) for x in X_non_sarcasm]

In [26]:
# Removing non-english rows

In [27]:
import nltk
nltk.download('words')
nltk.download('wordnet')
from nltk.corpus import words
english_words=[w for w in words.words('en')]

[nltk_data] Downloading package words to C:\Users\varun
[nltk_data]     tyagi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.
[nltk_data] Downloading package wordnet to C:\Users\varun
[nltk_data]     tyagi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [28]:
def is_a_english_text(text):
    countFalse=0
    text_list=text.split()
    for i in range(len(text_list)):
        if (text_list[i].lower() not in english_words):
            countFalse=countFalse+1       
    if(countFalse==len(text_list)):
        return False
    else:
        return True

In [29]:
X_sarcasm = [x for x in X_sarcasm if(is_a_english_text(x)==True)]
X_non_sarcasm = [x for x in X_non_sarcasm if(is_a_english_text(x)==True)]

In [30]:
#Lowecasing the texts which are not strictly in CAPITALS

In [31]:
def lowercase_text(text):
    text_list=text.split()
    for i in range(len(text_list)):
        if(not(text_list[i].isupper())):
            text_list[i]=text_list[i].lower()
    return ' '.join(text_list)

In [32]:
X_sarcasm = [lowercase_text(x) for x in X_sarcasm ]
X_non_sarcasm = [lowercase_text(x) for x in X_non_sarcasm ]

In [33]:
#Lemmatizing the text

In [34]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

def lemmatize(text):
    text_list=text.split()
    for i in range(len(text_list)):
        text_list[i]=lemmatizer.lemmatize(text_list[i])
    return ' '.join(text_list)

In [35]:
X_sarcasm = [lemmatize(x) for x in X_sarcasm]
X_non_sarcasm = [lemmatize(x) for x in X_non_sarcasm]

In [36]:
#Removing stopwords from text

In [37]:
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\varun
[nltk_data]     tyagi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [38]:
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in STOPWORDS])

In [39]:
X_sarcasm = [remove_stopwords(x) for x in X_sarcasm]
X_non_sarcasm = [remove_stopwords(x) for x in X_non_sarcasm]

In [40]:
#Creating a final clean, processed and shuffled dataset

In [41]:
print ("No. of non-sarcastic datapoints: %d" %(len(X_non_sarcasm)))
print ("No. of sarcastic datapoints: %d" %(len(X_sarcasm)))

No. of non-sarcastic datapoints: 8477
No. of sarcastic datapoints: 9591


In [42]:
sarcasm_dataset=pd.DataFrame({'text':X_sarcasm})
sarcasm_dataset['label']=1

In [43]:
non_sarcams_dataset=pd.DataFrame({'text':X_non_sarcasm})
non_sarcams_dataset['label']=0

In [44]:
final_dataset=sarcasm_dataset.append(non_sarcams_dataset,ignore_index=True)

In [93]:
final_dataset['text'].replace('',np.nan,inplace=True)
final_dataset.dropna(subset=['text'],inplace=True)

In [98]:
final_dataset=final_dataset.sample(frac=1).reset_index(drop=True)

In [100]:
final_dataset.to_csv('final_dataset.csv')

In [101]:
X=final_dataset['text']
Y=final_dataset['label']

In [102]:
#Splitting data into training and test data

In [103]:
from sklearn.model_selection import train_test_split

In [104]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [114]:
X_train=list(X_train)
X_test=list(X_test)
Y_train=list(Y_train)
Y_test=list(Y_test)

# Feature Extraction

In [937]:
from sklearn.base import BaseEstimator, TransformerMixin
import scipy.sparse as ss

In [459]:
#Creating Tf-Idf Matrix

In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [116]:
def tokenizer_function(text):
    return text.split()

In [794]:
def tfidf_features(X_train):
    
    tfidf_vectorizer = TfidfVectorizer(max_df=0.9,analyzer='word',tokenizer=tokenizer_function,ngram_range=(1,2))
    X_train=tfidf_vectorizer.fit_transform(X_train)
    
    return X_train

In [795]:
tfidf_train= tfidf_features(X_train)

In [796]:
tfidf_train.shape

(14448, 175153)

In [458]:
#Creating Sentiment Contrast Matrix

In [345]:
from textblob import TextBlob
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to C:\Users\varun
[nltk_data]     tyagi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\varun tyagi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [1102]:
class sentiment_contrast_score(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def score(self,text):
        length=len(text.split())
        words=text.split()
        if(length<=1):
            return 0
        else:
            score_first_half=0
            score_second_half=0
            text1=[]
            text2=[]
            for i in range (length):
                if(i<length/2):
                    text1.append(words[i])
                else:
                    text2.append(words[i])
            text1=' '.join(text1)
            text2=' '.join(text2)
            score_first_half=TextBlob(text1).sentiment.polarity
            score_second_half=TextBlob(text2).sentiment.polarity
            return abs(score_first_half-score_second_half)
        
    def transform(self, X): 
        return ss.csr_matrix([self.score(x) for x in X]).T
    
    def fit(self, df, y=None):
        return self
    
    def get_feature_names(self):
        return ['sentiment_contrast_score']

In [921]:
#Creating Interjection Word Start Matrix

In [905]:
nltk.download('brown')
nltk.download('nps_chat')

[nltk_data] Downloading package brown to C:\Users\varun
[nltk_data]     tyagi\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package nps_chat to C:\Users\varun
[nltk_data]     tyagi\AppData\Roaming\nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!


True

In [908]:
chat_tag_dict=dict(nltk.corpus.nps_chat.tagged_words())

In [1101]:
class starts_with_interjection(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def is_interjection(self,text):
        words=nltk.word_tokenize(text)
        if(chat_tag_dict.get(words[0])=='UH'):
            return 1
        else:
            return 0
    
    def transform(self, X): 
        return ss.csr_matrix([self.is_interjection(x) for x in X]).T
    
    def fit(self, df, y=None):
        return self
    
    def get_feature_names(self):
        return ['is_interjection']

# Classification Using Features

In [1246]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Using only tf-idf vectorizer

In [1005]:
# Logistic Regression Model

In [1181]:
pipeLine_tfidf_logistic_regression = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.9,analyzer='word',tokenizer=tokenizer_function,ngram_range=(1,2),lowercase=False)),
    ('logit', LogisticRegression(penalty='l2',C=0.1)),
])

In [1182]:
pipeLine_tfidf_logistic_regression.fit(X_train,Y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.9, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [1183]:
pipeLine_tfidf_logistic_regression.score(X_train,Y_train)

0.6940060908084164

In [1184]:
accuracy_tfidf_logistic_regression=pipeLine_tfidf_logistic_regression.score(X_test,Y_test)
f1_score_tfidf_logistic_regression=f1_score(y_true=Y_test,y_pred=pipeLine_tfidf_logistic_regression.predict(X_test))

print('Accuracy with tf-idf_logistic_regression model is: %f'%accuracy_tfidf_logistic_regression)
print('F-1 score with tf-idf_logistic_regression model is: %f'%f1_score_tfidf_logistic_regression)

Accuracy with tf-idf_logistic_regression model is: 0.655316
F-1 score with tf-idf_logistic_regression model is: 0.719658


In [1022]:
# SVM Model

In [1185]:
pipeline__tfidf_svm = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.9,analyzer='word',tokenizer=tokenizer_function,ngram_range=(1,2),lowercase=False)),
    ('svm', svm.SVC(kernel='linear',C=0.1)),
])

In [1186]:
pipeline__tfidf_svm.fit(X_train,Y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.9, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [1187]:
pipeline__tfidf_svm.score(X_train,Y_train)

0.6252768549280178

In [1188]:
accuracy_tfidf_svm=pipeline__tfidf_svm.score(X_test,Y_test)
f1_score_tfidf_svm=f1_score(y_true=Y_test,y_pred=pipeline__tfidf_svm.predict(X_test))
print('Accuracy with tf-idf_svm model is: %f'%accuracy_tfidf_svm)
print('F-1 score with tf-idf_svm model is: %f'%f1_score_tfidf_svm)

Accuracy with tf-idf_svm model is: 0.626523
F-1 score with tf-idf_svm model is: 0.720530


# Using all the features together

In [1223]:
pipeLine_all_features_logistic_regression = Pipeline([
    ('featureUnion', FeatureUnion([
        ('tfidf', TfidfVectorizer(max_df=0.9,analyzer='word',tokenizer=tokenizer_function,ngram_range=(1,2),lowercase=False)),
        ('sentimentContrast', sentiment_contrast_score()),('interjectionStart',starts_with_interjection())])),
    ('logit', LogisticRegression(penalty='l2',C=0.1)),
])

In [1224]:
pipeLine_all_features_logistic_regression.fit(X_train,Y_train)

Pipeline(memory=None,
     steps=[('featureUnion', FeatureUnion(n_jobs=1,
       transformer_list=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.9, max_features=None, min_df=1,
        ng...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [1225]:
pipeLine_all_features_logistic_regression.score(X_train,Y_train)

0.7358803986710963

In [1226]:
accuracy_all_features_logistic_regression=pipeLine_all_features_logistic_regression.score(X_test,Y_test)
f1_score_all_features_logistic_regression=f1_score(y_true=Y_test,y_pred=pipeLine_all_features_logistic_regression.predict(X_test))

print('Accuracy with all_features_logistic_regression model is: %f'%accuracy_all_features_logistic_regression)
print('F-1 score with all_features_logistic_regression model is: %f'%f1_score_all_features_logistic_regression)

Accuracy with all_features_logistic_regression model is: 0.658361
F-1 score with all_features_logistic_regression model is: 0.704926


In [1227]:
# SVM Model

In [1228]:
pipeLine_all_features_svm = Pipeline([
    ('featureUnion', FeatureUnion([
        ('tfidf', TfidfVectorizer(max_df=0.9,analyzer='word',tokenizer=tokenizer_function,ngram_range=(1,3),lowercase=False)),
        ('sentimentContrast', sentiment_contrast_score()),('interjectionStart',starts_with_interjection())])),
    ('svm', svm.SVC(kernel='linear',C=0.1)),
])

In [1229]:
pipeLine_all_features_svm.fit(X_train,Y_train)

Pipeline(memory=None,
     steps=[('featureUnion', FeatureUnion(n_jobs=1,
       transformer_list=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.9, max_features=None, min_df=1,
        ng...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [1230]:
pipeLine_all_features_svm.score(X_train,Y_train)

0.6400885935769657

In [1211]:
accuracy_all_features_svm=pipeLine_all_features_svm.score(X_test,Y_test)
f1_score_all_features_svm=f1_score(y_true=Y_test,y_pred=pipeLine_all_features_svm.predict(X_test))
print('Accuracy with all_features_svm model is: %f'%accuracy_all_features_svm)
print('F1-score with all_features_svm model is: %f'%f1_score_all_features_svm)

Accuracy with all_features_svm model is: 0.639258
F1-score with all_features_svm model is: 0.721283


# Top Features

In [1231]:
feature_dictionary=dict(zip(pipeLine_all_features_logistic_regression.named_steps['featureUnion'].get_feature_names()
                                       ,pipeLine_all_features_logistic_regression.named_steps['logit'].coef_[0]))

In [1232]:

feature_dictionary= dict(sorted(feature_dictionary.items(), key=lambda x: x[1], reverse=True))

In [1234]:
feature_dictionary

{'tfidf__forgot': 1.192115074727436,
 'tfidf__yeah': 1.0560854844921614,
 'tfidf__obviously': 0.8737190032817792,
 'tfidf__woman': 0.7300251294384557,
 'tfidf__totally': 0.7076724278136528,
 'tfidf__clearly': 0.698343080953594,
 'tfidf__gt': 0.6951040011830748,
 'tfidf__must': 0.6309972889593631,
 'tfidf__edit': 0.6102451133998784,
 'tfidf__dropped': 0.5978037441054367,
 'tfidf__right': 0.5940782483366422,
 'interjectionStart__is_interjection': 0.5935464969384036,
 'tfidf__everyone': 0.5179620685693269,
 'tfidf__know': 0.4799510868041542,
 'tfidf__im sure': 0.43646524984430496,
 'tfidf__guy': 0.4358327863115601,
 'tfidf__mean': 0.4286678736919486,
 'tfidf__dare': 0.38590744348020095,
 'tfidf__white': 0.3849846262899671,
 'tfidf__man': 0.37506034383893033,
 'tfidf__men': 0.3686802830799839,
 'tfidf__real': 0.36830586127509424,
 'tfidf__people': 0.3498747001304554,
 'tfidf__didnt': 0.33622004906184166,
 'tfidf__racist': 0.32738078291927797,
 'tfidf__I guess': 0.32293515425279773,
 'tfidf

In [1247]:
roc_auc_score(y_true=Y_test,y_score=pipeLine_all_features_logistic_regression.predict(X_test))

0.6494081884548295