In [1]:
import numpy as np
import pandas as pd

In [2]:
import re

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support

In [3]:
# lines=True: the file is read as one JSON object per line.
dataset_path = 'data/Graduate - HEADLINES dataset (2019-06).json'
data = pd.read_json(dataset_path, lines=True)

In [4]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


# Preprocessing

Replace every non-letter character with a space and split each headline into a list of tokens:

In [5]:
# Anything that is not an ASCII letter (digits, punctuation, apostrophes, ...)
# is replaced by a space, so e.g. "son's" ends up as the two tokens "son", "s".
non_letter = re.compile('[^a-zA-Z]')


def tokenize(headline):
    """Return the whitespace-separated letter-only tokens of one headline."""
    return non_letter.sub(' ', headline).split()


data['headline'] = data['headline'].apply(tokenize)

In [6]:
# Preview the first five rows after tokenisation.
data.head(n=5)

Unnamed: 0,headline,is_sarcastic
0,"[former, versace, store, clerk, sues, over, se...",0
1,"[the, roseanne, revival, catches, up, to, our,...",0
2,"[mom, starting, to, fear, son, s, web, series,...",1
3,"[boehner, just, wants, wife, to, listen, not, ...",1
4,"[j, k, rowling, wishes, snape, happy, birthday...",0


Remove English stop words, lemmatize the remaining tokens, and join them back into a single string:

In [7]:
lemmatizer = WordNetLemmatizer()
stops = stopwords.words('english')
# Membership tests against a list are linear in its length and run once per
# token; a frozenset makes each lookup O(1) without changing which words match.
stop_set = frozenset(stops)

# Expects each entry of data['headline'] to be a list of tokens. Every token
# that is not a stop word is lemmatized, and the survivors are joined back
# into one space-separated string (the input format TfidfVectorizer takes).
# The column is overwritten, so run this cell once after tokenisation:
# re-running it on the joined strings would iterate over single characters.
data['headline'] = data['headline'].apply(
    lambda tokens: ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_set
    )
)

In [8]:
# Preview the first five rows after stop-word removal and lemmatization.
data.head(n=5)

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sue secret black co...,0
1,roseanne revival catch thorny political mood b...,0
2,mom starting fear son web series closest thing...,1
3,boehner want wife listen come alternative debt...,1
4,j k rowling wish snape happy birthday magical way,0


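Vectorize the headlines with TF-IDF and keep the 5,000 features with the highest chi-squared score against the label: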

In [9]:
# Number of TF-IDF features kept by the chi-squared filter.
n_selected_features = 5000

# Keep the TF-IDF matrix sparse. Converting it to a dense array allocates
# (n_documents x vocabulary_size) floats, nearly all of them zero, while
# chi2 / SelectKBest and the estimators used below accept sparse input.
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(data['headline'])

# NOTE(review): the vectorizer and the chi2 selector are both fitted on every
# row, including rows that are later held out as the test set, so the reported
# test scores may be optimistic. Consider fitting them on the training split
# only (e.g. inside a sklearn Pipeline).
chi = SelectKBest(chi2, k=n_selected_features)
features = chi.fit_transform(features, data['is_sarcastic'])

Check the class balance, then split into train and test sets:

In [10]:
# Number of headlines per label value.
class_counts = data['is_sarcastic'].value_counts()
class_counts

0    14985
1    11724
Name: is_sarcastic, dtype: int64

In [11]:
# A fixed random_state makes the split, and every metric computed from it,
# identical across kernel restarts. stratify keeps the class ratio the same
# in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    data['is_sarcastic'],
    test_size=0.15,
    random_state=42,
    stratify=data['is_sarcastic'],
)

# Model

## Baseline

In [12]:
# Baseline: predictions are drawn at random following the training label
# distribution. The draw is seeded so the baseline score is reproducible.
dummy_clf = DummyClassifier(strategy="stratified", random_state=42)
dummy_clf.fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)
# Returns (precision, recall, F1, support) for the positive class (label 1);
# support is None when an average is requested.
precision_recall_fscore_support(y_test, y_pred, average='binary')

(0.42778085224128387, 0.4445083381253594, 0.43598420755781164, None)

## SVM

Train and evaluate:

In [13]:
# LinearSVC's solver shuffles the data with a pseudo-random generator; seeding
# it keeps the learned coefficients stable from run to run.
svc = LinearSVC(random_state=42)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

In [14]:
# (precision, recall, F1, support) of the predictions currently held in
# y_pred, computed for the positive class (label 1).
svc_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')
svc_scores

(0.8419395465994962, 0.7688326624496837, 0.8037270814547639, None)

Sanity check: list the features with the largest positive and negative SVM coefficients for each class:

In [15]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()

# Keep only the vocabulary entries that survived the chi2 selection, so that
# position i in feature_names lines up with column i of the SVM input.
feature_names = np.asarray(vocabulary)[chi.get_support(indices=True)]

top_features = 10
coef = svc.coef_.ravel()
# Sort once: the tail holds the largest (most positive) coefficients and the
# head holds the smallest (most negative) ones.
coef_order = np.argsort(coef)
top_positive_coefficients = coef_order[-top_features:]
top_negative_coefficients = coef_order[:top_features]

print('Ironic: ', feature_names[top_positive_coefficients])
print('Non-ironic: ',feature_names[top_negative_coefficients])

Ironic:  ['quickly' 'fuck' 'report' 'man' 'shit' 'fucking' 'clearly' 'introduces'
 'nation' 'area']
Non-ironic:  ['donald' 'allegedly' 'lgbtq' 'queer' 'trump' 'trans' 'hawaii'
 'transgender' 'accused' 'nsfw']
