In [None]:
# Fetch the NLTK resources used later in the notebook: the WordNet corpus
# (imported below), the averaged-perceptron POS tagger model and the Punkt
# tokenizer model.
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger') 
from nltk.corpus import wordnet
# Last expression of the cell: its return value is what the cell displays.
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import pandas as pd, numpy as np, re, time
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the headlines dataset; lines = True makes pandas read the file as
# JSON Lines (one JSON record per line).
data = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines = True)

In [None]:
# Column-wise missing-value check: True would mean the column contains at least one null.
print(data.isnull().any(axis = 0))

is_sarcastic    False
headline        False
article_link    False
dtype: bool


In [None]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [None]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# special symbols) in the headline column with a single space.
# The vectorized .str accessor applies the regular expression to the whole
# column at once; regex = True is passed explicitly because the pattern is a
# regular expression, not a literal string.
data['headline'] = data['headline'].str.replace('[^a-zA-Z]', ' ', regex = True)

In [None]:
# Preview again to check the headline column after the non-letter characters were replaced.
data.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word strea...,https://www.theonion.com/mother-comes-pretty-c...


In [None]:
# Split the frame into model input and target:
# features -> the cleaned headline text, labels -> the is_sarcastic column.
features = data['headline']
labels = data['is_sarcastic']

In [None]:
# Show the headline Series (pandas truncates the printed view of a long Series).
print(features)

0        thirtysomething scientists unveil doomsday clo...
1        dem rep  totally nails why congress is falling...
2        eat your veggies    deliciously different recipes
3        inclement weather prevents liar from getting t...
4        mother comes pretty close to using word  strea...
                               ...                        
28614         jews to celebrate rosh hashasha or something
28615    internal affairs investigator disappointed con...
28616    the most beautiful acceptance speech this week...
28617    mars probe destroyed by orbiting spielberg gat...
28618                   dad clarifies this not a food stop
Name: headline, Length: 28619, dtype: object


In [None]:
def lemm(l):
  """Tokenize and POS-tag every sentence in ``l``.

  Parameters
  ----------
  l : iterable of str
      The sentences to process. Any iterable works (list, tuple, generator,
      pandas Series); elements are visited in iteration order.

  Returns
  -------
  list
      One entry per sentence, each being the result of ``nltk.pos_tag`` on
      that sentence's ``nltk.word_tokenize`` tokens.
  """
  # Iterate over the values themselves instead of l[i] with range(len(l)):
  # positional indexing breaks on generators and on a pandas Series whose
  # index is not 0..n-1 (there l[i] is a label lookup).
  return [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in l]
#print(l[0])    
#print(lemm(l))

def pos_tagger(nltk_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The tag is matched on its leading letter: 'J' -> wordnet.ADJ,
    'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.
    Any other tag yields None.
    """
    # (tag prefix, name of the wordnet attribute), checked in this order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Attribute is looked up only for the matching prefix.
            return getattr(wordnet, attr_name)
    return None

# For every headline: tokenize, POS-tag, and convert each tag to its WordNet
# POS constant via pos_tagger (None when there is no WordNet equivalent).
# Result: one list per headline of (word, wordnet_pos_or_None) tuples.
ans = [
  [(word, pos_tagger(tag)) for word, tag in nltk.pos_tag(nltk.word_tokenize(headline))]
  for headline in features
]

#print(ans)

In [None]:
# Lemmatizer instance shared by sent() below.
lm = WordNetLemmatizer()
# Accumulates the lemmatized headline strings; filled by the loop at the end of this cell.
featureslm = []
#list(filter(lambda x: featureslm.append(nltk.pos_tag(nltk.word_tokenize(y, i) for y,i in x)), ans))

def sent(l):
  """Lemmatize a tagged sentence and return it as one space-separated string.

  Parameters
  ----------
  l : iterable
      Items whose element 0 is the word and element 1 is its POS value.
      A falsy POS value (e.g. None) means the word is lemmatized with the
      lemmatizer's default POS.

  Returns
  -------
  str
      The lemmas joined by single spaces; the empty string for empty input.
  """
  # ' '.join builds the string in one pass and needs no trailing-space fix-up.
  return ' '.join(
    lm.lemmatize(pair[0], pair[1]) if pair[1] else lm.lemmatize(pair[0])
    for pair in l
  )

# Append one lemmatized headline string per tagged headline, preserving order.
featureslm.extend(sent(tagged_headline) for tagged_headline in ans)

#featureslm = features.apply(lambda x : ' '.join([lm.lemmatize(word, tag) for word in x]))

In [None]:
# Turn the lemmatized headlines into TF-IDF vectors, capped at 5000 features.
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features = 5000)
# NOTE: `features` is rebound here, from the headline Series to the array
# produced by .toarray() on the fitted TF-IDF matrix.
features = tv.fit_transform(list(featureslm)).toarray()

In [None]:
# Hold out 20% of the samples for testing; random_state is fixed so the split is repeatable.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size = 0.20,
    random_state = 0,
)

In [None]:
# Candidate classifiers, all trained and scored on the same train/test split.
# model 1: linear support vector classifier
lsvc = LinearSVC()
# model 2: Gaussian Naive Bayes
gnb = GaussianNB()
# model 3: logistic regression
lr = LogisticRegression()
# model 4: random forest classifier (10 trees, fixed random_state)
rfc = RandomForestClassifier(n_estimators = 10, random_state = 0)

# Fit each model, then print its score on the training data followed by its
# score on the test data. Output is two lines per model, in the order
# lsvc, gnb, lr, rfc. Scores are not hard-coded in comments so they cannot
# drift from what the cell actually prints.
for model in (lsvc, gnb, lr, rfc):
    model.fit(features_train, labels_train)
    print(model.score(features_train, labels_train))
    print(model.score(features_test, labels_test))


0.913212491810439
0.8232005590496156
0.8044551212055033
0.7299091544374563
0.8798427604280411
0.8242487770789657
0.9886001310329766
0.7854647099930119
