In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from pandas.io.json import json_normalize

In [50]:
with open('Graduate - HEADLINES dataset (2019-06).json', 'r') as jsonFile:
    df = pd.read_json(jsonFile, lines=True)

In [55]:
df.dtypes

headline        object
is_sarcastic     int64
dtype: object

In [61]:
df.groupby('is_sarcastic').count()

Unnamed: 0_level_0,headline
is_sarcastic,Unnamed: 1_level_1
0,14985
1,11724


In [60]:
print(df.describe())
df.isna().sum()

is_sarcastic
count  26709.000000
mean       0.438953
std        0.496269
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000


headline        0
is_sarcastic    0
dtype: int64

In [62]:
df.head(20)

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
5,advancing the world's women,0
6,the fascinating case for eating lab-grown meat,0
7,"this ceo will send your kids to school, if you...",0
8,top snake handler leaves sinking huckabee camp...,1
9,friday's morning email: inside trump's presser...,0


In [85]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [86]:
pipe_sgdc = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
    alpha=1e-3, random_state=42, max_iter=5, tol=None)),
])
para_sgdc = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2), (1,3)),  # unigrams, bigrams or trigrams
    'vect__stop_words': ('english', None),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (10, 50, 80),
}

In [91]:
sk = StratifiedKFold(n_splits=5, shuffle=True)
X = df['headline']
Y =  df['is_sarcastic']
for train, test in sk.split(X, Y):
    pipe_sgdc.fit(X[train], Y[train])
    predicted = pipe_sgdc.predict(X[test])
    print(metrics.classification_report(Y[test], predicted))

precision    recall  f1-score   support

           0       0.78      0.90      0.84      2997
           1       0.84      0.67      0.75      2345

    accuracy                           0.80      5342
   macro avg       0.81      0.79      0.79      5342
weighted avg       0.81      0.80      0.80      5342

              precision    recall  f1-score   support

           0       0.79      0.90      0.84      2997
           1       0.84      0.69      0.76      2345

    accuracy                           0.80      5342
   macro avg       0.81      0.79      0.80      5342
weighted avg       0.81      0.80      0.80      5342

              precision    recall  f1-score   support

           0       0.79      0.89      0.84      2997
           1       0.83      0.69      0.75      2345

    accuracy                           0.80      5342
   macro avg       0.81      0.79      0.80      5342
weighted avg       0.81      0.80      0.80      5342

              precision    recall

In [79]:
gs_clf = GridSearchCV(pipe_sgdc, para_sgdc, cv=5, n_jobs=-2, verbose=True)
gs_clf = gs_clf.fit(df['headline'], df['is_sarcastic'])

Fitting 5 folds for each of 3456 candidates, totalling 17280 fits
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-2)]: Done 186 tasks      | elapsed:   51.2s
[Parallel(n_jobs=-2)]: Done 436 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-2)]: Done 786 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-2)]: Done 1236 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-2)]: Done 1786 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-2)]: Done 2436 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-2)]: Done 3186 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-2)]: Done 4036 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-2)]: Done 4986 tasks      | elapsed: 22.9min
[Parallel(n_jobs=-2)]: Done 6036 tasks      | elapsed: 28.6min
[Parallel(n_jobs=-2)]: Done 7186 tasks      | elapsed: 34.4min
[Parallel(n_jobs=-2)]: Done 8436 tasks      | elapsed: 42.0min
[Parallel(n_jobs=-2)]: Done 9786 tasks     

In [80]:
print(gs_clf.best_score_)
print(gs_clf.cv_results_)

__ngram_range': (1, 3), 'vect__stop_words': None}, {'clf__alpha': 1e-06, 'clf__max_iter': 80, 'clf__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 1.0, 'vect__max_features': 10000, 'vect__ngram_range': (1, 1), 'vect__stop_words': 'english'}, {'clf__alpha': 1e-06, 'clf__max_iter': 80, 'clf__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 1.0, 'vect__max_features': 10000, 'vect__ngram_range': (1, 1), 'vect__stop_words': None}, {'clf__alpha': 1e-06, 'clf__max_iter': 80, 'clf__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 1.0, 'vect__max_features': 10000, 'vect__ngram_range': (1, 2), 'vect__stop_words': 'english'}, {'clf__alpha': 1e-06, 'clf__max_iter': 80, 'clf__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 1.0, 'vect__max_features': 10000, 'vect__ngram_range': (1, 2), 'vect__stop_words': None}, {'clf__alpha': 1e-06, 'clf__max_iter': 80, 

In [84]:
print(gs_clf.best_estimator_.get_params())

{'memory': None, 'steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.5, max_features=None, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', SGDClassifier(alpha=1e-05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=80,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=42, shuffle=True, tol=None, validation_fraction=0.1,
              verbose=0, warm_start=False))], 'verbose': False, 'vect': C

In [82]:
gs_clf.best_score_

0.8607209642690486