In [3]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_colwidth', -1)
from time import time
import re
import string
import os
import csv
from pprint import pprint
import collections

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report

from sklearn.naive_bayes import MultinomialNB

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


import warnings
warnings.filterwarnings('ignore')
np.random.seed(37)

In [4]:
df = pd.read_csv('dataset.tsv', sep='\t', quoting=csv.QUOTE_NONE, dtype=str, encoding = 'utf-8',
                 header=None, names=["instance", "text", "id", "sentiment", "is_sarcastic"])

In [5]:
""" Functions for text pre-processing """


def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", " ", sample)


def remove_punctuation(sample):
    """Remove punctuations from a sample string"""
    return re.sub(r'[^\w\s\&\#\@\$\%\_]','',sample)

def myTokenizer(sample):
    """Customized tokenizer"""
    new_words = []
    words = sample.split(' ')
    new_words = [word for word in words if len(word) >= 2 and not word.startswith('au') and not word.startswith('#aus')]
    return new_words

def remove_stopwords_NLTK(sample):
    """Remove stopwords using NLTK"""
    stopWords = set(stopwords.words('english'))
    words = [w for w in sample.split(' ') if len(w) >= 2]
    filteredText = ""
    for word in words:
        if word not in stopWords:
            filteredText = filteredText + word + " "
    return filteredText.rstrip()

def remove_digits(input_text):
    return re.sub('\d+', '', input_text)

def porter_stem(sample):
    """Stemming"""
    words = [w for w in sample.split(' ') if len(w) >= 2]
    ps = PorterStemmer()
    stemmed_text = ""
    for word in words:
        stemmed_text = stemmed_text + ps.stem(word) + " "
    return stemmed_text.rstrip()

def myPreprocessor(sample):
    """Customized preprocessor"""
    sample = remove_URL(sample)
    # sample = sample.lower()
    sample = remove_punctuation(sample)
    # sample = remove_digits(sample)
    return sample

In [6]:
""" Data creation """
text_data = np.array([])
# Read tweets
for text in df.text:
    text_data = np.append(text_data, text)
# creating target classes
Y = np.array([])
for text in df.sentiment:
    Y = np.append(Y, text)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(text_data, Y, test_size=0.25, shuffle=False)

In [8]:
# Based on http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
def grid_vect(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None):
    
    pipeline = Pipeline([
        ('vect', vect)
        , ('clf', clf)
    ])
    
    # Join the parameters dictionaries together
    parameters = dict()
    if parameters_text:
        parameters.update(parameters_text)
    parameters.update(parameters_clf)

    # Make sure you have scikit-learn version 0.19 or higher to use multiple scoring metrics
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5, scoring='f1_micro')
    
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best CV score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    print("Test score with best_estimator_: %0.3f" % grid_search.best_estimator_.score(X_test, y_test))
    print("\n")
    print("Classification Report Test Data")
    print(classification_report(y_test, grid_search.best_estimator_.predict(X_test)))
                        
    return grid_search

In [13]:
# Parameter grid settings for the vectorizers (Count and TFIDF)
parameters_vect = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__min_df': (1,2,3,4,5),
    'vect__max_features': (200,400,600,700,800,1000,1200,1400),
    'vect__max_df': (0.2, 0.4, 0.6, 0.8 ,1.0)
#     'vect__preprocessor':(None, myPreprocessor),
#     'vect__tokenizer':(None, myTokenizer)
}


# Parameter grid settings for MultinomialNB
parameters_mnb = {
    'clf__alpha': (0.25, 0.5, 0.6 ,0.75, 1.0)
}

parameters_rf = {
    'clf__bootstrap': [True, False],
    'clf__max_depth': [60, 70, 80, 90, 100, None],
    'clf__max_features': ['auto', 'sqrt'],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__min_samples_split': [2, 5, 10],
    'clf__n_estimators': [600, 800, 1000]
}

In [12]:
# preprocessor=myPreprocessor, tokenizer= myTokenizer
best_mnb_countvect = grid_vect(MultinomialNB(), parameters_mnb, X_train, X_test, parameters_text=parameters_vect, vect=CountVectorizer(preprocessor=myPreprocessor, tokenizer= myTokenizer))

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'clf__alpha': (0.25, 0.5, 0.6, 0.75, 1.0),
 'vect__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
 'vect__max_features': (200, 400, 600, 700, 800, 1000, 1200, 1400),
 'vect__min_df': (1, 2, 3, 4, 5),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 2428 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-1)]: Done 3528 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 4828 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 6328 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 8028 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 9928 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:  2.6min finished


done in 158.473s

Best CV score: 0.739
Best parameters set:
	clf__alpha: 1.0
	vect__max_df: 0.4
	vect__max_features: 1400
	vect__min_df: 1
	vect__ngram_range: (1, 1)
Test score with best_estimator_: 0.748


Classification Report Test Data
              precision    recall  f1-score   support

    negative       0.80      0.89      0.84       335
     neutral       0.59      0.54      0.57       125
    positive       0.73      0.20      0.31        40

   micro avg       0.75      0.75      0.75       500
   macro avg       0.71      0.54      0.57       500
weighted avg       0.74      0.75      0.73       500



In [15]:
from sklearn.ensemble import RandomForestClassifier
best_random = grid_vect(RandomForestClassifier(), parameters_rf, X_train, X_test, parameters_text=None, vect=CountVectorizer(preprocessor=myPreprocessor, tokenizer= myTokenizer, max_features=700, ngram_range=(1, 1), min_df=4, max_df=0.2))

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'clf__bootstrap': [True, False],
 'clf__max_depth': [60, 70, 80, 90, 100, None],
 'clf__max_features': ['auto', 'sqrt'],
 'clf__min_samples_leaf': [1, 2, 4],
 'clf__min_samples_split': [2, 5, 10],
 'clf__n_estimators': [600, 800, 1000]}
Fitting 5 folds for each of 648 candidates, totalling 3240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 3240 out of 3240 | elapsed: 14.0min finished


done in 845.660s

Best CV score: 0.754
Best parameters set:
	clf__bootstrap: True
	clf__max_depth: 90
	clf__max_features: 'sqrt'
	clf__min_samples_leaf: 1
	clf__min_samples_split: 2
	clf__n_estimators: 800
Test score with best_estimator_: 0.746


Classification Report Test Data
              precision    recall  f1-score   support

    negative       0.78      0.90      0.84       335
     neutral       0.59      0.49      0.54       125
    positive       0.91      0.25      0.39        40

   micro avg       0.75      0.75      0.75       500
   macro avg       0.76      0.55      0.59       500
weighted avg       0.74      0.75      0.73       500

