In [29]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_colwidth', -1)
from time import time
import re
import string
import os
import csv
from pprint import pprint
import collections

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


import warnings
warnings.filterwarnings('ignore')
np.random.seed(37)

In [19]:
df = pd.read_csv('dataset.tsv', sep='\t', quoting=csv.QUOTE_NONE, dtype=str, encoding = 'utf-8',
                 header=None, names=["instance", "text", "id", "sentiment", "is_sarcastic"])

In [20]:
def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", " ", sample)


def remove_punctuation(sample):
    """Remove punctuations from a sample string"""
    return re.sub(r'[^\w\s\&\#\@\$\%\_]', '', sample)


def myTokenizer(sample):
    """Customized tokenizer"""
    new_words = []
    words = sample.split(' ')
    new_words = [word for word in words if len(word) >= 2 and not word.isdigit() and not word.startswith('#aus') and not word.startswith('au')]
    return new_words


def remove_stopwords_NLTK(sample):
    """Remove stopwords using NLTK"""
    stopWords = set(stopwords.words('english'))
    words = [w for w in sample.split(' ') if len(w) >= 2]
    filteredText = ""
    for word in words:
        if word not in stopWords:
            filteredText = filteredText + word + " "
    return filteredText.rstrip()


def porter_stem(sample):
    """Stemming"""
    words = [w for w in sample.split(' ') if len(w) >= 2]
    ps = PorterStemmer()
    stemmed_text = ""
    for word in words:
        stemmed_text = stemmed_text + ps.stem(word) + " "
    return stemmed_text.rstrip()


def lemmy(sample):
    lemmatizer = WordNetLemmatizer()
    words = [w for w in sample.split(' ') if len(w) >= 2]
    lemmed_text = ""
    for word in words:
        lemmed_text = lemmed_text + lemmatizer.lemmatize(word, pos='v') + " "
    return lemmed_text.rstrip()


def snowball(sample):
    words = [w for w in sample.split(' ') if len(w) >= 2]
    stemmer = SnowballStemmer("english")
    stemmed_text = ""
    for word in words:
        stemmed_text = stemmed_text + stemmer.stem(word) + " "
    return stemmed_text.rstrip()


def myPreprocessor(sample):
    """Customized preprocessor"""
    sample = remove_URL(sample)
    sample = sample.lower()
    sample = remove_stopwords_NLTK(sample)
    sample = remove_punctuation(sample)
    #sample = lemmy(sample)
    sample = porter_stem(sample)
    return sample

In [21]:
def remove_mentions(input_text):
    return re.sub(r'@\w+', '', input_text)

def remove_urls(input_text):
    return re.sub(r'http.?://[^\s]+[\s]?', '', input_text)

def emoji_oneword(input_text):
    # By compressing the underscore, the emoji is kept as one word
    return input_text.replace('_','')

def remove_punctuation(input_text):
    # Make translation table
    punct = string.punctuation
    trantab = str.maketrans(punct, len(punct)*' ')  # Every punctuation symbol will be replaced by a space
    return input_text.translate(trantab)

def remove_digits(input_text):
    return re.sub('\d+', '', input_text)

def to_lower(input_text):
    return input_text.lower()

def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return " ".join(clean_words) 

def stemming(input_text):
    porter = PorterStemmer()
    words = input_text.split() 
    stemmed_words = [porter.stem(word) for word in words]
    return " ".join(stemmed_words)

def newProcess(sample):
    sample = remove_mentions(sample)
    sample = remove_urls(sample)
    sample = emoji_oneword(sample)
    sample = remove_punctuation(sample)
    sample = remove_digits(sample)
    sample = to_lower(sample)
    sample = remove_stopwords(sample)
    sample = stemming(sample)
    return sample

In [12]:
""" Data creation """
text_data = np.array([])
# Read tweets
for text in df.text:
    text_data = np.append(text_data, text)
# creating target classes
Y = np.array([])
for text in df.id:
    Y = np.append(Y, text)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(text_data, Y, test_size=0.25, shuffle=False)

In [59]:
# Based on http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
def grid_vect(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None):
    
    pipeline = Pipeline([
        ('vect', vect)
        , ('clf', clf)
    ])
    
    # Join the parameters dictionaries together
    parameters = dict()
    if parameters_text:
        parameters.update(parameters_text)
    parameters.update(parameters_clf)

    # Make sure you have scikit-learn version 0.19 or higher to use multiple scoring metrics
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=2, scoring='f1_micro')
    
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best CV score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    print("Test score with best_estimator_: %0.3f" % grid_search.best_estimator_.score(X_test, y_test))
    print("\n")
    print("Classification Report Test Data")
    print(classification_report(y_test, grid_search.best_estimator_.predict(X_test)))
                        
    return grid_search

In [52]:
# Parameter grid settings for the vectorizers (Count and TFIDF)
parameters_vect = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__min_df': (1,2,3,4,5),
    'vect__max_features': (600,700,800,1000,1200,1400),
    'vect__max_df': (0.2, 0.4, 0.6, 0.8 ,1.0),
    'vect__preprocessor': (None, myPreprocessor),
    'vect__tokenizer': (None, myTokenizer)
}


# Parameter grid settings for MultinomialNB
parameters_mnb = {
    'clf__alpha': (0.25, 0.5, 0.6 ,0.75, 1.0)
}

parameters_rf = {
    #'clf__bootstrap': [True, False],
    'clf__max_depth': [60, 70, 80, 90, 100, 200,None],
    'clf__max_features': ['auto', 'sqrt'],
    'clf__min_samples_leaf': [1, 2, 4, 8],
    'clf__min_samples_split': [2, 5, 10],
    'clf__n_estimators': [200,400,600, 800, 1000]
}

In [53]:
# preprocessor=myPreprocessor, tokenizer= myTokenizer
#best_mnb_countvect = grid_vect(MultinomialNB(), parameters_mnb, X_train, X_test, parameters_text=parameters_vect, vect=CountVectorizer(preprocessor=newProcess, tokenizer= myTokenizer))

In [54]:
RandomForestClassifier().get_params().keys()

dict_keys(['bootstrap', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [61]:
best_random = grid_vect(RandomForestClassifier(), parameters_rf, X_train, X_test, parameters_text=None, vect=CountVectorizer(preprocessor=newProcess, tokenizer= myTokenizer, max_features=1000))

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'clf__max_depth': [60, 70, 80, 90, 100, 200, None],
 'clf__max_features': ['auto', 'sqrt'],
 'clf__min_samples_leaf': [1, 2, 4, 8],
 'clf__min_samples_split': [2, 5, 10],
 'clf__n_estimators': [200, 400, 600, 800, 1000]}
Fitting 2 folds for each of 840 candidates, totalling 1680 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   59.1s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 1680 out of 1680 | elapsed:  9.2min finished


done in 554.373s

Best CV score: 0.549
Best parameters set:
	clf__max_depth: 80
	clf__max_features: 'auto'
	clf__min_samples_leaf: 2
	clf__min_samples_split: 5
	clf__n_estimators: 200
Test score with best_estimator_: 0.570


Classification Report Test Data
              precision    recall  f1-score   support

       10000       0.72      0.70      0.71        56
       10001       0.48      0.42      0.45        36
       10002       0.53      0.58      0.55        31
       10003       0.36      0.61      0.45        87
       10004       0.00      0.00      0.00         2
       10005       0.79      0.73      0.76        52
       10006       0.54      0.43      0.48        44
       10007       0.00      0.00      0.00         2
       10008       0.73      0.80      0.76        46
       10009       0.67      1.00      0.80         4
       10010       0.70      0.64      0.67        11
       10011       0.50      0.14      0.22         7
       10012       0.50      0.50      0