In [288]:
import csv
import re
import sys
import time
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import metrics, tree
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_selection import chi2, SelectKBest

In [289]:
# Load the tab-separated dataset. header=None with explicit names means the
# first line of the file is read as data; QUOTE_NONE keeps quote characters
# literal, and dtype=str reads every column as text.
df = pd.read_csv('dataset.tsv', sep='\t', quoting=csv.QUOTE_NONE, dtype=str, encoding='utf-8',
                 header=None, names=["instance", "text", "id", "sentiment", "is_sarcastic"])
# Shuffle the rows with a fixed seed. The train/test split later in the
# notebook is positional (shuffle=False), so an unseeded shuffle would change
# the split -- and every reported score -- on each run.
RANDOM_SEED = 42
df = shuffle(df, random_state=RANDOM_SEED)

In [290]:
# Read tweets: convert the "text" column to a NumPy string array in one step.
# Growing the array with np.append inside a loop re-allocates and copies the
# whole array on every row, which is quadratic in the number of rows.
text_data = np.array(df.text, dtype=str)

In [291]:
""" Functions for text pre-processing """


def remove_URL(sample):
    """Replace URL-like runs in ``sample`` with a single space.

    A run is the text ``http`` followed by one or more non-whitespace
    characters; each such run is substituted by one space.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub(" ", sample)


def remove_punctuation(sample):
    """Strip punctuation from ``sample``.

    Word characters, whitespace and the symbols ``& # @ $ % _`` are kept;
    every other character is deleted.
    """
    disallowed = re.compile(r'[^\w\s\&\#\@\$\%\_]')
    return disallowed.sub('', sample)

def myTokenizer(sample):
    """Split ``sample`` on single spaces and keep only the useful tokens.

    A token is dropped when it is shorter than two characters, consists
    solely of digits, or starts with ``#aus`` or ``au``.
    """
    kept = []
    for token in sample.split(' '):
        # Too short (this also discards the empty strings produced by
        # consecutive spaces) or purely numeric.
        if len(token) < 2 or token.isdigit():
            continue
        # Tokens beginning with '#aus' or 'au' are excluded.
        if token.startswith(('#aus', 'au')):
            continue
        kept.append(token)
    return kept

_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, loading the corpus only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def remove_stopwords_NLTK(sample):
    """Remove English stop words (NLTK list) from ``sample``.

    The text is tokenized with ``myTokenizer``; tokens that exactly match an
    NLTK English stop word are dropped. The remaining tokens are re-joined
    with single spaces and trailing whitespace is stripped.

    The stop-word set is cached across calls: this function runs once per
    document, and rebuilding the set from the corpus each time is wasted work.
    """
    stop_words = _english_stopwords()
    kept = [word for word in myTokenizer(sample) if word not in stop_words]
    return " ".join(kept).rstrip()


def porter_stem(sample):
    """Apply Porter stemming to every token of ``sample``.

    The text is tokenized with ``myTokenizer``; the stemmed tokens are
    re-joined with single spaces and trailing whitespace is stripped.
    """
    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in myTokenizer(sample)]
    return " ".join(stems).rstrip()


def myPreprocessor(sample):
    """Run the full text-cleaning pipeline on one document.

    Steps, in order: strip URLs, lower-case, drop NLTK stop words, remove
    punctuation, Porter-stem.

    Note: stop words are filtered before punctuation is stripped, so a stop
    word with punctuation attached (e.g. ``"the,"``) is not removed.
    """
    steps = (
        remove_URL,
        lambda text: text.lower(),
        remove_stopwords_NLTK,
        remove_punctuation,
        porter_stem,
    )
    for step in steps:
        sample = step(sample)
    return sample


In [292]:
# try to use sklearn stop_words later
# Bag-of-words over the tweets, using the custom preprocessor/tokenizer and
# keeping at most 1000 features.
count = CountVectorizer(preprocessor=myPreprocessor, tokenizer=myTokenizer, max_features=1000)
bag_of_words = count.fit_transform(text_data)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back only on versions that lack it.
if hasattr(count, "get_feature_names_out"):
    feature_names = count.get_feature_names_out().tolist()
else:
    feature_names = count.get_feature_names()
print(feature_names)
size = len(count.vocabulary_)
print(size)

['#4corner', '#7new', '#abc730', '#abcnews24', '#afpraid', '#agchatoz', '#alp', '#asylumseek', '#betterfutur', '#blackhol', '#bootturnbullout', '#brexit', '#budget2016', '#cfa', '#cfmeu', '#chafta', '#childcar', '#cleanenergi', '#climat', '#climatechang', '#coal', '#corrupt', '#csg', '#csiro', '#csirocut', '#dutton', '#educ', '#election2016', '#environ', '#et', '#faketradi', '#fraudband', '#gonski', '#greatbarrierreef', '#green', '#greens16', '#humanright', '#icac', '#inequ', '#insid', '#labor', '#laborlaunch', '#latelin', '#leadersdeb', '#liber', '#lnp', '#lnpfail', '#malcolm', '#malwar', '#marriageequ', '#medicar', '#nauru', '#nbn', '#nbnco', '#nbngate', '#ndi', '#negativegear', '#npc', '#nswpol', '#panamapap', '#parakeelia', '#peoplesforum', '#pmlive', '#polita', '#putlnplast', '#qanda', '#qldpol', '#refuge', '#renew', '#safeschool', '#savemedicar', '#scomo', '#spendomet', '#springst', '#ssm', '#stopstateterror', '#taxrort', '#tennew', '#thedrum', '#turnbul', '$$$', '$2', '$50b', '$

In [293]:
# Dense document-term count matrix produced by the vectorizer
X = bag_of_words.toarray()
# creating target classes: the labels come from the "id" column. Convert the
# column in one step instead of growing the array with np.append row by row,
# which copies the whole array on every iteration.
Y = np.array(df.id, dtype=str)
# Positional (unshuffled) 75/25 split: the leading rows train, the trailing
# rows test -- first 1500 / last 500 for a 2000-row dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, shuffle=False)

In [294]:
# Baseline: multinomial Naive Bayes fitted on the pre-computed count matrix.
# (Alternative classifier to try: clf = BernoulliNB())
clf = MultinomialNB()
model = clf.fit(X_train, y_train)

# Second model: raw text -> counts (no max_features cap) -> chi-squared
# selection of the 800 best features -> multinomial Naive Bayes.
pipeline_steps = [
    ('vect', CountVectorizer(preprocessor=myPreprocessor, tokenizer=myTokenizer)),
    ('reducer', SelectKBest(chi2, k=800)),
    ('clf', MultinomialNB()),
]
text_clf_red = Pipeline(pipeline_steps)
# NOTE(review): 1500 is a hard-coded training boundary; it coincides with the
# 75/25 train_test_split only if the dataset has 2000 rows -- TODO confirm.
model_new = text_clf_red.fit(text_data[:1500], Y[:1500])

In [295]:
# Baseline model evaluated on the held-out test split
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       10000       0.42      0.63      0.50        54
       10001       0.14      0.14      0.14        29
       10002       0.40      0.50      0.45        34
       10003       0.42      0.53      0.47        93
       10004       1.00      0.33      0.50         3
       10005       0.57      0.56      0.56        54
       10006       0.44      0.43      0.44        53
       10008       0.50      0.43      0.46        42
       10009       0.50      0.20      0.29         5
       10010       0.25      0.11      0.15        19
       10011       0.00      0.00      0.00         5
       10012       0.00      0.00      0.00         7
       10013       0.46      0.57      0.51        21
       10014       0.50      0.14      0.22         7
       10015       0.53      0.69      0.60        26
       10016       0.40      0.14      0.21        14
       10017       0.50      0.09      0.15        11
       10018       0.57    

  'precision', 'predicted', average, warn_for)


In [296]:
# Baseline model evaluated on its own training split (for comparison with the
# test-split report)
y_pred = model.predict(X_train)
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

       10000       0.84      0.88      0.86       190
       10001       0.81      0.72      0.76       111
       10002       0.76      0.80      0.78        96
       10003       0.68      0.76      0.72       265
       10004       0.86      0.43      0.57        14
       10005       0.72      0.86      0.79       140
       10006       0.78      0.74      0.76       136
       10007       1.00      0.14      0.25         7
       10008       0.77      0.89      0.83       121
       10009       0.83      0.45      0.59        11
       10010       0.71      0.65      0.68        37
       10011       0.00      0.00      0.00         8
       10012       0.78      0.39      0.52        18
       10013       0.85      0.90      0.88        83
       10014       1.00      0.45      0.62        22
       10015       0.86      0.97      0.91        93
       10016       0.71      0.53      0.61        45
       10017       0.80    

  'precision', 'predicted', average, warn_for)


In [297]:
# Feature-selection pipeline evaluated on the rows it was not fitted on
# (everything from index 1500 onwards)
y_pred = model_new.predict(text_data[1500:])
print(classification_report(y_true=Y[1500:], y_pred=y_pred))

              precision    recall  f1-score   support

       10000       0.47      0.54      0.50        54
       10001       0.38      0.28      0.32        29
       10002       0.50      0.53      0.51        34
       10003       0.31      0.67      0.43        93
       10004       1.00      0.33      0.50         3
       10005       0.54      0.46      0.50        54
       10006       0.64      0.30      0.41        53
       10008       0.58      0.60      0.59        42
       10009       0.00      0.00      0.00         5
       10010       0.00      0.00      0.00        19
       10011       0.00      0.00      0.00         5
       10012       0.00      0.00      0.00         7
       10013       0.57      0.57      0.57        21
       10014       0.50      0.14      0.22         7
       10015       0.76      0.73      0.75        26
       10016       0.67      0.14      0.24        14
       10017       1.00      0.09      0.17        11
       10018       0.57    

In [298]:
# Feature-selection pipeline evaluated on the rows it was fitted on
# (the first 1500)
y_pred = model_new.predict(text_data[:1500])
print(classification_report(y_true=Y[:1500], y_pred=y_pred))

              precision    recall  f1-score   support

       10000       0.76      0.72      0.74       190
       10001       0.70      0.47      0.56       111
       10002       0.71      0.72      0.72        96
       10003       0.47      0.79      0.59       265
       10004       1.00      0.50      0.67        14
       10005       0.66      0.69      0.67       140
       10006       0.68      0.50      0.58       136
       10007       1.00      0.86      0.92         7
       10008       0.75      0.83      0.79       121
       10009       1.00      0.27      0.43        11
       10010       0.87      0.54      0.67        37
       10011       1.00      0.50      0.67         8
       10012       0.82      0.50      0.62        18
       10013       0.88      0.80      0.84        83
       10014       1.00      0.73      0.84        22
       10015       0.82      0.86      0.84        93
       10016       0.79      0.24      0.37        45
       10017       1.00    