In [266]:
import csv
import re
import string
import sys
import time
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn import metrics, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
warnings.filterwarnings('ignore')

In [267]:
# Load the tab-separated dataset. The file has no header row, so column names are
# supplied here; every column is read as a string and quote characters are not
# given any special meaning (csv.QUOTE_NONE).
df = pd.read_csv(
    'dataset.tsv',
    sep='\t',
    header=None,
    names=["instance", "text", "id", "sentiment", "is_sarcastic"],
    dtype=str,
    encoding='utf-8',
    quoting=csv.QUOTE_NONE,
)
# Row shuffling is currently disabled:
# df = shuffle(df)

In [268]:
""" Functions for text pre-processing """


def remove_URL(sample):
    """Replace every run of non-whitespace characters starting at "http" with one space.

    Args:
        sample: Text to clean.

    Returns:
        The text with each matched run swapped for a single space character.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub(" ", sample)


def remove_punctuation(sample):
    """Blank out punctuation in a string.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is replaced by a single space, so the length of
    the text is preserved.

    Args:
        sample: Text to clean.

    Returns:
        The text with each such character replaced by a space.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub(' ', sample)

def myTokenizer(sample):
    """Customized tokenizer: split on single spaces and filter the pieces.

    A piece is dropped when it is shorter than two characters or when it starts
    with 'au'. Pieces containing digits are NOT removed here (that filter is
    disabled).

    Args:
        sample: Text to tokenize.

    Returns:
        List of the surviving pieces, in their original order.
    """
    tokens = []
    for piece in sample.split(' '):
        if len(piece) < 2 or piece.startswith('au'):
            continue
        tokens.append(piece)
    return tokens

def remove_stopwords_NLTK(sample):
    """Drop NLTK English stopwords from a space-separated string.

    The text is split on single spaces; pieces shorter than two characters are
    discarded, as are pieces that exactly match (case-sensitively) an entry of
    NLTK's English stopword list.

    Args:
        sample: Text to filter.

    Returns:
        The surviving pieces joined by single spaces, without trailing whitespace.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = [
        piece
        for piece in sample.split(' ')
        if len(piece) >= 2 and piece not in english_stopwords
    ]
    return " ".join(kept).rstrip()


def porter_stem(sample):
    """Stem each piece of a space-separated string with NLTK's PorterStemmer.

    Pieces shorter than two characters are dropped before stemming.

    Args:
        sample: Text to stem.

    Returns:
        The stemmed pieces joined by single spaces, without trailing whitespace.
    """
    stemmer = PorterStemmer()
    stems = [stemmer.stem(piece) for piece in sample.split(' ') if len(piece) >= 2]
    return " ".join(stems).rstrip()

def lemmy(sample):
    """Lemmatize each piece of a space-separated string as a verb (pos='v').

    Uses NLTK's WordNetLemmatizer; pieces shorter than two characters are
    dropped first.

    Args:
        sample: Text to lemmatize.

    Returns:
        The lemmatized pieces joined by single spaces, without trailing whitespace.
    """
    # The WordNet corpus can be fetched with: nltk.download('wordnet')
    wordnet = WordNetLemmatizer()
    lemmas = [
        wordnet.lemmatize(piece, pos='v')
        for piece in sample.split(' ')
        if len(piece) >= 2
    ]
    return " ".join(lemmas).rstrip()
    
def snowball(sample):
    """Stem each piece of a space-separated string with NLTK's English SnowballStemmer.

    Pieces shorter than two characters are dropped before stemming.

    Args:
        sample: Text to stem.

    Returns:
        The stemmed pieces joined by single spaces, without trailing whitespace.
    """
    english_stemmer = SnowballStemmer("english")
    stems = [
        english_stemmer.stem(piece)
        for piece in sample.split(' ')
        if len(piece) >= 2
    ]
    return " ".join(stems).rstrip()

# def myPreprocessor(sample):
#     """Customized preprocessor"""
#     sample = remove_URL(sample)
#     sample = sample.lower()
#     sample = remove_punctuation(sample)
#     sample = remove_stopwords_NLTK(sample)
#     sample = porter_stem(sample)
#     return sample


In [269]:
def remove_mentions(input_text):
    """Delete every '@' followed by one or more word characters.

    Args:
        input_text: Text to clean.

    Returns:
        The text with each such '@...' run removed (nothing is put in its place).
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', input_text)

def remove_urls(input_text):
    """Delete http:// and https:// URLs from a string.

    A URL is matched from its scheme up to the next whitespace character; one
    whitespace character directly after the URL, if present, is removed with it.

    Args:
        input_text: Text to clean.

    Returns:
        The text with every matched URL removed.
    """
    # The optional character after "http" must be a literal "s"; a "." wildcard
    # here would also accept any other character (e.g. "httpx://...").
    return re.sub(r'https?://\S+\s?', '', input_text)

def emoji_oneword(input_text):
    """Strip all underscores so an underscore-joined emoji name becomes one word.

    Args:
        input_text: Text to rewrite.

    Returns:
        The text with every '_' character removed.
    """
    pieces = input_text.split('_')
    return ''.join(pieces)

def remove_punctuation1(input_text):
    """Replace each character of string.punctuation with a space.

    Args:
        input_text: Text to clean.

    Returns:
        The text with every ASCII punctuation symbol swapped for one space.
    """
    # Translation table keyed by code point: punctuation symbol -> space.
    to_space = {ord(symbol): ' ' for symbol in string.punctuation}
    return input_text.translate(to_space)

def remove_digits(input_text):
    """Delete every run of digits from a string.

    Args:
        input_text: Text to clean.

    Returns:
        The text with all digit runs removed (nothing is put in their place).
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence,
    # which Python warns about and plans to reject.
    return re.sub(r'\d+', '', input_text)

def to_lower(input_text):
    """Return a lower-cased copy of the text.

    Args:
        input_text: Text to convert.

    Returns:
        The result of calling ``lower()`` on the input.
    """
    lowered = input_text.lower()
    return lowered

def remove_stopwords(input_text):
    """Remove NLTK English stopwords and single-character words.

    The text is split on any whitespace. A word is kept only when it is longer
    than one character and does not exactly match (case-sensitively) an entry
    of NLTK's English stopword list.

    Args:
        input_text: Text to filter.

    Returns:
        The kept words joined by single spaces.
    """
    # Membership is tested once per word, so use a set instead of scanning the
    # stopword list each time.
    stopword_set = frozenset(stopwords.words('english'))
    # Idea noted but not implemented: keep sentiment-bearing words via a
    # whitelist such as ["n't", "not", "no"].
    clean_words = [
        word
        for word in input_text.split()
        if len(word) > 1 and word not in stopword_set
    ]
    return " ".join(clean_words)

def stemming(input_text):
    """Porter-stem every whitespace-separated word of the text.

    Args:
        input_text: Text to stem.

    Returns:
        The stemmed words joined by single spaces.
    """
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in input_text.split())

def newProcess(sample):
    """Run the full text-cleaning chain on one sample.

    The steps are applied in this order: remove_mentions, remove_urls,
    remove_punctuation, remove_digits, to_lower, remove_stopwords, stemming.
    Each step receives the output of the previous one.

    Args:
        sample: Raw text.

    Returns:
        The cleaned text produced by the last step.
    """
    steps = (
        remove_mentions,
        remove_urls,
        remove_punctuation,
        remove_digits,
        to_lower,
        remove_stopwords,
        stemming,
    )
    cleaned = sample
    for step in steps:
        cleaned = step(cleaned)
    return cleaned

In [270]:
""" Data creation """
# Build both arrays in one step. Calling np.append once per row allocates a new
# array and copies everything accumulated so far on every iteration.
# dtype=str yields NumPy unicode arrays, like the per-row append did.
# Read tweets
text_data = np.array(df["text"].tolist(), dtype=str)
# creating target classes (taken from the "id" column)
Y = np.array(df["id"].tolist(), dtype=str)

In [271]:
# Split without shuffling: 25% of the rows are held out as the test set and the
# original row order is preserved.
X_train_, X_test_, y_train, y_test = train_test_split(
    text_data,
    Y,
    test_size=0.25,
    shuffle=False,
)

In [272]:
# try to use sklearn stop_words later
# 711, 0.688
# 1178, 0.978
# max_features=818, ngram_range=(1, 2), min_df = 0
# Bag-of-words counts over unigrams, using the custom preprocessor and tokenizer
# defined above, capped at 700 features with min_df=4 and max_df=0.2.
count = CountVectorizer(
    preprocessor=newProcess,
    tokenizer=myTokenizer,
    max_features=700,
    ngram_range=(1, 1),
    min_df=4,
    max_df=0.2,
)
# Learn the vocabulary from the training split only, then reuse it for the test split.
X_train = count.fit_transform(X_train_).toarray()
X_test = count.transform(X_test_).toarray()
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2. Listing the
# fitted vocabulary ordered by column index prints the same names on any version.
print(sorted(count.vocabulary_, key=count.vocabulary_.get))
# size = len(count.vocabulary_)
print(len(count.vocabulary_))

['aaa', 'abbott', 'abc', 'abcnew', 'abl', 'abort', 'absolut', 'abus', 'account', 'action', 'actual', 'ad', 'admit', 'adopt', 'afford', 'afp', 'afpraid', 'age', 'agenda', 'agent', 'agil', 'aid', 'allow', 'alon', 'alp', 'alreadi', 'also', 'alway', 'amp', 'analysi', 'announc', 'anoth', 'answer', 'anti', 'anyon', 'anyth', 'around', 'ask', 'asylum', 'asylumseek', 'attack', 'avoid', 'away', 'back', 'ban', 'bank', 'banker', 'barnabi', 'barrier', 'base', 'becom', 'behind', 'believ', 'benefit', 'best', 'better', 'betterfutur', 'big', 'biggest', 'bil', 'bill', 'billion', 'bird', 'bishop', 'bit', 'black', 'blame', 'bloodi', 'blow', 'bn', 'boat', 'border', 'bottom', 'bowen', 'break', 'brexit', 'bribe', 'britain', 'broadband', 'broken', 'budget', 'build', 'bulk', 'busi', 'buy', 'cabinet', 'call', 'camp', 'campaign', 'canberra', 'candid', 'cannot', 'carbon', 'care', 'cash', 'cattl', 'caus', 'cayman', 'centr', 'ceo', 'cfa', 'cfmeu', 'chafta', 'chanc', 'chang', 'chao', 'check', 'child', 'childcar', 'c

In [273]:
# Multinomial naive Bayes (alpha=0.75) trained on the count features.
# fit() hands back the estimator, which is kept under the name `model`.
clf = MultinomialNB(alpha=0.75)
model = clf.fit(X=X_train, y=y_train)

In [274]:
# Per-class precision / recall / F1 on the held-out test split.
y_pred = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       10000       0.60      0.64      0.62        56
       10001       0.36      0.28      0.31        36
       10002       0.62      0.48      0.55        31
       10003       0.36      0.57      0.44        87
       10004       0.00      0.00      0.00         2
       10005       0.62      0.67      0.65        52
       10006       0.38      0.39      0.38        44
       10007       0.00      0.00      0.00         2
       10008       0.63      0.74      0.68        46
       10009       0.00      0.00      0.00         4
       10010       0.22      0.18      0.20        11
       10011       0.00      0.00      0.00         7
       10012       0.00      0.00      0.00         4
       10013       0.77      0.62      0.69        37
       10014       0.00      0.00      0.00         6
       10015       0.57      0.67      0.62        24
       10016       0.25      0.07      0.11        14
       10017       0.00    

In [275]:
# Same report on the training split, for comparison with the test-split numbers.
# Note that this overwrites `y_pred` with the training-set predictions.
y_pred = model.predict(X=X_train)
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

       10000       0.83      0.81      0.82       188
       10001       0.77      0.69      0.73       104
       10002       0.69      0.80      0.74        99
       10003       0.69      0.75      0.72       271
       10004       1.00      0.60      0.75        15
       10005       0.73      0.83      0.78       142
       10006       0.84      0.74      0.78       145
       10007       0.00      0.00      0.00         5
       10008       0.79      0.81      0.80       117
       10009       0.73      0.67      0.70        12
       10010       0.56      0.71      0.63        45
       10011       1.00      0.17      0.29         6
       10012       0.78      0.67      0.72        21
       10013       0.78      0.85      0.81        67
       10014       0.92      0.52      0.67        23
       10015       0.81      0.96      0.88        95
       10016       0.68      0.60      0.64        45
       10017       0.82    

In [283]:
# Random forest on the same count features: 100 trees, each limited to depth 2,
# with a fixed random_state so the run is repeatable.
# (RandomForestClassifier is imported in the notebook's import cell.)
clf2 = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model2 = clf2.fit(X_train, y_train)
# Overwrites `y_pred` with the random-forest predictions for the test split.
y_pred = model2.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       10000       0.82      0.41      0.55        56
       10001       0.00      0.00      0.00        36
       10002       0.00      0.00      0.00        31
       10003       0.19      0.99      0.31        87
       10004       0.00      0.00      0.00         2
       10005       0.00      0.00      0.00        52
       10006       1.00      0.07      0.13        44
       10007       0.00      0.00      0.00         2
       10008       0.60      0.07      0.12        46
       10009       0.00      0.00      0.00         4
       10010       0.00      0.00      0.00        11
       10011       0.00      0.00      0.00         7
       10012       0.00      0.00      0.00         4
       10013       0.00      0.00      0.00        37
       10014       0.00      0.00      0.00         6
       10015       0.00      0.00      0.00        24
       10016       0.00      0.00      0.00        14
       10017       0.00    

In [277]:
# Mean accuracy of the fitted naive Bayes model on the test split.
# `model` is the object returned by `clf.fit(...)`; it is used here because the
# name `clf` is rebound to a pipeline in the cross-validation cell, which would
# break this cell when cells are re-run out of order.
model.score(X_test, y_test)

0.49

In [278]:
# y_pred = model_new.predict(text_data[1500:])
# print(classification_report(Y[1500:], y_pred))

In [281]:
# 5-fold cross-validation of the vectorizer + naive Bayes combination on the raw
# text, scored with micro-averaged F1. Putting the vectorizer inside the pipeline
# means it is fitted within each fold rather than on the full data.
# (cross_val_score and make_pipeline are imported in the notebook's import cell.)
# NOTE: `clf` is rebound here from the naive Bayes model to this pipeline.
clf = make_pipeline(
    CountVectorizer(
        preprocessor=newProcess,
        tokenizer=myTokenizer,
        max_features=700,
        ngram_range=(1, 1),
        min_df=4,
        max_df=0.2,
    ),
    MultinomialNB(alpha=0.75),
)
scores = cross_val_score(clf, text_data, Y, cv=5, scoring='f1_micro')
print(scores)
# Reports the mean score and twice the standard deviation across the folds.
print("F1 micro Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[0.48648649 0.46419753 0.50374065 0.50377834 0.52307692]
F1 micro Accuracy: 0.50 (+/- 0.04)


In [280]:
# y_pred = model_new.predict(text_data[:1500])
# print(classification_report(Y[:1500], y_pred))