In [398]:
import csv
import re
import sys
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn import metrics, tree
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle

In [399]:
# Load the tab-separated dataset. The file has no header row, so column names
# are supplied here; every column is read as a string and quote characters are
# treated as ordinary text (QUOTE_NONE).
df = pd.read_csv(
    'dataset.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    dtype=str,
    encoding='utf-8',
    header=None,
    names=["instance", "text", "id", "sentiment", "is_sarcastic"],
)

In [400]:
""" Functions for text pre-processing """


def remove_URL(sample):
    """Strip URL-like tokens from ``sample``.

    A URL is taken to be ``http`` followed by one or more non-whitespace
    characters; each such run is deleted and the surrounding text is left
    untouched (including the spaces on either side).
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", sample)


def remove_punctuation(sample):
    """Delete punctuation from ``sample``.

    Every character that is not a word character, whitespace, ``@`` or ``#``
    is removed, so mentions and hashtags keep their leading symbol.
    """
    unwanted_chars = re.compile(r'[^\w\s\@\#]')
    return unwanted_chars.sub('', sample)

def myTokenizer(sample):
    """Split ``sample`` on single spaces and return the tokens worth keeping.

    A token is dropped when it is shorter than two characters, or when its
    lower-cased form starts with ``au`` or ``#aus``. Kept tokens are returned
    in their original casing and order.
    """
    kept_tokens = []
    for token in sample.split(' '):
        # Single characters and the empty strings produced by repeated spaces.
        if len(token) < 2:
            continue
        lowered = token.lower()
        if lowered.startswith('au') or lowered.startswith('#aus'):
            continue
        kept_tokens.append(token)
    return kept_tokens

def remove_stopwords_NLTK(sample):
    """Remove English stopwords from ``sample`` using NLTK's stopword list.

    The text is split on single spaces; tokens shorter than two characters
    are discarded, as are tokens found in NLTK's English stopword list.

    The comparison is case-insensitive: NLTK's list is lowercase, so each
    token is lower-cased for the lookup only ("The" is removed just like
    "the"). Surviving tokens keep their original casing.

    Returns the surviving tokens joined by single spaces, with trailing
    whitespace stripped.
    """
    stop_words = set(stopwords.words('english'))
    kept_words = [
        word
        for word in sample.split(' ')
        if len(word) >= 2 and word.lower() not in stop_words
    ]
    # join() avoids rebuilding the result string once per word.
    return " ".join(kept_words).rstrip()

def remove_digits(input_text):
    """Remove every run of digit characters from ``input_text``.

    Returns the input with all digits deleted; all other characters,
    including whitespace around the removed digits, are left in place.
    """
    # Raw string: a plain '\d' literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on current Python versions.
    return re.sub(r'\d+', '', input_text)

def porter_stem(sample):
    """Apply Porter stemming to each token of ``sample``.

    The text is split on single spaces and tokens shorter than two characters
    are discarded. The stems of the remaining tokens are joined by single
    spaces, and trailing whitespace is stripped from the result.
    """
    stemmer = PorterStemmer()
    stems = [
        stemmer.stem(token)
        for token in sample.split(' ')
        if len(token) >= 2
    ]
    return " ".join(stems).rstrip()

def myPreprocessor(sample):
    """Clean a raw sample: strip URLs first, then punctuation.

    URLs are removed before punctuation so that the ``http...`` runs are
    still intact when they are matched.
    """
    return remove_punctuation(remove_URL(sample))


In [401]:
""" Data creation """
# Read tweets: pull the whole column out in one vectorised call instead of
# growing an array with np.append row by row (which copies the array each time).
# dtype=str produces a NumPy string array, with every element stringified.
text_data = df["text"].to_numpy(dtype=str)
# creating target classes
Y = df["sentiment"].to_numpy(dtype=str)

In [402]:
# Hold out 25% of the samples for testing. shuffle=False keeps the rows in
# file order, so the split is deterministic across runs.
X_train_, X_test_, y_train, y_test = train_test_split(
    text_data,
    Y,
    test_size=0.25,
    shuffle=False,
)

In [403]:
# Bag-of-words (unigram) features, capped at the 1400 most frequent terms and
# ignoring terms that occur in more than 40% of the training documents.
# NOTE(review): per the scikit-learn docs a custom `preprocessor` replaces the
# built-in lowercase/strip_accents step, so features are case-sensitive here
# ("Tax" and "tax" are separate columns) - confirm this is intended.
count = CountVectorizer(
    preprocessor=myPreprocessor,
    tokenizer=myTokenizer,
    max_features=1400,
    ngram_range=(1, 1),
    min_df=1,
    max_df=0.4,
)
# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train = count.fit_transform(X_train_).toarray()
X_test = count.transform(X_test_).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(count, "get_feature_names_out"):
    feature_names = count.get_feature_names_out().tolist()
else:
    feature_names = count.get_feature_names()
print(feature_names)



In [404]:
# Multinomial Naive Bayes with additive smoothing parameter alpha = 1.0.
clf = MultinomialNB(alpha=1.0)
# fit() returns the fitted estimator itself, so `model` and `clf` are the same object.
model = clf.fit(X=X_train, y=y_train)

In [405]:
# Evaluate on the held-out test split.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    negative       0.80      0.89      0.84       335
     neutral       0.59      0.54      0.56       125
    positive       0.75      0.23      0.35        40

   micro avg       0.75      0.75      0.75       500
   macro avg       0.71      0.55      0.58       500
weighted avg       0.74      0.75      0.73       500



In [406]:
# Evaluate on the training split for comparison with the test-split report.
# Note that this rebinds `y_pred` to the training-set predictions.
y_pred = model.predict(X_train)
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

    negative       0.90      0.94      0.92       959
     neutral       0.83      0.81      0.82       428
    positive       0.93      0.67      0.78       113

   micro avg       0.88      0.88      0.88      1500
   macro avg       0.88      0.81      0.84      1500
weighted avg       0.88      0.88      0.88      1500



In [407]:
# 5-fold cross-validation over the whole dataset. The vectorizer is part of the
# pipeline, so its vocabulary is re-fitted on the training portion of each fold.
# cross_val_score and make_pipeline are imported in the notebook's import cell.
# Note: this rebinds `clf` to the pipeline; `model` still refers to the
# standalone classifier fitted earlier.
clf = make_pipeline(
    CountVectorizer(
        preprocessor=myPreprocessor,
        tokenizer=myTokenizer,
        max_features=1400,
        ngram_range=(1, 1),
        min_df=1,
        max_df=0.4,
    ),
    MultinomialNB(alpha=1.0),
)
scores = cross_val_score(clf, text_data, Y, cv=5, scoring='f1_micro')
print(scores)
# Report the mean score with a +/- two-standard-deviation spread across folds.
print("F1 micro Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[0.76309227 0.72568579 0.73566085 0.74185464 0.74120603]
F1 micro Accuracy: 0.74 (+/- 0.02)


In [408]:
# y_pred = model_new.predict(text_data[:1500])
# print(classification_report(Y[:1500], y_pred))