In [1]:
import pandas as pd
import csv, sys, re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import tree, metrics
from sklearn.utils import shuffle

# Load the tab-separated corpus as strings; the file has no header row, so
# the column names are supplied explicitly.
df = pd.read_csv(
    'dataset.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    dtype=str,
    header=None,
    names=["instance", "text", "id", "sentiment", "is_sarcastic"],
)

In [2]:
# Shuffle the rows with a fixed seed: the train/test split further down is
# positional, so an unseeded shuffle would give different scores on every
# Restart & Run All.
df = shuffle(df, random_state=0)

In [3]:
# Extract the text column as a NumPy string array in a single conversion.
# (Growing an array with np.append inside a loop re-copies it on every
# iteration, which is quadratic in the number of rows.)
text_data = np.asarray(df.text, dtype=str)

In [4]:
def remove_URL(sample):
    """Return ``sample`` with every ``http`` + non-whitespace run deleted.

    The match starts at the literal text ``http`` and extends to the next
    whitespace character (or end of string); it is replaced by nothing, so
    surrounding spaces are left as they were.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", sample)


def remove_punctuation(sample):
    """Remove punctuation characters from a sample string.

    Only the characters listed in ``punctuations`` are stripped. ``#``, ``@``,
    ``$``, ``%`` and ``_`` are not in that set, so they are kept.

    Args:
        sample: The string to clean.

    Returns:
        A copy of ``sample`` with every listed punctuation character deleted.
    """
    # Raw string: the backslash is one of the characters to strip, not the
    # start of an escape sequence (a non-raw "\]" is an invalid escape and
    # triggers a SyntaxWarning on recent Python versions).
    punctuations = r'''!"&'()*+,-./:;<=>?[\]^`{|}~'''
    # A deletion table removes all listed characters in one pass instead of
    # rebuilding the result string one character at a time.
    return sample.translate(str.maketrans('', '', punctuations))


def myPreprocessor(sample):
    """Clean a raw sample: strip URLs first, then strip punctuation."""
    return remove_punctuation(remove_URL(sample))


def myTokenizer(sample):
    """Split ``sample`` on single spaces, keeping tokens of two or more characters."""
    return [token for token in sample.split(' ') if len(token) > 1]

In [5]:
# Case-sensitive bag-of-words using the custom preprocessor/tokenizer, with the
# vocabulary capped at 200 features.
count = CountVectorizer(preprocessor=myPreprocessor, lowercase=False, tokenizer=myTokenizer, max_features=200)
bag_of_words = count.fit_transform(text_data)
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# when the installed version has it, and fall back on older releases.
if hasattr(count, "get_feature_names_out"):
    print(count.get_feature_names_out().tolist())
else:
    print(count.get_feature_names())
print(count.vocabulary_)

['#AFPRaids', '#AusPol', '#AusVotes', '#Auspol', '#Ausvotes', '#Brexit', '#Election2016', '#LNP', '#LNPfail', '#Labor', '#Medicare', '#NBN', '#Parakeelia', '#auspol', '#ausvotes', '#ausvotes2016', '#insiders', '#npc', '#qanda', '@AustralianLabor', '@LiberalAus', '@RichardDiNatale', '@TurnbullMalcolm', '@billshortenmp', 'AFP', 'ALP', 'Abbott', 'Australia', 'Australian', 'Australians', 'Bill', 'Coalition', 'Dutton', 'Government', 'Govt', 'Greens', 'How', 'If', 'Its', 'Joyce', 'LNP', 'Labor', 'Labors', 'Liberal', 'Liberals', 'Libs', 'Malcolm', 'Medicare', 'Morrison', 'NBN', 'No', 'Not', 'PM', 'Party', 'Peter', 'Shorten', 'So', 'THE', 'The', 'This', 'Turnbull', 'Turnbulls', 'VOTE', 'Vote', 'We', 'What', 'Why', 'You', 'about', 'after', 'again', 'all', 'amp', 'an', 'and', 'any', 'are', 'as', 'at', 'back', 'be', 'because', 'been', 'big', 'boats', 'budget', 'business', 'but', 'by', 'campaign', 'can', 'cant', 'care', 'could', 'cut', 'cuts', 'did', 'do', 'dont', 'down', 'economic', 'economy', 'e

In [6]:
# Materialise the document-term count matrix as a dense array for the classifier.
X = bag_of_words.toarray()

In [7]:
# Target classes: the sentiment column as a NumPy string array, converted in
# one step rather than appended row by row (np.append re-copies the array on
# every call).
Y = np.asarray(df.sentiment, dtype=str)

In [8]:
# Positional hold-out split: the first 1500 rows train the model and the
# remaining rows are kept for evaluation.
X_train, X_test = X[:1500], X[1500:]
y_train, y_test = Y[:1500], Y[1500:]

In [9]:
# Alternative classifiers that can be swapped in:
#   clf = MultinomialNB()
#   clf = BernoulliNB()
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
model = clf.fit(X_train, y_train)

In [10]:
# Predict a class label for every held-out row.
predicted_y = model.predict(X_test)

In [11]:
# Share of held-out rows whose predicted label matches the true label.
print(accuracy_score(y_test, predicted_y))

0.656


In [12]:
# Classes that actually occur in the predictions; the F1 scores and the report
# are restricted to these labels.
labels_predicted = np.unique(predicted_y)

print(y_test, predicted_y)
print(model.predict_proba(X_test))
print(accuracy_score(y_test, predicted_y))
print(precision_score(y_test, predicted_y, average='micro'))
print(recall_score(y_test, predicted_y, average='micro'))
for averaging in ('micro', 'macro'):
    print(f1_score(y_test, predicted_y, average=averaging, labels=labels_predicted))
print(classification_report(y_test, predicted_y, output_dict=False, labels=labels_predicted))

['negative' 'negative' 'negative' 'negative' 'neutral' 'negative'
 'negative' 'neutral' 'negative' 'positive' 'neutral' 'neutral' 'negative'
 'negative' 'positive' 'neutral' 'neutral' 'negative' 'negative'
 'positive' 'negative' 'neutral' 'neutral' 'negative' 'negative'
 'negative' 'positive' 'negative' 'negative' 'neutral' 'negative'
 'negative' 'negative' 'positive' 'neutral' 'negative' 'negative'
 'negative' 'neutral' 'negative' 'negative' 'negative' 'negative'
 'negative' 'neutral' 'negative' 'negative' 'negative' 'negative'
 'neutral' 'negative' 'neutral' 'neutral' 'neutral' 'neutral' 'negative'
 'neutral' 'negative' 'positive' 'neutral' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'neutral'
 'neutral' 'negative' 'negative' 'negative' 'negative' 'positive'
 'neutral' 'negative' 'negative' 'negative' 'negative' 'neutral'
 'positive' 'neutral' 'neutral' 'negative' 'positive' 'negative'
 'negative' 'negative' 'negative' 'neutral' 'negative' 'positive'