In [5]:
import pandas as pd
import csv, sys, re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import tree, metrics
from sklearn.utils import shuffle

df = pd.read_csv('dataset.tsv', sep='\t', quoting=csv.QUOTE_NONE , dtype= str, header = None, names=["instance", "text", "id", "sentiment","is_sarcastic"])

In [6]:
df=shuffle(df)
df.head(3)

Unnamed: 0,instance,text,id,sentiment,is_sarcastic
70,71,When Turnbull babbles on about science &amp; i...,10012,negative,False
1951,1952,? VOTE LNP OUT ? TURNBULL IS TRYING TO KEEP NB...,10015,negative,False
1515,1516,First he shut down @abcfactcheck. Now @Turnbul...,10015,negative,False


In [7]:
text_data = np.array([])

In [9]:
for text in df.text:
    text_data = np.append(text_data,text)

In [10]:
def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", sample)


def remove_punctuation(sample):
    """Remove punctuations from a sample string"""
    punctuations = '''!"&'()*+,-./:;<=>?[\]^`{|}~'''
    no_punct = ""
    for char in sample:
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct


def myPreprocessor(sample):
    """Customized preprocessor"""
    sample = remove_URL(sample)
    sample = remove_punctuation(sample)
    return sample


def myTokenizer(sample):
    """Customized tokenizer"""
    new_words = []
    words = sample.split(' ')
    new_words = [word for word in words if len(word) >= 2]
    return new_words

In [11]:
count = CountVectorizer(preprocessor=myPreprocessor, lowercase = True, tokenizer=myTokenizer ,max_features = 200)
bag_of_words = count.fit_transform(text_data)
print(count.get_feature_names())

['#AFPRaids', '#AusPol', '#AusVotes', '#Auspol', '#Ausvotes', '#Brexit', '#Election2016', '#LNP', '#LNPfail', '#Labor', '#Medicare', '#NBN', '#Parakeelia', '#auspol', '#ausvotes', '#ausvotes2016', '#insiders', '#npc', '#qanda', '@AustralianLabor', '@LiberalAus', '@RichardDiNatale', '@TurnbullMalcolm', '@billshortenmp', 'AFP', 'ALP', 'Abbott', 'Australia', 'Australian', 'Australians', 'Bill', 'Coalition', 'Dutton', 'Government', 'Govt', 'Greens', 'How', 'If', 'Its', 'Joyce', 'LNP', 'Labor', 'Labors', 'Liberal', 'Liberals', 'Libs', 'Malcolm', 'Medicare', 'Morrison', 'NBN', 'No', 'Not', 'PM', 'Party', 'Peter', 'Shorten', 'So', 'THE', 'The', 'This', 'Turnbull', 'Turnbulls', 'VOTE', 'Vote', 'We', 'What', 'Why', 'You', 'about', 'after', 'again', 'all', 'amp', 'an', 'and', 'any', 'are', 'as', 'at', 'back', 'be', 'because', 'been', 'big', 'boats', 'budget', 'business', 'but', 'by', 'campaign', 'can', 'cant', 'care', 'could', 'cut', 'cuts', 'did', 'do', 'dont', 'down', 'economic', 'economy', 'e

In [12]:
print(count.vocabulary_)

{'Turnbull': 60, 'on': 142, 'about': 68, 'amp': 72, 'want': 184, '#LNP': 7, 'has': 116, 'to': 178, 'the': 170, 'down': 99, '#auspol': 13, '#ausvotes': 14, 'VOTE': 62, 'LNP': 40, 'NBN': 49, 'THE': 57, 'he': 118, '@TurnbullMalcolm': 22, 'ALP': 25, 'of': 141, 'his': 120, '#NBN': 11, 'Shorten': 55, 'no': 138, 'for': 105, 'PM': 52, 'Turnbulls': 61, 'are': 76, 'in': 123, 'right': 159, 'our': 146, 'AFP': 24, 'Labor': 41, 'was': 186, 'Peter': 54, 'Dutton': 32, 'refugees': 158, 'jobs': 128, 'what': 188, 'own': 149, 'says': 161, '@billshortenmp': 23, 'Medicare': 47, 'that': 169, 'people': 152, 'care': 92, 'education': 102, 'health': 119, 'who': 190, '#LNPfail': 8, 'now': 140, 'business': 86, 'be': 80, 'and': 74, 'If': 37, 'Australians': 29, 'into': 124, 'how': 121, '@LiberalAus': 20, 'you': 198, 'they': 174, 'at': 78, 'not': 139, 'good': 111, 'The': 58, 'Libs': 45, 'cut': 94, 'them': 172, 'will': 192, '#insiders': 16, '#Parakeelia': 12, 'this': 176, 'cuts': 95, 'Morrison': 48, 'an': 73, 'have': 

In [13]:
X = bag_of_words.toarray()

In [14]:
#target classes
Y = np.array([])
for text in df.id:
    Y = np.append(Y,text)

In [15]:
X_train = X[:1500]
X_test = X[1500:]
y_train = Y[:1500]
y_test = Y[1500:]

In [16]:
#clf = MultinomialNB()
#clf = BernoulliNB()
clf = tree.DecisionTreeClassifier(criterion='entropy',random_state=0) 
model = clf.fit(X_train, y_train)

In [17]:
predicted_y = model.predict(X_test)

In [18]:
print(accuracy_score(y_test, predicted_y))

0.23


In [19]:
print(y_test, predicted_y)
print(model.predict_proba(X_test))
print(accuracy_score(y_test, predicted_y))
print(precision_score(y_test, predicted_y, average='micro'))
print(recall_score(y_test, predicted_y, average='micro'))
print(f1_score(y_test, predicted_y, average='micro'))
print(f1_score(y_test, predicted_y, average='macro'))
print(classification_report(y_test, predicted_y,output_dict= False))

['10001' '10017' '10010' '10018' '10000' '10005' '10001' '10018' '10001'
 '10005' '10003' '10017' '10003' '10001' '10002' '10006' '10005' '10000'
 '10019' '10000' '10009' '10000' '10003' '10006' '10003' '10008' '10002'
 '10003' '10008' '10006' '10001' '10003' '10005' '10008' '10001' '10012'
 '10000' '10001' '10001' '10005' '10000' '10000' '10002' '10007' '10006'
 '10003' '10000' '10015' '10008' '10001' '10008' '10001' '10000' '10000'
 '10013' '10005' '10006' '10005' '10003' '10006' '10003' '10001' '10003'
 '10003' '10005' '10016' '10017' '10005' '10000' '10006' '10002' '10018'
 '10000' '10009' '10000' '10006' '10000' '10013' '10003' '10003' '10001'
 '10008' '10002' '10015' '10005' '10006' '10000' '10003' '10006' '10003'
 '10000' '10003' '10013' '10002' '10016' '10005' '10000' '10005' '10002'
 '10008' '10003' '10015' '10013' '10013' '10000' '10000' '10003' '10003'
 '10018' '10003' '10000' '10012' '10003' '10000' '10000' '10002' '10010'
 '10013' '10003' '10015' '10003' '10003' '10000' '1