In [40]:
import pandas as pd
import csv, sys, re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import tree, metrics
from sklearn.utils import shuffle

df = pd.read_csv('dataset.tsv', sep='\t', quoting=csv.QUOTE_NONE , dtype= str, header = None, names=["instance", "text", "id", "sentiment","is_sarcastic"])

In [41]:
df=shuffle(df)
df.head(3)
df.id.value_counts()

10003    358
10000    244
10005    194
10006    189
10008    163
10001    140
10002    130
10015    119
10013    104
10016     59
10010     56
10019     52
10017     47
10018     38
10014     29
10012     25
10004     17
10009     16
10011     13
10007      7
Name: id, dtype: int64

In [42]:
text_data = np.array([])

In [43]:
for text in df.text:
    text_data = np.append(text_data,text)

In [44]:
def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", sample)


def remove_punctuation(sample):
    """Remove punctuations from a sample string"""
    punctuations = '''!"&'()*+,-./:;<=>?[\]^`{|}~'''
    no_punct = ""
    for char in sample:
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct


def myPreprocessor(sample):
    """Customized preprocessor"""
    sample = remove_URL(sample)
    sample = remove_punctuation(sample)
    return sample


def myTokenizer(sample):
    """Customized tokenizer"""
    new_words = []
    words = sample.split(' ')
    new_words = [word for word in words if len(word) >= 2]
    return new_words

In [45]:
count = CountVectorizer(preprocessor=myPreprocessor, lowercase = True, tokenizer=myTokenizer ,max_features = 200)
bag_of_words = count.fit_transform(text_data)
print(count.get_feature_names())

['#AFPRaids', '#AusPol', '#AusVotes', '#Auspol', '#Ausvotes', '#Brexit', '#Election2016', '#LNP', '#LNPfail', '#Labor', '#Medicare', '#NBN', '#Parakeelia', '#auspol', '#ausvotes', '#ausvotes2016', '#insiders', '#npc', '#qanda', '@AustralianLabor', '@LiberalAus', '@RichardDiNatale', '@TurnbullMalcolm', '@billshortenmp', 'AFP', 'ALP', 'Abbott', 'Australia', 'Australian', 'Australians', 'Bill', 'Coalition', 'Dutton', 'Government', 'Govt', 'Greens', 'How', 'If', 'Its', 'Joyce', 'LNP', 'Labor', 'Labors', 'Liberal', 'Liberals', 'Libs', 'Malcolm', 'Medicare', 'Morrison', 'NBN', 'No', 'Not', 'PM', 'Party', 'Peter', 'Shorten', 'So', 'THE', 'The', 'This', 'Turnbull', 'Turnbulls', 'VOTE', 'Vote', 'We', 'What', 'Why', 'You', 'about', 'after', 'again', 'all', 'amp', 'an', 'and', 'any', 'are', 'as', 'at', 'back', 'be', 'because', 'been', 'big', 'boats', 'budget', 'business', 'but', 'by', 'campaign', 'can', 'cant', 'care', 'could', 'cut', 'cuts', 'did', 'do', 'dont', 'down', 'economic', 'economy', 'e

In [46]:
print(count.vocabulary_)

{'If': 37, 'who': 190, 'his': 120, 'from': 106, 'Bill': 30, 'Shorten': 55, 'amp': 72, 'its': 127, 'with': 193, 'own': 149, 'wants': 185, 'to': 178, 'it': 126, '#auspol': 13, '@AustralianLabor': 19, 'have': 117, 'an': 73, 'about': 68, 'budget': 85, '#ausvotes': 14, '#Election2016': 6, 'again': 70, 'up': 180, 'on': 142, 'and': 74, 'pay': 151, 'The': 58, 'will': 192, 'be': 80, 'Australians': 29, '@LiberalAus': 20, 'no': 138, 'funding': 107, 'the': 170, 'Medicare': 47, 'take': 165, 'over': 148, 'by': 88, 'LNP': 40, 'Liberals': 44, 'not': 139, '#Parakeelia': 12, 'are': 76, 'in': 123, 'you': 198, 'need': 137, 'know': 130, 'just': 129, 'your': 199, 'Its': 38, 'of': 141, '#LNP': 7, 'tax': 166, 'cuts': 95, 'what': 188, 'this': 176, 'says': 161, 'is': 125, 'plan': 153, 'would': 197, '#NBN': 11, 'back': 79, 'Not': 51, 'for': 105, 'if': 122, '@TurnbullMalcolm': 22, '@billshortenmp': 23, 'good': 111, 'than': 168, 'that': 169, 'Joyce': 39, 'now': 140, 'We': 64, 'was': 186, 'under': 179, 'Labor': 41,

In [47]:
X = bag_of_words.toarray()

In [48]:
#target classes
Y = np.array([])
for text in df.id:
    Y = np.append(Y,text)

In [49]:
X_train = X[:1500]
X_test = X[1500:]
y_train = Y[:1500]
y_test = Y[1500:]

In [50]:
#clf = MultinomialNB()
#clf = BernoulliNB()
clf = tree.DecisionTreeClassifier(criterion='entropy',random_state=0) 
model = clf.fit(X_train, y_train)

In [51]:
predicted_y = model.predict(X_test)

In [52]:
print(accuracy_score(y_test, predicted_y))

0.27


In [53]:
print(y_test, predicted_y)
print(model.predict_proba(X_test))
print(accuracy_score(y_test, predicted_y))
print(precision_score(y_test, predicted_y, average='micro'))
print(recall_score(y_test, predicted_y, average='micro'))
print(f1_score(y_test, predicted_y, average='micro', labels = np.unique(predicted_y)))
print(f1_score(y_test, predicted_y, average='macro', labels = np.unique(predicted_y)))
print(classification_report(y_test, predicted_y,output_dict= False, labels = np.unique(predicted_y)))

['10003' '10003' '10006' '10000' '10013' '10002' '10008' '10002' '10006'
 '10003' '10000' '10000' '10006' '10001' '10003' '10003' '10013' '10003'
 '10008' '10013' '10001' '10006' '10003' '10005' '10006' '10013' '10003'
 '10006' '10006' '10003' '10000' '10014' '10003' '10002' '10008' '10002'
 '10003' '10018' '10009' '10008' '10009' '10015' '10006' '10013' '10001'
 '10015' '10000' '10017' '10001' '10008' '10014' '10015' '10005' '10012'
 '10015' '10010' '10002' '10005' '10003' '10008' '10016' '10003' '10003'
 '10019' '10015' '10005' '10005' '10006' '10015' '10001' '10006' '10003'
 '10003' '10003' '10005' '10018' '10004' '10017' '10008' '10004' '10003'
 '10006' '10003' '10015' '10006' '10003' '10000' '10017' '10001' '10010'
 '10003' '10005' '10002' '10003' '10006' '10000' '10012' '10015' '10016'
 '10003' '10003' '10005' '10017' '10007' '10000' '10016' '10001' '10003'
 '10006' '10019' '10003' '10008' '10006' '10003' '10006' '10009' '10015'
 '10016' '10003' '10015' '10000' '10018' '10016' '1