In [102]:
import pandas as pd
import csv, sys, re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import tree, metrics
from sklearn.utils import shuffle

df = pd.read_csv('dataset.tsv', sep='\t', quoting=csv.QUOTE_NONE , dtype= str, header = None, names=["instance", "text", "id", "sentiment","is_sarcastic"])

In [103]:
df=shuffle(df)

In [104]:
text_data = np.array([])

In [105]:
for text in df.text:
    text_data = np.append(text_data,text)

In [106]:
def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", sample)


def remove_punctuation(sample):
    """Remove punctuations from a sample string"""
    punctuations = '''!"&'()*+,-./:;<=>?[\]^`{|}~'''
    no_punct = ""
    for char in sample:
        if char not in punctuations:
            no_punct = no_punct + char
        else:
            no_punct = no_punct + " "
    return no_punct


def myPreprocessor(sample):
    """Customized preprocessor"""
    sample = remove_URL(sample)
    sample = remove_punctuation(sample)
    return sample


def myTokenizer(sample):
    """Customized tokenizer"""
    new_words = []
    words = sample.split(' ')
    new_words = [word for word in words if len(word) >= 2]
    return new_words

In [107]:
myPreprocessor(df.text[5])

'@ABC24 @PM  Chaotic hung Govt  Most chaos was by LNP    Interrupting Q time disgusting treatment of P Slipper  and female PM etc #ausvotes'

In [108]:
myTokenizer(myPreprocessor(df.text[5]))

['@ABC24',
 '@PM',
 'Chaotic',
 'hung',
 'Govt',
 'Most',
 'chaos',
 'was',
 'by',
 'LNP',
 'Interrupting',
 'time',
 'disgusting',
 'treatment',
 'of',
 'Slipper',
 'and',
 'female',
 'PM',
 'etc',
 '#ausvotes']

In [120]:
count = CountVectorizer(preprocessor=myPreprocessor, lowercase = False, tokenizer=myTokenizer ,max_features = None)
bag_of_words = count.fit_transform(text_data)
print(count.get_feature_names())



In [121]:
print(count.vocabulary_)



In [122]:
X = bag_of_words.toarray()
print(df.text[2])
print(X[1])

Peter Dutton's been  “An Outstanding Immigration Minister” https://t.co/h42MHZERwS #auspol #immigration
[0 0 0 ... 0 0 0]


In [123]:
#target classes
Y = np.array([])
for text in df.id:
    Y = np.append(Y,text)

In [124]:
X_train = X[:1500]
X_test = X[1500:]
y_train = Y[:1500]
y_test = Y[1500:]
print(X_train[0])
print(y_train[0])

[0 0 0 ... 0 0 0]
10005


In [125]:
#clf = MultinomialNB()
clf = BernoulliNB()
#clf = tree.DecisionTreeClassifier(criterion='entropy',random_state=0) 
model = clf.fit(X_train, y_train)

In [126]:
predicted_y = model.predict(X_test)

In [127]:
print(accuracy_score(y_test, predicted_y))

0.222


In [128]:
print(y_test, predicted_y)
print(model.predict_proba(X_test))
print(accuracy_score(y_test, predicted_y))
print(precision_score(y_test, predicted_y, average='micro'))
print(recall_score(y_test, predicted_y, average='micro'))
print(f1_score(y_test, predicted_y, average='micro', labels = np.unique(predicted_y)))
print(f1_score(y_test, predicted_y, average='macro', labels = np.unique(predicted_y)))
print(classification_report(y_test, predicted_y,output_dict= False, labels = np.unique(predicted_y)))

['10008' '10012' '10017' '10017' '10002' '10016' '10005' '10001' '10008'
 '10005' '10015' '10002' '10003' '10008' '10013' '10006' '10003' '10003'
 '10019' '10000' '10002' '10003' '10000' '10002' '10018' '10003' '10016'
 '10002' '10013' '10005' '10003' '10003' '10003' '10005' '10003' '10011'
 '10008' '10002' '10003' '10008' '10000' '10006' '10010' '10003' '10005'
 '10015' '10013' '10017' '10000' '10001' '10008' '10006' '10005' '10003'
 '10005' '10002' '10003' '10014' '10005' '10000' '10006' '10003' '10018'
 '10003' '10014' '10003' '10005' '10010' '10003' '10001' '10005' '10005'
 '10001' '10006' '10016' '10008' '10005' '10006' '10006' '10002' '10003'
 '10001' '10003' '10010' '10002' '10006' '10018' '10002' '10008' '10015'
 '10008' '10005' '10000' '10000' '10003' '10001' '10003' '10002' '10008'
 '10015' '10019' '10000' '10005' '10003' '10013' '10003' '10003' '10000'
 '10008' '10017' '10003' '10005' '10000' '10000' '10009' '10001' '10003'
 '10003' '10002' '10003' '10000' '10010' '10010' '1

In [118]:
type(model.predict_proba(X_test))

numpy.ndarray

In [119]:
model.predict_proba(X_test)[4]

array([4.77057741e-01, 2.20977133e-02, 2.97741772e-02, 1.67907347e-01,
       1.00491937e-06, 6.59815214e-02, 1.64310357e-01, 1.59381668e-11,
       2.35877791e-02, 9.48154630e-08, 9.89387036e-04, 7.61315225e-09,
       1.42227379e-04, 2.70467462e-02, 8.64424032e-05, 6.36621977e-03,
       6.10463663e-03, 5.03213397e-04, 5.49815865e-04, 7.49356773e-03])