In [129]:
import pandas as pd
import csv, sys, re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import tree, metrics
from sklearn.utils import shuffle
from sklearn.dummy import DummyClassifier

df = pd.read_csv('dataset.tsv', sep='\t', quoting=csv.QUOTE_NONE , dtype= str, header = None, names=["instance", "text", "id", "sentiment","is_sarcastic"])

In [130]:
df=shuffle(df)

In [131]:
text_data = np.array([])

In [132]:
for text in df.text:
    text_data = np.append(text_data,text)

In [133]:
def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", sample)


def remove_punctuation(sample):
    """Remove punctuations from a sample string"""
    punctuations = '''!"&'()*+,-./:;<=>?[\]^`{|}~'''
    no_punct = ""
    for char in sample:
        if char not in punctuations:
            no_punct = no_punct + char
        else:
            no_punct = no_punct + " "
    return no_punct


def myPreprocessor(sample):
    """Customized preprocessor"""
    sample = remove_URL(sample)
    sample = remove_punctuation(sample)
    return sample


def myTokenizer(sample):
    """Customized tokenizer"""
    new_words = []
    words = sample.split(' ')
    new_words = [word for word in words if len(word) >= 2]
    return new_words

In [134]:
myPreprocessor(df.text[5])

'@ABC24 @PM  Chaotic hung Govt  Most chaos was by LNP    Interrupting Q time disgusting treatment of P Slipper  and female PM etc #ausvotes'

In [135]:
myTokenizer(myPreprocessor(df.text[5]))

['@ABC24',
 '@PM',
 'Chaotic',
 'hung',
 'Govt',
 'Most',
 'chaos',
 'was',
 'by',
 'LNP',
 'Interrupting',
 'time',
 'disgusting',
 'treatment',
 'of',
 'Slipper',
 'and',
 'female',
 'PM',
 'etc',
 '#ausvotes']

In [136]:
count = CountVectorizer(preprocessor=myPreprocessor, lowercase = False, tokenizer=myTokenizer ,max_features = None)
bag_of_words = count.fit_transform(text_data)
print(count.get_feature_names())



In [137]:
print(count.vocabulary_)



In [138]:
X = bag_of_words.toarray()
print(df.text[2])
print(X[1])

Peter Dutton's been  “An Outstanding Immigration Minister” https://t.co/h42MHZERwS #auspol #immigration
[0 0 0 ... 0 0 0]


In [139]:
#target classes
Y = np.array([])
for text in df.id:
    Y = np.append(Y,text)

In [140]:
X_train = X[:1500]
X_test = X[1500:]
y_train = Y[:1500]
y_test = Y[1500:]
print(X_train[0])
print(y_train[0])

[0 0 0 ... 0 0 0]
10001


In [141]:
#clf = MultinomialNB()
clf = BernoulliNB()
#clf = tree.DecisionTreeClassifier(criterion='entropy',random_state=0) 
model = clf.fit(X_train, y_train)
clf2 = DummyClassifier(strategy= 'most_frequent')
model2 = clf2.fit(X_train,y_train)

In [142]:
predicted_y = model.predict(X_test)
predicted_y2 = model2.predict(X_test)

In [147]:
print(accuracy_score(y_test, predicted_y))
print(accuracy_score(y_test, predicted_y2))

0.192
0.186


In [149]:
y_test

array(['10018', '10015', '10013', '10005', '10000', '10018', '10003',
       '10000', '10008', '10013', '10003', '10006', '10003', '10005',
       '10000', '10013', '10003', '10005', '10005', '10000', '10003',
       '10000', '10003', '10003', '10002', '10017', '10003', '10000',
       '10008', '10003', '10003', '10015', '10015', '10002', '10003',
       '10008', '10001', '10001', '10001', '10011', '10005', '10006',
       '10003', '10003', '10005', '10002', '10019', '10017', '10005',
       '10000', '10008', '10016', '10018', '10015', '10016', '10000',
       '10008', '10006', '10016', '10002', '10008', '10003', '10003',
       '10019', '10009', '10003', '10000', '10008', '10010', '10002',
       '10015', '10008', '10005', '10001', '10003', '10008', '10005',
       '10008', '10001', '10000', '10005', '10002', '10003', '10005',
       '10006', '10006', '10010', '10019', '10003', '10015', '10010',
       '10003', '10017', '10014', '10001', '10006', '10017', '10003',
       '10000', '100

In [144]:
print(y_test, predicted_y)
print(model.predict_proba(X_test))
print(accuracy_score(y_test, predicted_y))
print(precision_score(y_test, predicted_y, average='micro'))
print(recall_score(y_test, predicted_y, average='micro'))
print(f1_score(y_test, predicted_y, average='micro', labels = np.unique(predicted_y)))
print(f1_score(y_test, predicted_y, average='macro', labels = np.unique(predicted_y)))
print(classification_report(y_test, predicted_y,output_dict= False, labels = np.unique(predicted_y)))

['10018' '10015' '10013' '10005' '10000' '10018' '10003' '10000' '10008'
 '10013' '10003' '10006' '10003' '10005' '10000' '10013' '10003' '10005'
 '10005' '10000' '10003' '10000' '10003' '10003' '10002' '10017' '10003'
 '10000' '10008' '10003' '10003' '10015' '10015' '10002' '10003' '10008'
 '10001' '10001' '10001' '10011' '10005' '10006' '10003' '10003' '10005'
 '10002' '10019' '10017' '10005' '10000' '10008' '10016' '10018' '10015'
 '10016' '10000' '10008' '10006' '10016' '10002' '10008' '10003' '10003'
 '10019' '10009' '10003' '10000' '10008' '10010' '10002' '10015' '10008'
 '10005' '10001' '10003' '10008' '10005' '10008' '10001' '10000' '10005'
 '10002' '10003' '10005' '10006' '10006' '10010' '10019' '10003' '10015'
 '10010' '10003' '10017' '10014' '10001' '10006' '10017' '10003' '10000'
 '10001' '10019' '10006' '10006' '10001' '10015' '10002' '10000' '10006'
 '10005' '10018' '10003' '10012' '10001' '10005' '10002' '10008' '10019'
 '10000' '10003' '10002' '10010' '10003' '10000' '1

In [145]:
type(model.predict_proba(X_test))

numpy.ndarray

In [146]:
model.predict_proba(X_test)[4]

array([2.56759020e-004, 1.68953296e-015, 1.24220887e-016, 9.99742852e-001,
       4.20588103e-167, 2.52998807e-007, 1.35995449e-007, 0.00000000e+000,
       7.22565181e-011, 2.41872211e-179, 9.07558568e-051, 5.25183103e-255,
       4.55446596e-138, 5.23802464e-020, 1.82700822e-091, 5.78217925e-017,
       1.50821844e-043, 1.88496592e-057, 1.08909920e-079, 1.37555026e-054])