In [195]:
import pandas as pd
import csv, sys, re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import tree, metrics
from sklearn.utils import shuffle

df = pd.read_csv('dataset.tsv', sep='\t', quoting=csv.QUOTE_NONE , dtype= str, header = None)

In [196]:
df=shuffle(df)
df.head(3)

Unnamed: 0,0,1,2,3,4
1714,1715,The Libs complaining about a Medicare scare ca...,10005,negative,False
1608,1609,Labor message to Asylum Seekers 'Come on in' #...,10008,neutral,False
1623,1624,@Greens @RichardDiNatale the economy of Tasman...,10003,negative,False


In [197]:
text_data = np.array([])

In [198]:
for text in df[1]:
    text_data = np.append(text_data,text)

In [199]:
def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", sample)


def remove_punctuation(sample):
    """Remove punctuations from a sample string"""
    punctuations = '''!"&'()*+,-./:;<=>?[\]^`{|}~'''
    no_punct = ""
    for char in sample:
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct


def myPreprocessor(sample):
    """Customized preprocessor"""
    sample = remove_URL(sample)
    sample = remove_punctuation(sample)
    return sample


def myTokenizer(sample):
    """Customized tokenizer"""
    new_words = []
    words = sample.split(' ')
    new_words = [word for word in words if len(word) >= 2]
    return new_words

In [215]:
count = CountVectorizer(preprocessor=myPreprocessor, lowercase = True, tokenizer=myTokenizer ,max_features = 200)
bag_of_words = count.fit_transform(text_data)
print(count.get_feature_names())

['#AFPRaids', '#AusPol', '#AusVotes', '#Auspol', '#Ausvotes', '#Brexit', '#Election2016', '#LNP', '#LNPfail', '#Labor', '#Medicare', '#NBN', '#Parakeelia', '#auspol', '#ausvotes', '#ausvotes2016', '#insiders', '#npc', '#qanda', '@AustralianLabor', '@LiberalAus', '@RichardDiNatale', '@TurnbullMalcolm', '@billshortenmp', 'AFP', 'ALP', 'Abbott', 'Australia', 'Australian', 'Australians', 'Bill', 'Coalition', 'Dutton', 'Government', 'Govt', 'Greens', 'How', 'If', 'Its', 'Joyce', 'LNP', 'Labor', 'Labors', 'Liberal', 'Liberals', 'Libs', 'Malcolm', 'Medicare', 'Morrison', 'NBN', 'No', 'Not', 'PM', 'Party', 'Peter', 'Shorten', 'So', 'THE', 'The', 'This', 'Turnbull', 'Turnbulls', 'VOTE', 'Vote', 'We', 'What', 'Why', 'You', 'about', 'after', 'again', 'all', 'amp', 'an', 'and', 'any', 'are', 'as', 'at', 'back', 'be', 'because', 'been', 'big', 'boats', 'budget', 'business', 'but', 'by', 'campaign', 'can', 'cant', 'care', 'could', 'cut', 'cuts', 'did', 'do', 'dont', 'down', 'economic', 'economy', 'e

In [216]:
print(count.vocabulary_)

{'The': 58, 'Libs': 45, 'about': 68, 'Medicare': 47, 'campaign': 89, 'is': 125, 'like': 133, 'Turnbull': 60, '#ausvotes': 14, 'Labor': 41, 'to': 178, 'on': 142, 'in': 123, '#auspol': 13, '@RichardDiNatale': 21, 'the': 170, 'economy': 101, 'of': 141, 'what': 188, 'when': 189, 'have': 117, 'THE': 57, 'VOTE': 62, 'LNP': 40, 'up': 180, 'how': 121, 'boats': 84, 'why': 191, 'Abbott': 26, 'was': 186, 'by': 88, 'says': 161, 'and': 74, 'with': 193, 'economic': 100, '#npc': 17, 'Joyce': 39, 'support': 164, 'for': 105, 'any': 75, '#Auspol': 3, 'its': 127, 'will': 192, 'it': 126, 'people': 152, 'his': 120, 'need': 137, 'put': 157, 'last': 132, 'Liberals': 44, 'from': 106, 'after': 69, 'amp': 72, '@LiberalAus': 20, 'plans': 154, 'do': 97, 'you': 198, 'our': 146, 'Australians': 29, 'want': 184, 'an': 73, 'You': 67, 'cant': 91, '#Election2016': 6, 'vote': 183, 'their': 171, 'Greens': 35, 'dont': 98, 'Malcolm': 46, 'plebiscite': 155, 'this': 176, 'What': 65, 'cut': 94, 'via': 182, 'out': 147, 'budget'

In [217]:
X = bag_of_words.toarray()

In [218]:
#target classes
Y = np.array([])
for text in df[2]:
    Y = np.append(Y,text)

In [219]:
X_train = X[:1500]
X_test = X[1500:]
y_train = Y[:1500]
y_test = Y[1500:]

In [220]:
#clf = MultinomialNB()
#clf = BernoulliNB()
clf = tree.DecisionTreeClassifier(criterion='entropy',random_state=0) 
model = clf.fit(X_train, y_train)

In [221]:
predicted_y = model.predict(X_test)

In [222]:
print(accuracy_score(y_test, predicted_y))

0.236


In [208]:
print(y_test, predicted_y)
print(model.predict_proba(X_test))
print(accuracy_score(y_test, predicted_y))
print(precision_score(y_test, predicted_y, average='micro'))
print(recall_score(y_test, predicted_y, average='micro'))
print(f1_score(y_test, predicted_y, average='micro'))
print(f1_score(y_test, predicted_y, average='macro'))
print(classification_report(y_test, predicted_y,output_dict= False))

['10008' '10018' '10013' '10005' '10002' '10007' '10014' '10000' '10009'
 '10011' '10013' '10015' '10008' '10002' '10002' '10016' '10002' '10019'
 '10010' '10015' '10008' '10005' '10006' '10006' '10005' '10013' '10005'
 '10005' '10000' '10016' '10000' '10000' '10005' '10005' '10003' '10008'
 '10006' '10007' '10005' '10003' '10003' '10006' '10003' '10000' '10000'
 '10003' '10004' '10016' '10003' '10003' '10002' '10015' '10013' '10008'
 '10000' '10001' '10006' '10000' '10012' '10006' '10001' '10003' '10008'
 '10008' '10008' '10004' '10005' '10015' '10015' '10003' '10003' '10006'
 '10000' '10014' '10003' '10003' '10003' '10000' '10014' '10005' '10008'
 '10008' '10013' '10002' '10002' '10003' '10000' '10013' '10006' '10000'
 '10006' '10005' '10000' '10000' '10006' '10003' '10000' '10001' '10003'
 '10000' '10001' '10009' '10003' '10016' '10000' '10008' '10015' '10006'
 '10001' '10003' '10008' '10005' '10000' '10000' '10006' '10000' '10010'
 '10006' '10000' '10001' '10008' '10015' '10001' '1

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
