In [1]:
import pandas as pd
import csv, sys, re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import tree, metrics
from sklearn.utils import shuffle
from sklearn.dummy import DummyClassifier

# Load the tab-separated tweet file. The file is read without a header row,
# so column names are supplied here; every column is kept as a string, and
# QUOTE_NONE makes quote characters inside tweets count as literal text.
df = pd.read_csv(
    'dataset.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    dtype=str,
    header=None,
    names=["instance", "text", "id", "sentiment", "is_sarcastic"],
)

In [2]:
# Preview the first ten rows in file order (shuffle(df) is not applied).
df.head(n=10)

Unnamed: 0,instance,text,id,sentiment,is_sarcastic
0,1,Liberals might not be talking about #Parakeeli...,10000,negative,False
1,2,".@TurnbullMalcolm to CFA volunteers: ""Your sel...",10006,neutral,False
2,3,Peter Dutton's been “An Outstanding Immigrati...,10008,neutral,False
3,4,"If we are to respect women, we must respect S9...",10008,negative,False
4,5,#auspol @TurnbullMalcolm @Barnaby_Joyce We nee...,10000,negative,False
5,6,@ABC24 @PM. Chaotic hung Govt? Most chaos was ...,10006,negative,False
6,7,Fascist class warfare killings has only just b...,10006,negative,False
7,8,"Why the #LNP will not stand up to the banks, d...",10002,negative,False
8,9,?Turnbull couldn't find a real tradie- now con...,10001,negative,False
9,10,"While we're asking about #BorderFarce fiasco, ...",10001,negative,False


In [3]:
# Look at one raw tweet (row label 2) to see what needs cleaning.
df.loc[2, 'text']

"Peter Dutton's been  “An Outstanding Immigration Minister” https://t.co/h42MHZERwS #auspol #immigration"

In [4]:
# Class balance of the target column.
df["sentiment"].value_counts()

negative    1294
neutral      553
positive     153
Name: sentiment, dtype: int64

In [5]:
# How many rows carry each value of the `id` column.
df["id"].value_counts()

10003    358
10000    244
10005    194
10006    189
10008    163
10001    140
10002    130
10015    119
10013    104
10016     59
10010     56
10019     52
10017     47
10018     38
10014     29
10012     25
10004     17
10009     16
10011     13
10007      7
Name: id, dtype: int64

In [6]:
# Collect the tweet texts into a NumPy string array in a single conversion.
# (Growing an array with np.append inside a loop copies the whole array on
# every iteration, which is quadratic in the number of rows.)
text_data = np.array(df["text"], dtype=str)
text_data

array(['Liberals might not be talking about #Parakeelia MSM might not be talking about #Parakeelia Voters are very interested in #Parakeelia #auspol',
       '.@TurnbullMalcolm to CFA volunteers: "Your selflessness is being trampled on." #ausvotes #springst @abcnews https://t.co/ChVU3Lh1bV',
       "Peter Dutton's been  “An Outstanding Immigration Minister” https://t.co/h42MHZERwS #auspol #immigration",
       ...,
       'What happenned to our whistle-blower legislation??? #Auspol #LNPFail #Ausvotes https://t.co/NiKy7DHsEs',
       '#auspol  #Election2016  Turnbull too poor to face voters on #quanda @quanda   Shame on you  Rich cayman islands  millionaire',
       '@brucerossbrc What is the going rate for LNP kick-backs!? #Parakeelia #AUSvotes #AUSpol'],
      dtype='<U149')

In [10]:
def remove_URL(sample):
    """Return `sample` with every run starting at "http" (up to the next whitespace) deleted."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", sample)


def remove_punctuation(sample):
    """Strip punctuation from a sample string.

    Word characters, whitespace and the symbols & # @ $ % _ are kept;
    every other character is deleted.
    """
    unwanted = re.compile(r'[^\w\s\&\#\@\$\%\_]')
    return unwanted.sub('', sample)

def myPreprocessor(sample):
    """Clean a raw sample: drop URLs first, then strip punctuation from what remains."""
    return remove_punctuation(remove_URL(sample))


def myTokenizer(sample):
    """Split a preprocessed sample into tokens.

    The string is split on any run of whitespace (spaces, tabs, newlines),
    and tokens shorter than two characters are discarded.

    Parameters
    ----------
    sample : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens of length >= 2, in their original order.
    """
    # str.split() with no argument splits on any whitespace run, so tokens
    # separated by a tab or newline are no longer kept as one word.
    return [token for token in sample.split() if len(token) >= 2]

In [11]:
# Sanity check: cleaned version of the tweet at row label 2.
myPreprocessor(df.loc[2, "text"])

'Peter Duttons been  An Outstanding Immigration Minister  #auspol #immigration'

In [12]:
# Sanity check: tokens produced from the cleaned tweet at row label 2.
myTokenizer(myPreprocessor(df.loc[2, "text"]))

['Peter',
 'Duttons',
 'been',
 'An',
 'Outstanding',
 'Immigration',
 'Minister',
 '#auspol',
 '#immigration']

In [13]:
# Another raw tweet (row label 5), this one with heavier punctuation.
df.loc[5, "text"]

'@ABC24 @PM. Chaotic hung Govt? Most chaos was by LNP .. Interrupting Q time,disgusting treatment of P Slipper, and female PM etc.#ausvotes'

In [14]:
# Bag-of-words model over the tweets: the custom preprocessor and tokenizer
# are used, case is preserved, and the vocabulary is capped at 200 features.
count = CountVectorizer(
    preprocessor=myPreprocessor,
    lowercase=False,
    tokenizer=myTokenizer,
    max_features=200,
)
bag_of_words = count.fit_transform(text_data)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back on older versions; list() keeps the printed
# form a plain Python list either way.
if hasattr(count, "get_feature_names_out"):
    feature_names = list(count.get_feature_names_out())
else:
    feature_names = count.get_feature_names()
print(feature_names)
print(count.vocabulary_)

['#AFPRaids', '#AusPol', '#AusVotes', '#Auspol', '#Ausvotes', '#Brexit', '#Election2016', '#LNP', '#LNPfail', '#Labor', '#Medicare', '#NBN', '#Parakeelia', '#auspol', '#ausvotes', '#ausvotes2016', '#insiders', '#npc', '#qanda', '@AustralianLabor', '@LiberalAus', '@RichardDiNatale', '@TurnbullMalcolm', '@billshortenmp', 'AFP', 'ALP', 'Abbott', 'Australia', 'Australian', 'Australians', 'Bill', 'Coalition', 'Dutton', 'Government', 'Govt', 'Greens', 'How', 'If', 'Its', 'Joyce', 'LNP', 'Labor', 'Labors', 'Liberal', 'Liberals', 'Libs', 'Malcolm', 'Medicare', 'Morrison', 'NBN', 'No', 'Not', 'PM', 'Party', 'Peter', 'Shorten', 'So', 'THE', 'The', 'This', 'Turnbull', 'Turnbulls', 'VOTE', 'Vote', 'We', 'What', 'Why', 'You', 'about', 'after', 'again', 'all', 'amp', 'an', 'and', 'any', 'are', 'as', 'at', 'back', 'be', 'because', 'been', 'big', 'boats', 'budget', 'business', 'but', 'by', 'campaign', 'can', 'cant', 'care', 'could', 'cut', 'cuts', 'did', 'do', 'dont', 'down', 'economic', 'economy', 'e

In [19]:
# Convert the sparse document-term matrix to a dense array.
X = bag_of_words.toarray()
# Show the count vector of the first document.
X[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int64)

In [21]:
#target classes
Y = np.array([])
for text in df.sentiment:
    Y = np.append(Y,text)
Y

array(['negative', 'neutral', 'neutral', ..., 'negative', 'negative',
       'negative'], dtype='<U32')

In [22]:
# Positional hold-out split: the first 1500 rows train, the remainder tests.
X_train, X_test = X[:1500], X[1500:]
y_train, y_test = Y[:1500], Y[1500:]

In [23]:
# Main model: an entropy-criterion decision tree with a fixed seed.
# (Alternatives left disabled: MultinomialNB(), BernoulliNB().)
clf = tree.DecisionTreeClassifier(random_state=0, criterion='entropy')
model = clf.fit(X_train, y_train)

# Baseline for comparison: always predict the most frequent training class.
clf2 = DummyClassifier(strategy='most_frequent')
model2 = clf2.fit(X_train, y_train)

In [25]:
# Predictions on the held-out rows: baseline first, then the decision tree.
predicted_y2 = model2.predict(X_test)
predicted_y = model.predict(X_test)
# Display the decision tree's predictions.
predicted_y

array(['negative', 'negative', 'neutral', 'negative', 'neutral',
       'neutral', 'negative', 'neutral', 'positive', 'neutral', 'neutral',
       'neutral', 'negative', 'negative', 'negative', 'negative',
       'negative', 'neutral', 'negative', 'neutral', 'positive',
       'negative', 'neutral', 'negative', 'neutral', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'neutral', 'positive', 'neutral', 'negative',
       'neutral', 'negative', 'negative', 'negative', 'neutral',
       'negative', 'positive', 'neutral', 'negative', 'negative',
       'neutral', 'negative', 'negative', 'negative', 'neutral',
       'neutral', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'neutral',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'neutral', 'negative', 'negative', 'neutral', 'neutral',
      

In [26]:
# Accuracy of the decision tree, then of the most-frequent-class baseline.
for predictions in (predicted_y, predicted_y2):
    print(accuracy_score(y_test, predictions))

0.644
0.67


In [27]:
# The F1 scores and the report only cover labels that were actually predicted.
predicted_labels = np.unique(predicted_y)

# Raw truth vs. prediction, followed by per-class probabilities.
print(y_test, predicted_y)
print(model.predict_proba(X_test))

# Accuracy, then micro-averaged precision and recall.
print(accuracy_score(y_test, predicted_y))
for scorer in (precision_score, recall_score):
    print(scorer(y_test, predicted_y, average='micro'))

# F1 under micro and macro averaging.
for averaging in ('micro', 'macro'):
    print(f1_score(y_test, predicted_y, average=averaging, labels=predicted_labels))

# Per-class breakdown.
print(classification_report(y_test, predicted_y, output_dict=False, labels=predicted_labels))

['neutral' 'negative' 'negative' 'positive' 'negative' 'positive'
 'negative' 'positive' 'negative' 'positive' 'positive' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'neutral'
 'negative' 'negative' 'negative' 'negative' 'neutral' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'neutral'
 'negative' 'negative' 'neutral' 'negative' 'negative' 'negative'
 'neutral' 'negative' 'negative' 'negative' 'neutral' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'neutral' 'negative' 'positive' 'negative'
 'positive' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'neutral' 'negative' 'negative' 'negative' 'neutral'
 'negative' 'negative' 'neutral' 'negative' 'neutral' 'neutral' 'neutral'
 'negative' 'neutral' 'neutral' 'negative' 'neutral' 'negative' 'negative'
 'negative' 'neutral' 'positive' 'negative' 'positive' 'negative'
 'negative' 'positive' 'negative' 'negative' 'negative' 'n