In [36]:
# read the data:
data_file = "training.1600000.processed.noemoticon.csv"
from csv import reader
count, pos, neg = 0, [], []

words_to_use = ["love", "hate", "aww", "bad", "good", "great", "fuck", "jesus", "sucks", "suck", "pleased", 
                "fanntastic", "amazing", "enjoy", "excellent", "day", "today", "yesterday", "dad", "mom",
                "kid", "child", "year", "think", "way", "first", "well", "even", "new", "any", "most", "man", "boy",
                "woman", "girl", "time", "person", "sad", "happy", "yes", "no", "well", "yea", "yeah", "hell", "sure",
                "ok", "wife", "husband", "kill", "ill", "sick", "illness", "sickness", "death", "virus", "dead", "killed", 
                "accident"]

with open(data_file, 'r', encoding='mac_roman') as read_obj:
    csv_reader = reader(read_obj)
    for row in csv_reader:
        st = ([s.strip().lower() for s in row])
        cur_str = []
        # keep only texts that contain the above polarized words 
        # this makes the task ultra-simple but this is just for demo purposes :-) 
        found = False
        for w in st[-1].split(' '):
            if len(w) > 0:
                if w in words_to_use:
                    found = True
                    if w[0] not in ['@']:
                        cur_str.append(w.replace("!", "").replace("?", "").replace(".", ""))
        if found:
            if (st[0]) == '0':
                neg.append(" ".join(cur_str))
            if (st[0]) == '4':
                pos.append(" ".join(cur_str))

In [37]:
# extract term frequencies
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words={'english'})
# X contains n_documents rows and n_terms columns
X = vectorizer.fit_transform(neg + pos).toarray()
# list of terms
train_words = (vectorizer.get_feature_names())
# prior probabilities:
prior_1 = len(neg) / (len(pos) + len(neg))
prior_2 = len(pos) / (len(pos) + len(neg))
# get average frequencies per class --> this is actually the p(x_term | y) for each term and class
x1 = vectorizer.transform(neg)
p_x_y1 = x1.mean(axis=0).flatten()
x2 = vectorizer.transform(pos)
p_x_y2 = x2.mean(axis=0).flatten()

In [56]:
# print terms that are "almost" neutral, i.e. p(x_term | y = 0) / p(x_term | y = 1) is close to 1
thres = 1.5
print("\nNeutral words")
for iw, w in enumerate(train_words):
    if p_x_y1[0, iw] / p_x_y2[0, iw] < thres and p_x_y2[0, iw] / p_x_y1[0, iw] < thres:
        print(f'{w} \t p({w}|y=negative)={p_x_y1[0, iw]:.5f} \t p({w}|y=positive)={p_x_y2[0, iw]:.5f}')
print("\nNegative words")
for iw, w in enumerate(train_words):
    if p_x_y1[0, iw] / p_x_y2[0, iw] > thres:
        print(f'{w} \t p({w}|y=negative)={p_x_y1[0, iw]:.5f} \t p({w}|y=positive)={p_x_y2[0, iw]:.5f}')
print("\nPositive words")
for iw, w in enumerate(train_words):
    if p_x_y2[0, iw] / p_x_y1[0, iw] > thres:
        print(f'{w} \t p({w}|y=negative)={p_x_y1[0, iw]:.5f} \t p({w}|y=positive)={p_x_y2[0, iw]:.5f}')


Neutral words
any 	 p(any|y=negative)=0.02903 	 p(any|y=positive)=0.01970
boy 	 p(boy|y=negative)=0.00537 	 p(boy|y=positive)=0.00531
child 	 p(child|y=negative)=0.00129 	 p(child|y=positive)=0.00091
day 	 p(day|y=negative)=0.09433 	 p(day|y=positive)=0.10498
first 	 p(first|y=negative)=0.02129 	 p(first|y=positive)=0.02914
girl 	 p(girl|y=negative)=0.01017 	 p(girl|y=positive)=0.01224
husband 	 p(husband|y=negative)=0.00204 	 p(husband|y=positive)=0.00185
ill 	 p(ill|y=negative)=0.01121 	 p(ill|y=positive)=0.00843
kid 	 p(kid|y=negative)=0.00263 	 p(kid|y=positive)=0.00257
man 	 p(man|y=negative)=0.01480 	 p(man|y=positive)=0.01112
mom 	 p(mom|y=negative)=0.01220 	 p(mom|y=positive)=0.00934
most 	 p(most|y=negative)=0.01112 	 p(most|y=positive)=0.01368
ok 	 p(ok|y=negative)=0.01333 	 p(ok|y=positive)=0.01620
person 	 p(person|y=negative)=0.00553 	 p(person|y=positive)=0.00548
think 	 p(think|y=negative)=0.06895 	 p(think|y=positive)=0.05996
time 	 p(time|y=negative)=0.06939 	 p(time|

In [39]:
def classify(st, f_class1, f_class2, vocabulary, prior1, prior2):
    words = st.split(' ')
    prob1, prob2 = prior1, prior2
    for w in words:
        w2 = w
        w2 = w2.replace("!", "").replace("?", "").replace(".", "").replace(",", "")
        if w2 in train_words:
            prob1 *= f_class1[0, vocabulary.index(w2)]
            prob2 *= f_class2[0, vocabulary.index(w2)]
    probs = np.array([prob1, prob2])
    probs /= probs.sum()
    return probs

In [44]:
p = classify("ok", p_x_y1, p_x_y2, train_words, prior_1, prior_2)
print(np.argmax(p), p[np.argmax(p)])

1 0.5488460676674849


In [45]:
p = classify("i am so sad this is bad news", p_x_y1, p_x_y2, train_words, prior_1, prior_2)
print(np.argmax(p), p[np.argmax(p)])

0 0.9886301550999419


In [46]:
p = classify("this is fucking great. this is the most amazing news i've heard in a while, i am so happy", p_x_y1, p_x_y2, train_words, prior_1, prior_2)
print(np.argmax(p), p[np.argmax(p)])

1 0.9804085295673374


In [47]:
p = classify("daskhj adsjkh ", p_x_y1, p_x_y2, train_words, prior_1, prior_2)
print(np.argmax(p), p[np.argmax(p)])

1 0.5003309143494389
