In [1]:
import pandas as pd
import math
import string
from collections import Counter
import copy
import re
from nltk.stem.porter import PorterStemmer
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

# nltk.download('stopwords')
# from nltk.corpus import stopwords 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shiyu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# utility func

# preprocess text
def preprocess_text(text_str):
    # remove punctuation and numbers
    removed_punc_text = re.sub('[^A-Za-z]', ' ', text_str)
    removed_punc_text_list = removed_punc_text.split()
#     print(len(removed_punc_text_list)) #test

    # remove stop words
#     stop_words = set(stopwords.words('english'))  
#     removed_punc_stop_text_list = [w for w in removed_punc_text_list if not w in stop_words]  
    # print(len(removed_punc_stop_text_list)) #test

    # stemming (base words)
    stemmer = PorterStemmer()
    for i in range(len(removed_punc_text_list)):
        removed_punc_text_list[i] = stemmer.stem(removed_punc_text_list[i])
    return removed_punc_text_list



# calculate probability of each word in specific class
def getProbDic(vocalbulary_dic, class_dic, total_words_num_in_class, vocabulary_length):
    smooth_num = 0.01
    word_prob = {} # key: word, value: probability of word, log10 format
    for key, val in vocalbulary_dic.items():
        fre = class_dic.get(key)
        prob = 0 
        # if word doesn't exist in yes_class_vocabulary, add 0.01 smooth
        if fre == None:
            prob = smooth_num / (total_words_num_in_class + vocabulary_length * smooth_num)
        else:
            prob = fre + smooth_num / (total_words_num_in_class + vocabulary_length * smooth_num)
        word_prob[key] = math.log10(prob)
        #print(word_prob[key])
    return word_prob

def cal_total_words(class_dic):
    total_words = 0
    for key, val in class_dic.items():
        total_words += val
    return total_words

def get_score(prob_class, class_dic, document_list):
    score = prob_class
    for word in document_list:
        if class_dic.get(word) != None:
            score += class_dic.get(word)
    return score

In [3]:
# read specific columns from csv file
df = pd.read_csv("covid_training.csv", usecols = ['tweet_id','text', 'q1_label'])
# print(df)

# count the number of documents in each class (yes/no)
# read text in each row of training set
yes_class_num = 0
no_class_num = 0
total_class_num = 0

yes_class_text = ""
no_class_text = ""
all_text = "" # concatenate all texts to one string

for i, col in df.iterrows():
    if col['q1_label'].lower() == "yes":
        yes_class_num += 1
        yes_class_text += (col['text'].lower() + " ")
    else:
        no_class_num += 1
        no_class_text += (col['text'].lower() + " ")
    all_text += (col['text'].lower() + " ")
    total_class_num += 1

# print(all_text) #test

# probability of each class
prob_class_yes = math.log10(yes_class_num / total_class_num)
prob_class_no = math.log10(no_class_num / total_class_num)

print(yes_class_num)
print(no_class_num)
########################## preprocess all_text

# map punctuation to space
# translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 

# generate vocabulary 
# count words frequency in all_text
# remove punctuation and quotation marks
# removed_punc_text = all_text.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")
all_text_list = preprocess_text(all_text)




################# original_vocabulary #################

# calculate words frequency for original_vocabulary
original_vocabulary = dict(Counter(all_text_list))
# print(original_vocabulary)

# count total vocabulary words in all_text based on original vocabulary
abs_ov_length = len(original_vocabulary)
# print(abs_ov_length)

# test
# print(original_vocabulary.get("the")) # should be None

# test
abs_ov_length_dupli = cal_total_words(original_vocabulary)
# print(abs_ov_length_dupli)




# count words frequency in yes_class_text based on original_vocabulary
# removed_punc_yes_class_text = yes_class_text.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")
yes_class_text_list = preprocess_text(yes_class_text)

# calculate words frequency
yes_class_vocabulary = dict(Counter(yes_class_text_list))
# print(yes_class_vocabulary)

total_words_in_yes_class_text_ov = cal_total_words(yes_class_vocabulary)
# print(total_words_in_yes_class_text_ov)

# calculate probability of each word in class yes
word_prob_yes_class = getProbDic(original_vocabulary, yes_class_vocabulary, total_words_in_yes_class_text_ov, abs_ov_length)
# print(word_prob_yes_class["the"]) # test



# count words frequency in no_class_text based on original_vocabulary
# removed_punc_no_class_text = no_class_text.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")
no_class_text_list = preprocess_text(no_class_text)

# calculate words frequency
no_class_vocabulary = dict(Counter(no_class_text_list))
# print(no_class_vocabulary)

total_words_in_no_class_text_ov = cal_total_words(no_class_vocabulary)
# print(total_words_in_no_class_text_ov)

# calculate probability of each word in class no
word_prob_no_class = getProbDic(original_vocabulary, no_class_vocabulary, total_words_in_no_class_text_ov, abs_ov_length)
# print(word_prob_no_class["the"]) # test



247
152


In [4]:
################# filtered_vocabulary #################
# calculate words frequency for filtered_vocabulary
filtered_vocabulary = {key:val for key, val in original_vocabulary.items() if val != 1}
print(filtered_vocabulary)

# count total vocabulary words in all_text based on filtered vocabulary
abs_fv_length = len(filtered_vocabulary)

# test
abs_fv_length_dupli = cal_total_words(filtered_vocabulary)
print(abs_fv_length_dupli)

# calculate class_dic based on filtered_vocabulary
yes_class_vocabulary_fv = copy.deepcopy(yes_class_vocabulary)
for key in yes_class_vocabulary.keys():
    # remove all words that doesn't exist in filtered_vocabulary
    if filtered_vocabulary.get(key) == None:
        del yes_class_vocabulary_fv[key]


total_words_in_yes_class_text_fv = cal_total_words(yes_class_vocabulary_fv)
# print(total_words_in_yes_class_text_fv)

# calculate probability of each word in class yes
word_prob_yes_class_fv = getProbDic(filtered_vocabulary, yes_class_vocabulary_fv, total_words_in_yes_class_text_fv, abs_fv_length)



# calculate class_dic based on filtered_vocabulary
no_class_vocabulary_fv = copy.deepcopy(no_class_vocabulary)
for key in no_class_vocabulary.keys():
    # remove all words that doesn't exist in filtered_vocabulary
    if filtered_vocabulary.get(key) == None:
        del no_class_vocabulary_fv[key]


total_words_in_no_class_text_fv = cal_total_words(no_class_vocabulary_fv)
# print(total_words_in_no_class_text_fv)

# calculate probability of each word in class no
word_prob_no_class_fv = getProbDic(filtered_vocabulary, no_class_vocabulary_fv, total_words_in_no_class_text_fv, abs_fv_length)



{'for': 125, 'the': 461, 'american': 15, 'best': 6, 'way': 13, 'to': 338, 'tell': 9, 'if': 43, 'you': 120, 'have': 66, 'covid': 183, 'is': 204, 'cough': 7, 'in': 187, 'a': 279, 'person': 13, 's': 122, 'face': 8, 'and': 228, 'wait': 4, 'their': 28, 'test': 60, 'result': 4, 'thi': 128, 'fuck': 12, 'can': 49, 'y': 16, 'all': 55, 'pleas': 26, 'just': 38, 'follow': 10, 'govern': 18, 'instruct': 2, 'so': 28, 'we': 81, 'knock': 2, 'out': 26, 'be': 92, 'done': 6, 'i': 116, 'feel': 8, 'like': 34, 'that': 117, 'keep': 7, 'lose': 3, 'more': 20, 'time': 27, 'becaus': 20, 'one': 24, 'or': 34, 'two': 10, 'kid': 5, 't': 275, 'direct': 3, 'no': 32, 'but': 41, 'corona': 91, 'viru': 85, 'disappear': 2, 'befor': 5, 'april': 2, 'actual': 7, 'suck': 2, 'of': 231, 'someon': 7, 'who': 43, 'spent': 3, 'hour': 2, 'protect': 12, 'move': 2, 'critic': 4, 'ill': 7, 'patient': 18, 'around': 9, 'are': 77, 'onli': 17, 'at': 53, 'start': 9, 'am': 8, 'peopl': 62, 'do': 32, 'social': 9, 'distanc': 2, 'self': 10, 'isol':

In [5]:
################# original_vocabulary #################

# read specific columns from csv file
df_test = pd.read_csv("covid_test_public.csv", usecols = ['tweet_id','text', 'q1_label'])

total_instances = 0
num_of_correct_prediction = 0
num_of_wrong_prediction = 0
num_of_yes_predicted = 0
num_of_no_predicted = 0
positive_yes = 0
positive_no = 0
true_yes = 0
true_no = 0


# generate trace file based on model
f = open("_NB-BOW-OV.txt","w") 
for i, col in df_test.iterrows():
    f.write(str(col['tweet_id'])+ "  ") 

    # calculate the score in each class
    document = col['text'].lower()
    word_list = preprocess_text(document)
    total_instances += 1
    # print(word_list, "\n")

    yes_score = get_score(prob_class_yes, word_prob_yes_class, word_list)
    no_score = get_score(prob_class_no, word_prob_no_class, word_list)
    print("yes_score", yes_score)
    print("no_score", no_score)

    if yes_score > no_score:
        num_of_yes_predicted += 1
        f.write("yes  "+ '{:.5E}'.format(yes_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "yes":
            f.write("correct\n")
            num_of_correct_prediction += 1
            positive_yes += 1
            true_yes +=1
        else:
            f.write("wrong\n")
            num_of_wrong_prediction += 1
            true_no += 1
    else:
        num_of_no_predicted += 1
        f.write("no  "+ '{:.5E}'.format(no_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "no":
            f.write("correct\n")
            num_of_correct_prediction += 1
            positive_no += 1
            true_no += 1
        else:
            f.write("wrong\n")
            num_of_wrong_prediction += 1
            true_yes += 1

f.close()

# print(total_instances)
# print(num_of_correct_prediction)
# print(num_of_wrong_prediction)
# print(num_of_yes_predicted)
# print(num_of_no_predicted)
# print(" ")
# print(positive_yes)
# print(positive_no)
# print(" ")
# print(true_yes)
# print(true_no)


yes_score 50.76936429128407
no_score 22.007053941359086
yes_score 27.096460943270895
no_score 8.818093223773866
yes_score 63.076108958394336
no_score 16.84303550676065
yes_score 27.717620484658678
no_score -15.157223130234389
yes_score 29.07271844963968
no_score -4.745997818985202
yes_score 28.77961663251946
no_score 14.730287692523168
yes_score 20.09783331813091
no_score -5.423973736058697
yes_score 53.30029722082114
no_score 4.563222865601342
yes_score 38.44687400285244
no_score 16.564038513660467
yes_score 21.724014337543313
no_score 13.495268955542207
yes_score 22.310439221097624
no_score 3.2146343330615688
yes_score 10.194101321461817
no_score 7.922919980481746
yes_score 71.3501505966141
no_score 39.00457824459602
yes_score 26.44591075546791
no_score 1.082515834298138
yes_score 5.67661371703193
no_score -1.2429026192451256
yes_score 67.57240398625967
no_score -6.88071325628575
yes_score 19.46766377516145
no_score -1.327228659236611
yes_score 41.82584147209753
no_score -8.575755101

In [6]:
# Outputting evaluation file for original vocabulary

accuracy = round(num_of_correct_prediction/total_instances,4)
precision_yes = round(positive_yes/num_of_yes_predicted,4)
precision_no = round(positive_no/num_of_no_predicted,4)
recall_yes = round(positive_yes/true_yes,4)
recall_no = round(positive_no/true_no,4)
f1_yes = round(((1**2+1)*precision_yes*recall_yes)/(1**2*precision_yes+recall_yes),4)
f1_no = round(((1**2+1)*precision_no*recall_no)/(1**2*precision_no+recall_no),4)

eval = open("eval_NB-BOW-OV.txt","w")
eval.write(str(accuracy) + "\n")
eval.write(str(precision_yes) + "  " +str(precision_no)+"\n")
eval.write(str(recall_yes) + "  " +str(recall_no)+"\n")
eval.write(str(f1_yes) + "  " +str(f1_no)+"\n")
eval.close()

In [43]:
################# filtered_vocabulary #################

total_instances = 0
num_of_correct_prediction = 0
num_of_wrong_prediction = 0
num_of_yes_predicted = 0
num_of_no_predicted = 0
positive_yes = 0
positive_no = 0
true_yes = 0
true_no = 0

# generate trace file based on model
f2 = open("_NB-BOW-FV.txt","w") 
for i, col in df_test.iterrows():
    f2.write(str(col['tweet_id'])+ "  ") 

    # calculate the score in each class
    document = col['text'].lower()
    word_list = preprocess_text(document)
    # print(word_list, "\n")

    yes_score = get_score(prob_class_yes, word_prob_yes_class_fv, word_list)
    no_score = get_score(prob_class_no, word_prob_no_class_fv, word_list)
    print("yes_score", yes_score)
    print("no_score", no_score)

    if yes_score > no_score:
        num_of_yes_predicted += 1
        f2.write("yes  "+ '{:.5E}'.format(yes_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "yes":
            f2.write("correct\n")
        else:
            f2.write("wrong\n")
    else:
        num_of_no_predicted += 1
        f2.write("no  "+ '{:.5E}'.format(no_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "no":
            f2.write("correct\n")
        else:
            f2.write("wrong\n")

f.close()

yes_score 50.76936411365173
no_score 27.745082571459726
yes_score 27.096461044413562
no_score 8.94681808517889
yes_score 69.04785988748006
no_score 33.86403148863605
yes_score 27.717618929727458
no_score 7.408714539480215
yes_score 29.07271806287112
no_score 1.0563924633099118
yes_score 34.75136797707873
no_score 25.948893562786832
yes_score 20.157201895551463
no_score -5.166524281541942
yes_score 53.3002972625862
no_score 10.494338761829816
yes_score 38.50624181073952
no_score 27.847008106096222
yes_score 27.755134564959896
no_score 19.168934049984383
yes_score 28.282190495021865
no_score 14.497602222117537
yes_score 10.194101338532587
no_score 7.922920074939068
yes_score 71.35015035162105
no_score 44.74260680205152
yes_score 26.445909979810992
no_score 12.365484870746963
yes_score 5.735981912368849
no_score 4.430763144035014
yes_score 67.57240357733411
no_score 4.852792442360719
yes_score 19.527032357282497
no_score -1.1341411094257863
yes_score 41.82583982248652
no_score 14.05454447

In [None]:
# Outputting evaluation files for filterd vocabulary


