In [60]:
import pandas as pd
import math
import string
from collections import Counter
import copy
import re
from nltk.stem.porter import PorterStemmer
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yilul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yilul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [61]:
# utility func

# preprocess text
def preprocess_text(text_str):
    # remove punctuation and numbers
    removed_punc_text = re.sub('[^A-Za-z]', ' ', text_str)
    removed_punc_text_list = removed_punc_text.split()
    # print(len(removed_punc_text_list)) #test

    # remove stop words
    stop_words = set(stopwords.words('english'))  
    removed_punc_stop_text_list = [w for w in removed_punc_text_list if not w in stop_words]  
    # print(len(removed_punc_stop_text_list)) #test

    # stemming (base words)
    stemmer = PorterStemmer()
    for i in range(len(removed_punc_stop_text_list)):
        removed_punc_stop_text_list[i] = stemmer.stem(removed_punc_stop_text_list[i])
    return removed_punc_stop_text_list



# calculate probability of each word in specific class
def getProbDic(vocalbulary_dic, class_dic, total_words_num_in_class, vocabulary_length):
    smooth_num = 0.01
    word_prob = {} # key: word, value: probability of word, log10 format
    for key, val in vocalbulary_dic.items():
        fre = class_dic.get(key)

        prob = 0 
        # if word doesn't exist in yes_class_vocabulary, add 0.01 smooth
        if fre == None:
            prob = smooth_num / (total_words_num_in_class + vocabulary_length * smooth_num)
        else:
            prob = fre + smooth_num / (total_words_num_in_class + vocabulary_length * smooth_num)
        word_prob[key] = math.log10(prob)
        # print(word_prob[key])
    return word_prob

def cal_total_words(class_dic):
    total_words = 0
    for key, val in class_dic.items():
        total_words += val
    return total_words

def get_score(prob_class, class_dic, document_list):
    score = prob_class
    for word in document_list:
        if class_dic.get(word) != None:
            score += class_dic.get(word)
    return score

In [62]:
# read specific columns from csv file
df = pd.read_csv("covid_training.csv", usecols = ['tweet_id','text', 'q1_label'])
# print(df)

# count the number of documents in each class (yes/no)
# read text in each row of training set
yes_class_num = 0
no_class_num = 0
total_class_num = 0

yes_class_text = ""
no_class_text = ""
all_text = "" # concatenate all texts to one string

for i, col in df.iterrows():
    if col['q1_label'].lower() == "yes":
        yes_class_num += 1
        yes_class_text += (col['text'].lower() + " ")
    else:
        no_class_num += 1
        no_class_text += (col['text'].lower() + " ")
    all_text += (col['text'].lower() + " ")
    total_class_num += 1

# print(all_text) #test

# probability of each class
prob_class_yes = math.log10(yes_class_num / total_class_num)
prob_class_no = math.log10(no_class_num / total_class_num)


########################## preprocess all_text

# map punctuation to space
# translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 

# generate vocabulary 
# count words frequency in all_text
# remove punctuation and quotation marks
# removed_punc_text = all_text.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")
all_text_list = preprocess_text(all_text)




################# original_vocabulary #################

# calculate words frequency for original_vocabulary
original_vocabulary = dict(Counter(all_text_list))
# print(original_vocabulary)

# count total vocabulary words in all_text based on original vocabulary
abs_ov_length = len(original_vocabulary)
# print(abs_ov_length)

# test
# print(original_vocabulary.get("the")) # should be None

# test
abs_ov_length_dupli = cal_total_words(original_vocabulary)
# print(abs_ov_length_dupli)




# count words frequency in yes_class_text based on original_vocabulary
# removed_punc_yes_class_text = yes_class_text.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")
yes_class_text_list = preprocess_text(yes_class_text)

# calculate words frequency
yes_class_vocabulary = dict(Counter(yes_class_text_list))
# print(yes_class_vocabulary)

total_words_in_yes_class_text_ov = cal_total_words(yes_class_vocabulary)
# print(total_words_in_yes_class_text_ov)

# calculate probability of each word in class yes
word_prob_yes_class = getProbDic(original_vocabulary, yes_class_vocabulary, total_words_in_yes_class_text_ov, abs_ov_length)
# print(word_prob_yes_class["the"]) # test



# count words frequency in no_class_text based on original_vocabulary
# removed_punc_no_class_text = no_class_text.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")
no_class_text_list = preprocess_text(no_class_text)

# calculate words frequency
no_class_vocabulary = dict(Counter(no_class_text_list))
# print(no_class_vocabulary)

total_words_in_no_class_text_ov = cal_total_words(no_class_vocabulary)
# print(total_words_in_no_class_text_ov)

# calculate probability of each word in class no
word_prob_no_class = getProbDic(original_vocabulary, no_class_vocabulary, total_words_in_no_class_text_ov, abs_ov_length)
# print(word_prob_no_class["the"]) # test



7820
5550
2270


In [64]:
################# filtered_vocabulary #################
# calculate words frequency for filtered_vocabulary
filtered_vocabulary = {key:val for key, val in original_vocabulary.items() if val != 1}
print(filtered_vocabulary)

# count total vocabulary words in all_text based on filtered vocabulary
abs_fv_length = len(filtered_vocabulary)

# test
abs_fv_length_dupli = cal_total_words(filtered_vocabulary)
print(abs_fv_length_dupli)

# calculate class_dic based on filtered_vocabulary
yes_class_vocabulary_fv = copy.deepcopy(yes_class_vocabulary)
for key in yes_class_vocabulary.keys():
    # remove all words that doesn't exist in filtered_vocabulary
    if filtered_vocabulary.get(key) == None:
        del yes_class_vocabulary_fv[key]


total_words_in_yes_class_text_fv = cal_total_words(yes_class_vocabulary_fv)
# print(total_words_in_yes_class_text_fv)

# calculate probability of each word in class yes
word_prob_yes_class_fv = getProbDic(filtered_vocabulary, yes_class_vocabulary_fv, total_words_in_yes_class_text_fv, abs_fv_length)



# calculate class_dic based on filtered_vocabulary
no_class_vocabulary_fv = copy.deepcopy(no_class_vocabulary)
for key in no_class_vocabulary.keys():
    # remove all words that doesn't exist in filtered_vocabulary
    if filtered_vocabulary.get(key) == None:
        del no_class_vocabulary_fv[key]


total_words_in_no_class_text_fv = cal_total_words(no_class_vocabulary_fv)
# print(total_words_in_no_class_text_fv)

# calculate probability of each word in class no
word_prob_no_class_fv = getProbDic(filtered_vocabulary, no_class_vocabulary_fv, total_words_in_no_class_text_fv, abs_fv_length)



{'american': 15, 'best': 6, 'way': 13, 'tell': 9, 'covid': 183, 'cough': 7, 'person': 13, 'face': 8, 'wait': 4, 'test': 60, 'result': 4, 'fuck': 12, 'pleas': 26, 'follow': 10, 'govern': 18, 'instruct': 2, 'knock': 2, 'done': 6, 'feel': 8, 'like': 34, 'keep': 7, 'lose': 3, 'time': 27, 'one': 24, 'two': 10, 'kid': 5, 'direct': 3, 'corona': 91, 'viru': 85, 'disappear': 2, 'april': 2, 'actual': 7, 'suck': 2, 'someon': 7, 'spent': 3, 'hour': 2, 'protect': 12, 'move': 2, 'critic': 4, 'ill': 7, 'patient': 18, 'around': 9, 'start': 9, 'peopl': 62, 'social': 9, 'distanc': 2, 'self': 10, 'isol': 5, 'http': 187, 'co': 186, 'door': 4, 'told': 9, 'stay': 10, 'free': 4, 'month': 8, 'employ': 2, 'need': 26, 'bori': 3, 'moral': 2, 'correct': 4, 'someth': 9, 'better': 4, 'never': 10, 'hear': 10, 'anyon': 6, 'end': 7, 'worker': 14, 'gener': 6, 'food': 4, 'employe': 2, 'even': 8, 'think': 14, 'deserv': 2, 'surviv': 4, 'dr': 5, 'past': 5, 'week': 28, 'treat': 4, 'pakistan': 4, 'knew': 6, 'ppe': 3, 'today'

In [63]:
################# original_vocabulary #################

# read specific columns from csv file
df_test = pd.read_csv("covid_test_public.csv", usecols = ['tweet_id','text', 'q1_label'])



# generate trace file based on model
f = open("_NB-BOW-OV.txt","w") 
for i, col in df_test.iterrows():
    f.write(str(col['tweet_id'])+ "  ") 

    # calculate the score in each class
    document = col['text'].lower()
    # print(document+"\n")
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 
    removed_punc_document = document.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")
    word_list = removed_punc_document.split()
    # print(word_list, "\n")

    yes_score = get_score(prob_class_yes, word_prob_yes_class, word_list)
    no_score = get_score(prob_class_no, word_prob_no_class, word_list)
    print("yes_score", yes_score)
    print("no_score", no_score)

    if yes_score > no_score:
        f.write("yes  "+ '{:.5E}'.format(yes_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "yes":
            f.write("correct\n")
        else:
            f.write("wrong\n")
    else:
        f.write("no  "+ '{:.5E}'.format(no_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "no":
            f.write("correct\n")
        else:
            f.write("wrong\n")

f.close()

yes_score 2.000595227066772
no_score -6.712027963585733
yes_score 4.092754624038468
no_score -2.693935262534704
yes_score 20.01355654987832
no_score -5.013177559247103
yes_score 11.187859863276786
no_score -20.443345793894615
yes_score 3.6210290335543154
no_score -8.879010891325217
yes_score 3.5513938891481356
no_score -8.657160503124809
yes_score -1.8243444561648654
no_score -8.754070894266306
yes_score 5.380699764521687
no_score -23.88332581321126
yes_score 7.320418108182008
no_score -16.400939758921538
yes_score -4.114044436741834
no_score -0.15422900819191643
yes_score 3.6948158029132103
no_score -9.356128994922122
yes_score 2.7337322280706915
no_score 1.8433224432088282
yes_score 13.634335639179945
no_score -2.5655629234679767
yes_score 7.018968684303738
no_score -0.7009390387307473
yes_score -3.653646575688291
no_score -8.352584581351149
yes_score 15.24469795273427
no_score -23.534880844647027
yes_score 0.3059887239641659
no_score 3.5422941491539763
yes_score 3.3894202098188564
n

In [66]:
################# filtered_vocabulary #################

# generate trace file based on model
f2 = open("_NB-BOW-FV.txt","w") 
for i, col in df_test.iterrows():
    f2.write(str(col['tweet_id'])+ "  ") 

    # calculate the score in each class
    document = col['text'].lower()
    # print(document+"\n")
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 
    removed_punc_document = document.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")
    word_list = removed_punc_document.split()
    # print(word_list, "\n")

    yes_score = get_score(prob_class_yes, word_prob_yes_class_fv, word_list)
    no_score = get_score(prob_class_no, word_prob_no_class_fv, word_list)
    print("yes_score", yes_score)
    print("no_score", no_score)

    if yes_score > no_score:
        f2.write("yes  "+ '{:.5E}'.format(yes_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "yes":
            f2.write("correct\n")
        else:
            f2.write("wrong\n")
    else:
        f2.write("no  "+ '{:.5E}'.format(no_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "no":
            f2.write("correct\n")
        else:
            f2.write("wrong\n")

f.close()

yes_score 7.746997602352106
no_score -1.2291975664797716
yes_score 4.092754779735124
no_score -2.572270496174484
yes_score 20.01355692675082
no_score -4.648181818376251
yes_score 11.187857083561257
no_score 1.122986994909096
yes_score 3.6210283706063997
no_score -3.396179529015383
yes_score 3.5513924488490973
no_score 2.0651735749774063
yes_score -1.7195948271552637
no_score -8.510741310438773
yes_score 5.380699888858835
no_score -18.0354997675694
yes_score 7.320416103322234
no_score -0.19577424188362857
yes_score 1.7371080776837673
no_score -0.03256576886294127
yes_score 3.694814300276452
no_score 1.3662055722105948
yes_score 2.7337322601142984
no_score 1.8433226569936079
yes_score 13.634335065635891
no_score 2.9172692786396155
yes_score 7.0189679237258495
no_score 4.660228277491001
yes_score -3.54889748683808
no_score -2.8697524806059462
yes_score 15.244697091342255
no_score -12.325887318670361
yes_score 0.4107381924861149
no_score 3.54229491310039
yes_score 3.3894204734005813
no_sco