In [11]:
import pandas as pd
import math
import string
from collections import Counter
import copy

In [2]:
# utility func

# calculate probability of each word in specific class
def getProbDic(vocalbulary_dic, class_dic, total_words_num_in_class, vocabulary_length):
    smooth_num = 0.01
    word_prob = {} # key: word, value: probability of word, log10 format
    for key, val in vocalbulary_dic.items():
        fre = class_dic.get(key)

        prob = 0 
        # if word doesn't exist in yes_class_vocabulary, add 0.01 smooth
        if fre == None:
            prob = smooth_num / (total_words_num_in_class + vocabulary_length * smooth_num)
        else:
            prob = fre / total_words_num_in_class
        word_prob[key] = math.log10(prob)
    return word_prob

def cal_total_words(class_dic):
    total_words = 0
    for key, val in class_dic.items():
        total_words += val
    return total_words

def get_score(prob_class, class_dic, document_list):
    score = prob_class
    for word in document_list:
        if class_dic.get(word) != None:
            score += class_dic.get(word)
    return score

In [3]:
# read specific columns from csv file
df = pd.read_csv("covid_training.csv", usecols = ['tweet_id','text', 'q1_label'])
# print(df)

# count the number of documents in each class (yes/no)
# read text in each row of training set
yes_class_num = 0
no_class_num = 0
total_class_num = 0

yes_class_text = ""
no_class_text = ""
all_text = "" # concatenate all texts to one string

for i, col in df.iterrows():
    if col['q1_label'].lower() == "yes":
        yes_class_num += 1
        yes_class_text += (col['text'].lower() + " ")
    else:
        no_class_num += 1
        no_class_text += (col['text'].lower() + " ")
    all_text += (col['text'].lower() + " ")
    total_class_num += 1

# print(all_text) #test

# probability of each class
prob_class_yes = math.log10(yes_class_num / total_class_num)
prob_class_no = math.log10(no_class_num / total_class_num)




# map punctuation to space
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 

# generate vocabulary 
# count words frequency in all_text
# remove punctuation and quotation marks
removed_punc_text = all_text.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")



################# original_vocabulary #################

# calculate words frequency for original_vocabulary
original_vocabulary = dict(Counter(removed_punc_text.split()))
# print(original_vocabulary)

# count total vocabulary words in all_text based on original vocabulary
abs_ov_length = len(original_vocabulary)
# print(abs_ov_length)




# count words frequency in yes_class_text based on original_vocabulary
removed_punc_yes_class_text = yes_class_text.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")

# calculate words frequency
yes_class_vocabulary = dict(Counter(removed_punc_yes_class_text.split()))
# print(yes_class_vocabulary)

total_words_in_yes_class_text_ov = cal_total_words(yes_class_vocabulary)
# print(total_words_in_yes_class_text_ov)

# calculate probability of each word in class yes
word_prob_yes_class = getProbDic(original_vocabulary, yes_class_vocabulary, total_words_in_yes_class_text_ov, abs_ov_length)
# print(word_prob_yes_class["the"]) # test



# count words frequency in no_class_text based on original_vocabulary
removed_punc_no_class_text = no_class_text.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")

# calculate words frequency
no_class_vocabulary = dict(Counter(removed_punc_no_class_text.split()))
# print(no_class_vocabulary)

total_words_in_no_class_text_ov = cal_total_words(no_class_vocabulary)
# print(total_words_in_no_class_text_ov)

# calculate probability of each word in class no
word_prob_no_class = getProbDic(original_vocabulary, no_class_vocabulary, total_words_in_no_class_text_ov, abs_ov_length)
# print(word_prob_no_class["the"]) # test



In [15]:
################# filtered_vocabulary #################
# calculate words frequency for filtered_vocabulary
filtered_vocabulary = {key:val for key, val in original_vocabulary.items() if val != 1}

# count total vocabulary words in all_text based on filtered vocabulary
abs_fv_length = len(filtered_vocabulary)

# test
# abs_fv_length_dupli = cal_total_words(filtered_vocabulary)
# print(abs_fv_length_dupli)

# calculate class_dic based on filtered_vocabulary
yes_class_vocabulary_fv = copy.deepcopy(yes_class_vocabulary)
for key in yes_class_vocabulary.keys():
    # remove all words that doesn't exist in filtered_vocabulary
    if filtered_vocabulary.get(key) == None:
        del yes_class_vocabulary_fv[key]


total_words_in_yes_class_text_fv = cal_total_words(yes_class_vocabulary_fv)
# print(total_words_in_yes_class_text_fv)

# calculate probability of each word in class yes
word_prob_yes_class_fv = getProbDic(filtered_vocabulary, yes_class_vocabulary_fv, total_words_in_yes_class_text_fv, abs_fv_length)



# calculate class_dic based on filtered_vocabulary
no_class_vocabulary_fv = copy.deepcopy(no_class_vocabulary)
for key in no_class_vocabulary.keys():
    # remove all words that doesn't exist in filtered_vocabulary
    if filtered_vocabulary.get(key) == None:
        del no_class_vocabulary_fv[key]


total_words_in_no_class_text_fv = cal_total_words(no_class_vocabulary_fv)
# print(total_words_in_no_class_text_fv)

# calculate probability of each word in class no
word_prob_no_class_fv = getProbDic(filtered_vocabulary, no_class_vocabulary_fv, total_words_in_no_class_text_fv, abs_fv_length)



11355
7965
3390


In [10]:
################# original_vocabulary #################

# read specific columns from csv file
df_test = pd.read_csv("covid_test_public.csv", usecols = ['tweet_id','text', 'q1_label'])



# generate trace file based on model
f = open("_NB-BOW-OV.txt","w") 
for i, col in df_test.iterrows():
    f.write(str(col['tweet_id'])+ "  ") 

    # calculate the score in each class
    document = col['text'].lower()
    # print(document+"\n")
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 
    removed_punc_document = document.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")
    word_list = removed_punc_document.split()
    # print(word_list, "\n")

    yes_score = get_score(prob_class_yes, yes_class_vocabulary, word_list)
    # no_score = get_score(prob_class_no, no_class_vocabulary, word_list)
    # print("yes_score", yes_score)
    # print("no_score", no_score)

    if yes_score > no_score:
        f.write("yes  "+ '{:.5E}'.format(yes_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "yes":
            f.write("correct\n")
        else:
            f.write("wrong\n")
    else:
        f.write("no  "+ '{:.5E}'.format(no_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "no":
            f.write("correct\n")
        else:
            f.write("wrong\n")

f.close()

['1', 'many', 'of', 'you', 'ask', 'me', 'why', 'i', 'take', 'the', 'covid', '19', 'outbreak', 'so', 'seriously', 'current', 'numbers', 'of', 'cases', 'and', 'deaths', 'are', 'not', 'why', 'e', 'a', 'thread', 'on', 'why', 'ia', 'm', 'worried', 'and', 'what', 'i', 'do', 'personally', 'in', 'this', 'situation']


yes_score 2153.791724057573
no_score 957.580870692258
['panic', 'buying', 'and', 'stockpiling', 'of', 'toilet', 'roll', 'continues', 'these', 'are', 'the', 'scenes', 'at', 'costco', 'in', 'farnborough', 'in', 'hampshire', 'today', 'coronavirus', 'toiletpaperpanic', 'panickbuying', 'https', 't', 'co', 'jllzvvs7eh']


yes_score 1600.791724057573
no_score 639.580870692258
['everyone', 'can', 'help', 'prevent', 'the', 'spread', 'of', 'covid19', 'call', 'your', 'doctor', 'if', 'you', 'develop', 'symptoms', 'have', 'been', 'in', 'close', 'contact', 'with', 'a', 'person', 'known', 'to', 'have', 'covid', '19', 'or', 'have', 'recently', 'traveled', 'from', 'an', 'area', 'with', 'widesprea

In [None]:
################# filtered_vocabulary #################

# read specific columns from csv file
df_test2 = pd.read_csv("covid_test_public.csv", usecols = ['tweet_id','text', 'q1_label'])



# generate trace file based on model
f2 = open("_NB-BOW-FV.txt","w") 
for i, col in df_test2.iterrows():
    f2.write(str(col['tweet_id'])+ "  ") 

    # calculate the score in each class
    document = col['text'].lower()
    # print(document+"\n")
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 
    removed_punc_document = document.translate(translator).replace("'"," ").replace("“"," ").replace("”"," ").replace("’"," ").replace("‘"," ")
    word_list = removed_punc_document.split()
    # print(word_list, "\n")

    yes_score = get_score(prob_class_yes, yes_class_vocabulary, word_list)
    # no_score = get_score(prob_class_no, no_class_vocabulary, word_list)
    # print("yes_score", yes_score)
    # print("no_score", no_score)

    if yes_score > no_score:
        f2.write("yes  "+ '{:.5E}'.format(yes_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "yes":
            f2.write("correct\n")
        else:
            f2.write("wrong\n")
    else:
        f2.write("no  "+ '{:.5E}'.format(no_score) + "  "+col['q1_label']+"  ")
        if col['q1_label'] == "no":
            f2.write("correct\n")
        else:
            f2.write("wrong\n")

f.close()