In [1]:
import pandas as pd
import numpy as np

# Load Training Data

In [2]:
# Read only the first three columns of the tab-separated training file;
# the preview below shows them as tweet_id, text and q1_label.
training_file = '../data/covid_training.tsv'
train_df = pd.read_csv(
    training_file,
    sep='\t',
    usecols=[0, 1, 2],
)
train_df.head()

Unnamed: 0,tweet_id,text,q1_label
0,1241025578527903750,For the average American the best way to tell ...,no
1,1240467080954228737,this is fucking bullshit,no
2,1240716889162018816,Can y’all please just follow the government’s ...,no
3,1241062514886090754,No offense but the corona virus disappearing b...,no
4,1241447017945223169,This is the face of someone who just spent 9 h...,yes


In [3]:
# Tokenise each tweet: lower-case the text, then split on whitespace.
# The token lists are copied out of train_df and re-indexed by tweet_id.
X_train = pd.Series(
    data=train_df['text'].str.lower().str.split().values.copy(),
    index=train_df['tweet_id'],
)
X_train

tweet_id
1241025578527903750    [for, the, average, american, the, best, way, ...
1240467080954228737                        [this, is, fucking, bullshit]
1240716889162018816    [can, y’all, please, just, follow, the, govern...
1241062514886090754    [no, offense, but, the, corona, virus, disappe...
1241447017945223169    [this, is, the, face, of, someone, who, just, ...
                                             ...                        
1237310167814791168    [it's, like, suicide, but, minus, the, haram, ...
1237460460749766657    [the, first, silicon, valley, death, from, cov...
1237068715314892802    [breakthrough:, chloroquine, phosphate, --, an...
1237395939171561472    [i, know, everyone, is, freaking, out, about, ...
1237404677307760640    [ebola, has, a, 50%, death, rate., if, you, go...
Length: 399, dtype: object

In [4]:
# Target labels as a categorical Series keyed by tweet_id.
# `.values` on a categorical column keeps the categorical dtype, which the
# `.cat` accessor on the next line relies on.
y_train = pd.Series(
    data=train_df['q1_label'].astype('category').values.copy(),
    index=train_df['tweet_id'],
)
# Show the integer code -> label mapping of the categorical.
print({code: label for code, label in enumerate(y_train.cat.categories)})

{0: 'no', 1: 'yes'}


# Build Model

In [5]:
# Distinct target labels found in the training data; the word-counting and
# smoothing cells below iterate over this collection.
CLASSES = y_train.unique()
CLASSES

['no', 'yes']
Categories (2, object): ['no', 'yes']

## Build Vector Space Representation

In [6]:
from collections import Counter

# Bag-of-words matrix: one row per tweet, one column per distinct token,
# each cell the number of times the token occurs in that tweet.
# Tokens absent from a tweet come out as NaN and are filled with 0.
vec_space_df = pd.DataFrame(
    [Counter(tokens) for tokens in X_train],
    index=X_train.index,
).fillna(0)
vec_space_df.iloc[:5, -5:]

Unnamed: 0_level_0,obviously,ebolaâbut,34x,https://t.co/bcyoeedray,https://t.co/ga2rdghpiq
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1241025578527903750,0.0,0.0,0.0,0.0,0.0
1240467080954228737,0.0,0.0,0.0,0.0,0.0
1240716889162018816,0.0,0.0,0.0,0.0,0.0
1241062514886090754,0.0,0.0,0.0,0.0,0.0
1241447017945223169,0.0,0.0,0.0,0.0,0.0


## Class Probability

In [7]:
# log10 prior of each class: log10(number of tweets with the label / number of tweets).
class_log_proba = np.log10(y_train.value_counts() / y_train.shape[0])

## Vocabulary Probability

In [8]:
# P(w | c) = freq / total
# Empty float containers for the statistics over every word seen in training:
#   ov_word_freqs  - filled column-by-column (one column per class) in the counting cell
#   ov_word_totals - filled entry-by-entry (one entry per class) in the counting cell
ov_word_freqs = pd.DataFrame(dtype=np.float64)
ov_word_totals = pd.Series(dtype=np.float64)

In [9]:
# Empty float containers for the statistics over the frequency-filtered words.
# NOTE(review): fv_word_freqs is rebound to a filtered copy of ov_word_freqs in
# the filtering cell, so only fv_word_totals is actually filled in place.
fv_word_freqs = pd.DataFrame(dtype=np.float64)
fv_word_totals = pd.Series(dtype=np.float64)

### Count Words

In [10]:
for label in CLASSES:
    # Per-word frequency for this class: column-wise sum over the tweets
    # whose target equals the class.
    ov_word_freqs[label] = vec_space_df.loc[y_train == label].sum()
    # Total number of word occurrences for this class.
    ov_word_totals[label] = ov_word_freqs[label].sum()
print(ov_word_freqs)
print(ov_word_totals)

                            no    yes
for                       34.0   90.0
the                      132.0  321.0
average                    1.0    0.0
american                   4.0    7.0
best                       4.0    2.0
...                        ...    ...
obviously                  0.0    1.0
ebolaâbut                0.0    1.0
34x                        0.0    1.0
https://t.co/bcyoeedray    0.0    1.0
https://t.co/ga2rdghpiq    0.0    1.0

[4304 rows x 2 columns]
no     3741.0
yes    8844.0
dtype: float64


In [11]:
# Filtered vocabulary: keep only the words whose frequency summed over all
# classes is at least MIN_WORD_FREQ. (The mask is True for the rows that are
# KEPT, hence the "at least" name.)
MIN_WORD_FREQ = 2
mask_at_least_min_freq = ov_word_freqs.sum(axis=1) >= MIN_WORD_FREQ
fv_word_freqs = ov_word_freqs.loc[mask_at_least_min_freq].copy()
print(fv_word_freqs)
# Per-class totals are recomputed over the retained words only.
for c in CLASSES:
    fv_word_totals[c] = fv_word_freqs[c].sum()
print(fv_word_totals)

                no    yes
for           34.0   90.0
the          132.0  321.0
american       4.0    7.0
best           4.0    2.0
way            5.0    6.0
...            ...    ...
ng             0.0    2.0
na             0.0    2.0
60s.           0.0    2.0
chloroquine    0.0    2.0
rate.          0.0    2.0

[1193 rows x 2 columns]
no     2833.0
yes    6641.0
dtype: float64


### Smoothing

In [12]:
def smooth(freqs: pd.DataFrame, totals: pd.Series, classes, delta: float):
    """Apply additive (add-delta) smoothing to per-class word counts.

    Parameters
    ----------
    freqs : pd.DataFrame
        Word frequencies, one row per word and one column per class.
    totals : pd.Series
        Total word count per class, indexed by class label.
    classes : iterable
        Class labels to smooth; each must be a column of ``freqs`` and a
        label of ``totals``.
    delta : float
        Amount added to every word frequency.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        Smoothed copies of ``freqs`` and ``totals``. The inputs are not
        modified.
    """
    freqs_ = freqs.copy()
    totals_ = totals.copy()
    # Every word in the vocabulary gains `delta`, so each class total grows
    # by delta * |V|. The amount is the same for every class.
    added_mass = delta * len(freqs_)
    for c in classes:
        freqs_[c] += delta
        totals_[c] += added_mass
    return freqs_, totals_

In [13]:
# Additive smoothing constant shared by both vocabularies.
delta = 0.01
ov_smoothed_freqs, ov_smoothed_totals = smooth(
    freqs=ov_word_freqs, totals=ov_word_totals, classes=CLASSES, delta=delta
)
fv_smoothed_freqs, fv_smoothed_totals = smooth(
    freqs=fv_word_freqs, totals=fv_word_totals, classes=CLASSES, delta=delta
)
# Sanity check: the smallest smoothed frequency per class, and the new totals.
print(ov_smoothed_freqs.min())
print(ov_smoothed_totals)

no     0.01
yes    0.01
dtype: float64
no     3784.04
yes    8887.04
dtype: float64


### Word Probability

In [14]:
# log10 P(w | c): divide each class column of the smoothed frequencies by that
# class's smoothed total (the Series is matched against the column labels).
ov_model = np.log10(ov_smoothed_freqs.div(ov_smoothed_totals, axis='columns'))
fv_model = np.log10(fv_smoothed_freqs.div(fv_smoothed_totals, axis='columns'))

# Predict

## Define Functions

### Trace Files

In [15]:
def predict_single(model: pd.DataFrame, class_log_proba: pd.Series, word_list: list):
    """Classify one tokenised document with the Naive Bayes model.

    Parameters
    ----------
    model : pd.DataFrame
        Log-probabilities of each word given each class; rows are words
        (the vocabulary), columns are class labels.
    class_log_proba : pd.Series
        Log prior of each class, indexed by class label.
    word_list : iterable of str
        Tokens of the document. Tokens that are not in the model's index are
        ignored; repeated tokens are counted once per occurrence.

    Returns
    -------
    (label, str)
        The class with the highest score and that score formatted as
        ``'.2E'`` scientific notation. Ties go to the class that comes first
        in ``class_log_proba.index``.
    """
    vocabulary = model.index
    # Filter once, up front: the set of known tokens is the same for every
    # class, and materialising it means a one-shot iterable is not exhausted
    # after the first class has been scored.
    known_words = [word for word in word_list if word in vocabulary]

    scores_per_class = {}
    for c in class_log_proba.index:
        # score(c) = log P(c) + sum over known tokens of log P(w | c)
        scores_per_class[c] = class_log_proba[c] + sum(
            model.loc[word, c] for word in known_words
        )

    # class with max score
    best = max(scores_per_class, key=scores_per_class.get)
    return best, format(scores_per_class[best], '.2E')  # class, score

def predict(model: pd.DataFrame, class_log_proba: pd.Series, X):
    """Classify every document in ``X``.

    ``X`` is a Series of token lists. The result is a DataFrame with the same
    index as ``X`` and two columns: ``pred`` (predicted class) and ``score``
    (the formatted score returned by ``predict_single``).
    """
    rows = [predict_single(model, class_log_proba, tokens) for tokens in X]
    return pd.DataFrame(rows, index=X.index, columns=['pred', 'score'])

def trace_predict(model: pd.DataFrame, class_log_proba: pd.Series, X, y, path):
    """Predict every document in ``X``, compare with ``y`` and write a trace file.

    Parameters
    ----------
    model, class_log_proba
        Passed through to ``predict``.
    X : pd.Series
        Token lists to classify.
    y : pd.Series
        Expected labels; assigned as a column, so they are aligned to the
        predictions by index.
    path : str
        Output file. One line per document is written, with the index and
        the pred, score, target and verdict columns separated by two spaces.

    Returns
    -------
    pd.DataFrame
        The predictions with the added ``target`` and ``verdict`` columns.
    """
    verdict_labels = {True: 'correct', False: 'wrong'}

    trace_df_ = predict(model, class_log_proba, X)
    trace_df_['target'] = y
    is_correct = trace_df_['target'] == trace_df_['pred']
    trace_df_['verdict'] = is_correct.map(verdict_labels)

    # reset_index() turns the document index into the first output column.
    np.savetxt(path, trace_df_.reset_index(), delimiter='  ', fmt='%s')
    return trace_df_

### Overall Evaluation Files

In [16]:
# Fixed class order used by the metric functions and the report header below
# ('yes' first, then 'no').
# NOTE(review): this rebinds the CLASSES that was set from y_train.unique() in
# the model-building section; cells re-run after this one see the tuple instead.
CLASSES = ('yes', 'no')
def accuracy_score(y_test: pd.Series, y_pred: pd.Series):
    """Return the fraction of predictions that equal their target.

    Parameters
    ----------
    y_test : pd.Series
        Expected labels.
    y_pred : pd.Series
        Predicted labels, element-wise comparable with ``y_test``.

    Returns
    -------
    float
        Number of matches divided by ``len(y_pred)``; ``0.0`` when there are
        no predictions.
    """
    count_pred = len(y_pred)
    if count_pred == 0:
        return 0.0
    # Summing the boolean mask counts the matches and is simply 0 when nothing
    # matches (looking up `True` in value_counts() raises KeyError there).
    return (y_test == y_pred).sum() / count_pred

def precision_scores(y_test: pd.Series, y_pred: pd.Series, classes=None):
    """Per-class precision: true positives / number of predictions of the class.

    Parameters
    ----------
    y_test : pd.Series
        Expected labels.
    y_pred : pd.Series
        Predicted labels, sharing the index of ``y_test``.
    classes : iterable, optional
        Class labels to score, in output order. Defaults to the module-level
        ``CLASSES``.

    Returns
    -------
    pd.Series
        Float precision per class, indexed by class label. A class that is
        never predicted gets ``0.0`` (the ratio is undefined there).
    """
    if classes is None:
        classes = CLASSES
    classes = list(classes)

    scores = {}
    for c in classes:
        predicted_c = y_pred == c
        count_positive_pred = predicted_c.sum()
        # Count hits with a boolean sum so that "no hits" is 0, not a KeyError.
        true_positives = (predicted_c & (y_test == c)).sum()
        scores[c] = true_positives / count_positive_pred if count_positive_pred else 0.0
    return pd.Series(scores, index=classes, dtype=np.float64)

def recall_scores(y_test: pd.Series, y_pred: pd.Series, classes=None):
    """Per-class recall: true positives / number of targets of the class.

    Parameters
    ----------
    y_test : pd.Series
        Expected labels.
    y_pred : pd.Series
        Predicted labels, sharing the index of ``y_test``.
    classes : iterable, optional
        Class labels to score, in output order. Defaults to the module-level
        ``CLASSES``.

    Returns
    -------
    pd.Series
        Float recall per class, indexed by class label. A class that never
        occurs in ``y_test`` gets ``0.0`` (the ratio is undefined there).
    """
    if classes is None:
        classes = CLASSES
    classes = list(classes)

    scores = {}
    for c in classes:
        actual_c = y_test == c
        count_positive_target = actual_c.sum()
        # Count hits with a boolean sum so that "no hits" is 0, not a KeyError.
        true_positives = (actual_c & (y_pred == c)).sum()
        scores[c] = true_positives / count_positive_target if count_positive_target else 0.0
    return pd.Series(scores, index=classes, dtype=np.float64)

def f1_scores(y_test: pd.Series, y_pred: pd.Series):
    """Per-class F1: harmonic mean of precision and recall.

    Returns a Series indexed like the results of ``precision_scores`` and
    ``recall_scores``. Where precision + recall is 0 the harmonic mean is
    undefined (0/0); ``0.0`` is reported there instead of NaN.
    """
    prec = precision_scores(y_test, y_pred)
    reca = recall_scores(y_test, y_pred)
    denom = prec + reca
    f1 = 2 * prec * reca / denom
    # Replace only the 0/0 entries; every other value is kept untouched.
    return f1.where(denom != 0, 0.0)

def eval_model(y_test, y_pred, path):
    """Compute the evaluation report, write it to ``path`` and return it as text.

    The file receives four lines: the accuracy, then one line each for
    precision, recall and F1 holding the per-class values in ``CLASSES``
    order. The returned string is the same content preceded by a header line
    naming the classes; the header is not written to the file.
    """
    banner = '{:5s}\t{:5s}\n'.format(*CLASSES)
    row_template = '{:.4f}  {:.4f}\n'

    report = ['{:.4f}\n'.format(accuracy_score(y_test, y_pred))]
    for metric in (precision_scores, recall_scores, f1_scores):
        per_class = metric(y_test, y_pred).tolist()
        report.append(row_template.format(*per_class))

    with open(path, 'w') as out:
        out.writelines(report)
    return banner + ''.join(report)

## Load Test Data

In [17]:
# The test file is read with the training frame's column names supplied
# explicitly; the first three columns are kept.
testing_file = '../data/covid_test_public.tsv'
test_df = pd.read_csv(
    testing_file,
    sep='\t',
    usecols=[0, 1, 2],
    names=train_df.columns,
)
# Same preprocessing as for the training data: lower-case, whitespace-split
# token lists and categorical labels, both keyed by tweet_id.
X_test = pd.Series(
    data=test_df['text'].str.lower().str.split().values.copy(),
    index=test_df['tweet_id'],
)
y_test = pd.Series(
    data=test_df['q1_label'].astype('category').values.copy(),
    index=test_df['tweet_id'],
)
test_df.head()

Unnamed: 0,tweet_id,text,q1_label
0,1236932313642143745,1/ Many of you ask me why I take the COVID-19 ...,no
1,1236289649737371648,Panic buying and stockpiling of toilet roll co...,yes
2,1237501242567544835,Everyone can help prevent the spread of #COVID...,no
3,1237029903112888321,Isa sa pinakamaganda mong maa-ambag about COVI...,no
4,1237270988334592001,BREAKING: Department of Health announces 11 mo...,yes


## NB-BOW-OV Outputs

In [18]:
# Full-vocabulary model: write the per-tweet trace, then the evaluation report.
ov_trace = trace_predict(
    model=ov_model,
    class_log_proba=class_log_proba,
    X=X_test,
    y=y_test,
    path='../out/trace_NB-BOW-OV.txt',
)
print(eval_model(y_test=ov_trace['target'], y_pred=ov_trace['pred'], path='../out/eval_NB-BOW-OV.txt'))
ov_trace.head()

yes  	no   
0.6727
0.6667  0.7000
0.9091  0.3182
0.7692  0.4375



Unnamed: 0_level_0,pred,score,target,verdict
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1236932313642143745,no,-93.3,no,correct
1236289649737371648,yes,-33.0,yes,correct
1237501242567544835,yes,-105.0,no,wrong
1237029903112888321,yes,-67.9,no,wrong
1237270988334592001,yes,-47.2,yes,correct


## NB-BOW-FV Outputs

In [19]:
# Frequency-filtered model: write the per-tweet trace, then the evaluation report.
fv_trace = trace_predict(
    model=fv_model,
    class_log_proba=class_log_proba,
    X=X_test,
    y=y_test,
    path='../out/trace_NB-BOW-FV.txt',
)
print(eval_model(y_test=fv_trace['target'], y_pred=fv_trace['pred'], path='../out/eval_NB-BOW-FV.txt'))
fv_trace.head()

yes  	no   
0.7455
0.7209  0.8333
0.9394  0.4545
0.8158  0.5882



Unnamed: 0_level_0,pred,score,target,verdict
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1236932313642143745,yes,-70.4,no,wrong
1236289649737371648,yes,-27.5,yes,correct
1237501242567544835,yes,-89.0,no,wrong
1237029903112888321,yes,-61.4,no,wrong
1237270988334592001,yes,-37.5,yes,correct
