In [1]:
import numpy as np
import preprocess.prepare as prepare
import preprocess.word_bag as word_bag
import model.naive_bayes as naive_bayes
import model.nn as nn
import model.trivial as trivial
import metrics.metrics as metrics
from utils.cache import cached
from utils.data import train_dev_split
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


# Part 1: Data Processing

In [2]:
# Obtain the CratesData object through `cached`, keyed by "crates_data.pkl"
# (the saved output of this cell shows it being loaded from the cache directory).
data = cached(
    lambda: prepare.CratesData(),
    "crates_data.pkl",
)

Loading cached object from .cs229_cache\crates_data.pkl


In [3]:
# Select the crates to work with: keep only those whose `category_indices` is truthy
# (presumably "has at least one category" -- verify against prepare.CratesData),
# then display how many remain.
crates = list(filter(lambda crate: crate.category_indices, data.id2crates.values()))
len(crates)

32870

In [4]:
# Build the word-bag dictionary and the (X, y) pair, both routed through `cached`,
# and convert X and y to float32.
# NOTE(review): the cache file is named "wb10000.pkl" while this notebook reports
# len(dic) == 42912 -- confirm the cached pair was produced with the current dictionary.
dic = cached(lambda: word_bag.create_dictionary(crates), "dictionary.pkl")
X, y = (
    part.astype(np.float32)
    for part in cached(
        lambda: word_bag.transform_crate(crates, dic, len(data.categories)),
        "wb10000.pkl",
    )
)

Loading cached object from .cs229_cache\dictionary.pkl
Loading cached object from .cs229_cache\wb10000.pkl


In [6]:
# Number of entries in the dictionary returned by word_bag.create_dictionary.
len(dic)

42912

In [74]:
# Partition X and y into train and dev parts (train_ratio=0.8) with a fixed seed
# so the split is the same on every run.
X_train, y_train, X_dev, y_dev = train_dev_split(
    X,
    y,
    train_ratio=0.8,
    seed=0,
)

# Part 2: Model Implementation

### Model 0a: Always false

In [75]:
# Baseline 0a: predict on the train split, then the dev split, using a freshly
# constructed AlwaysFalseModel for each (same order as running them one by one).
pred_a0_train, pred_a0_dev = (
    trivial.AlwaysFalseModel(len(data.categories)).predict(features)
    for features in (X_train, X_dev)
)

In [78]:
# Score the always-false baseline against the train labels and print every
# entry of the returned mapping as "name: value".
metric_a0_train = metrics.evaluate_performance(y_train, pred_a0_train)
for name, score in metric_a0_train.items():
    print(f"{name}: {score!s}")

exact_match_ratio: 0.0
hamming_score: 0.9775566032244919
precision_micro: 1.0
precision_macro: 1.0
precision_weighted: 1.0
precision_samples: 1.0
recall_micro: 0.0
recall_macro: 0.024096385542168676
recall_weighted: 0.0
recall_samples: 0.0
f1_micro: 0.0
f1_macro: 0.024096385542168676
f1_weighted: 0.0
f1_samples: 0.0


In [79]:
# Score the always-false baseline against the dev labels and print every
# entry of the returned mapping as "name: value".
metric_a0_dev = metrics.evaluate_performance(y_dev, pred_a0_dev)
for name, score in metric_a0_dev.items():
    print(f"{name}: {score!s}")


exact_match_ratio: 0.0
hamming_score: 0.9778200760997303
precision_micro: 1.0
precision_macro: 1.0
precision_weighted: 1.0
precision_samples: 1.0
recall_micro: 0.0
recall_macro: 0.04819277108433735
recall_weighted: 0.0
recall_samples: 0.0
f1_micro: 0.0
f1_macro: 0.04819277108433735
f1_weighted: 0.0
f1_samples: 0.0


### Model 0b: Random Labeling

In [82]:
# Baseline 0b: predict on the train split, then the dev split, using a freshly
# constructed RandomLabelingModel for each.
# NOTE(review): no seed is passed here; unless the model seeds itself internally,
# the metrics computed from these predictions will change between runs -- TODO confirm.
pred_randl_train, pred_randl_dev = (
    trivial.RandomLabelingModel(len(data.categories)).predict(features)
    for features in (X_train, X_dev)
)

In [83]:
# Score the random-labeling baseline against the train labels and print every
# entry of the returned mapping as "name: value".
metric_randl_train = metrics.evaluate_performance(y_train, pred_randl_train)
for name, score in metric_randl_train.items():
    print(f"{name}: {score!s}")

exact_match_ratio: 0.0
hamming_score: 0.5005007857517024
precision_micro: 0.02247302035044414
precision_macro: 0.02245827275803694
precision_weighted: 0.05814838632246531
precision_samples: 0.022478993639010892
recall_micro: 0.5001659751037344
recall_macro: 0.5159583284465401
recall_weighted: 0.5001659751037344
recall_samples: 0.5006460547504026
f1_micro: 0.04301340020500413
f1_macro: 0.04048267767100961
f1_weighted: 0.09891511518339502
f1_samples: 0.04250368941827535


In [84]:
# Score the random-labeling baseline against the dev labels and print every
# entry of the returned mapping as "name: value".
metric_randl_dev = metrics.evaluate_performance(y_dev, pred_randl_dev)
for name, score in metric_randl_dev.items():
    print(f"{name}: {score!s}")

exact_match_ratio: 0.0
hamming_score: 0.4997606751010845
precision_micro: 0.022320481470315658
precision_macro: 0.022329784307113885
precision_weighted: 0.059215327591118014
precision_samples: 0.022329664869564968
recall_micro: 0.5035687295322865
recall_macro: 0.5380338541300888
recall_weighted: 0.5035687295322865
recall_samples: 0.4998402638223321
f1_micro: 0.042746252485868864
f1_macro: 0.04021593100385741
f1_weighted: 0.1002126393371743
f1_samples: 0.04223117544189479


### Model 0c: Random Assigning

In [100]:
# Baseline 0c: predict on the train split, then the dev split, using a freshly
# constructed RandomAssigningModel for each.
# NOTE(review): no seed is passed here; unless the model seeds itself internally,
# the metrics computed from these predictions will change between runs -- TODO confirm.
pred_randa_train, pred_randa_dev = (
    trivial.RandomAssigningModel(len(data.categories)).predict(features)
    for features in (X_train, X_dev)
)

In [101]:
# Score the random-assigning baseline against the train labels and print every
# entry of the returned mapping as "name: value".
metric_randa_train = metrics.evaluate_performance(y_train, pred_randa_train)
for name, score in metric_randa_train.items():
    print(f"{name}: {score!s}")

exact_match_ratio: 0.005449275362318841
hamming_score: 0.9660364355974622
precision_micro: 0.02191304347826087
precision_macro: 0.022194974997056856
precision_weighted: 0.056692731508664246
precision_samples: 0.02191304347826087
recall_micro: 0.011763485477178422
recall_macro: 0.03581430874503741
recall_weighted: 0.011763485477178422
recall_samples: 0.01186731078904992
f1_micro: 0.015308808639891999
f1_macro: 0.011841893949020075
f1_weighted: 0.01718443573130769
f1_samples: 0.014533977455716584


In [102]:
# Score the random-assigning baseline against the dev labels and print every
# entry of the returned mapping as "name: value".
metric_randa_dev = metrics.evaluate_performance(y_dev, pred_randa_dev)
for name, score in metric_randa_dev.items():
    print(f"{name}: {score!s}")

exact_match_ratio: 0.005101252125521719
hamming_score: 0.9662821947862558
precision_micro: 0.021177925490802288
precision_macro: 0.02107686675449831
precision_weighted: 0.0568092820978618
precision_samples: 0.021177925490802288
recall_micro: 0.011503904609958855
recall_macro: 0.058343311704787876
recall_weighted: 0.011503904609958855
recall_samples: 0.011068171278404699
f1_micro: 0.014909130482098161
f1_macro: 0.010855000299093892
f1_weighted: 0.017067795792531208
f1_samples: 0.013665172360488484


### Model 1: Naive Bayes

In [85]:
# Create the Naive Bayes model, sized by the number of categories in `data`,
# and fit it on the train split.
model_nb = naive_bayes.NaiveBayesModel(
    num_categories=len(data.categories),
)
model_nb.fit(X_train, y_train)

Fitting model:   0%|          | 0/83 [00:00<?, ?it/s]Fitting model:   1%|          | 1/83 [00:04<06:39,  4.87s/it]Fitting model:   2%|▏         | 2/83 [00:08<05:41,  4.22s/it]Fitting model:   4%|▎         | 3/83 [00:12<05:20,  4.01s/it]Fitting model:   5%|▍         | 4/83 [00:16<05:21,  4.07s/it]Fitting model:   6%|▌         | 5/83 [00:20<05:00,  3.86s/it]Fitting model:   7%|▋         | 6/83 [00:24<05:00,  3.90s/it]Fitting model:   8%|▊         | 7/83 [00:27<04:52,  3.85s/it]Fitting model:  10%|▉         | 8/83 [00:31<04:42,  3.77s/it]Fitting model:  11%|█         | 9/83 [00:34<04:32,  3.69s/it]Fitting model:  12%|█▏        | 10/83 [00:38<04:20,  3.57s/it]Fitting model:  13%|█▎        | 11/83 [00:41<04:13,  3.52s/it]Fitting model:  14%|█▍        | 12/83 [00:45<04:08,  3.49s/it]Fitting model:  16%|█▌        | 13/83 [00:48<04:02,  3.46s/it]Fitting model:  17%|█▋        | 14/83 [00:51<03:57,  3.45s/it]Fitting model:  18%|█▊        | 15/83 [00:55<03:52,  3.42s/it]Fitting mo

In [86]:
# Run the fitted Naive Bayes model on the train split, then on the dev split.
# NOTE(review): in the saved run, evaluating pred_nb_train raised
# "inconsistent numbers of samples: [25875, 3]", which suggests predict() returned
# something of length 3 rather than one row per sample -- check its return value.
pred_nb_train, pred_nb_dev = (
    model_nb.predict(features) for features in (X_train, X_dev)
)

Predicting categories:   0%|          | 0/83 [00:00<?, ?it/s]Predicting categories:   1%|          | 1/83 [00:00<00:20,  4.05it/s]Predicting categories:   2%|▏         | 2/83 [00:00<00:14,  5.43it/s]Predicting categories:   4%|▎         | 3/83 [00:00<00:12,  6.42it/s]Predicting categories:   5%|▍         | 4/83 [00:00<00:10,  7.43it/s]Predicting categories:   8%|▊         | 7/83 [00:00<00:06, 11.49it/s]Predicting categories:  11%|█         | 9/83 [00:00<00:06, 11.37it/s]Predicting categories:  13%|█▎        | 11/83 [00:01<00:06, 11.49it/s]Predicting categories:  16%|█▌        | 13/83 [00:01<00:05, 11.68it/s]Predicting categories:  18%|█▊        | 15/83 [00:01<00:06, 10.70it/s]Predicting categories:  20%|██        | 17/83 [00:01<00:06, 10.49it/s]Predicting categories:  23%|██▎       | 19/83 [00:01<00:06, 10.54it/s]Predicting categories:  27%|██▋       | 22/83 [00:02<00:04, 12.44it/s]Predicting categories:  29%|██▉       | 24/83 [00:02<00:05, 11.67it/s]Predicting categories

In [87]:
# Score the Naive Bayes predictions against the train labels and print every
# entry of the returned mapping as "name: value".
# NOTE(review): the saved run of this cell raised
# "ValueError: Found input variables with inconsistent numbers of samples: [25875, 3]";
# the length of pred_nb_train apparently does not match y_train -- fix before relying on this cell.
metric_nb_train = metrics.evaluate_performance(y_train, pred_nb_train)
for name, score in metric_nb_train.items():
    print(f"{name}: {score!s}")

ValueError: Found input variables with inconsistent numbers of samples: [25875, 3]

In [None]:
# Score the Naive Bayes predictions against the dev labels and print every
# entry of the returned mapping as "name: value".
# NOTE(review): this cell has no recorded execution or output in the saved notebook.
metric_nb_dev = metrics.evaluate_performance(y_dev, pred_nb_dev)
for name, score in metric_nb_dev.items():
    print(f"{name}: {score!s}")

### Model 2: Logistic Regression

In [95]:
# Create a DNNModel wrapper (learning rate 1e-4, reg 0.0) and fit it with the
# nn.LogisticRegression network for 20 epochs; the dev split is passed alongside
# the train split (the saved log reports val_loss / val_hamming_dist per epoch).
model_lr = nn.DNNModel(
    num_categories=len(data.categories),
    learning_rate=1e-4,
    reg=0.0,
)
model_lr.fit(nn.LogisticRegression, X_train, y_train, X_dev, y_dev, epochs=20)

Epoch: 0, Loss: 0.0050, val_loss: 0.0040, Hamming Distance: 0.0202, val_hamming_dist: 0.0204
Epoch: 1, Loss: 0.0035, val_loss: 0.0034, Hamming Distance: 0.0194, val_hamming_dist: 0.0225
Epoch: 2, Loss: 0.0029, val_loss: 0.0032, Hamming Distance: 0.0168, val_hamming_dist: 0.0193
Epoch: 3, Loss: 0.0025, val_loss: 0.0029, Hamming Distance: 0.0158, val_hamming_dist: 0.0212
Epoch: 4, Loss: 0.0023, val_loss: 0.0028, Hamming Distance: 0.0146, val_hamming_dist: 0.0182
Epoch: 5, Loss: 0.0020, val_loss: 0.0025, Hamming Distance: 0.0128, val_hamming_dist: 0.0192
Epoch: 6, Loss: 0.0018, val_loss: 0.0024, Hamming Distance: 0.0122, val_hamming_dist: 0.0175
Epoch: 7, Loss: 0.0016, val_loss: 0.0022, Hamming Distance: 0.0099, val_hamming_dist: 0.0176
Epoch: 8, Loss: 0.0015, val_loss: 0.0022, Hamming Distance: 0.0098, val_hamming_dist: 0.0170
Epoch: 9, Loss: 0.0013, val_loss: 0.0021, Hamming Distance: 0.0084, val_hamming_dist: 0.0169
Epoch: 10, Loss: 0.0012, val_loss: 0.0020, Hamming Distance: 0.0084, v

In [96]:
# Run the fitted logistic-regression model on the train split, then on the dev split.
pred_lr_train, pred_lr_dev = (
    model_lr.predict(features) for features in (X_train, X_dev)
)

In [97]:
# Score the logistic-regression predictions against the train labels and print
# every entry of the returned mapping as "name: value".
metric_lr_train = metrics.evaluate_performance(y_train, pred_lr_train)
for name, score in metric_lr_train.items():
    print(f"{name}: {score!s}")

exact_match_ratio: 0.7102222222222222
hamming_score: 0.9945740061696059
precision_micro: 0.9738855320142112
precision_macro: 0.9817735535727087
precision_weighted: 0.9742572750838704
precision_samples: 0.9777726247987115
recall_micro: 0.7791286307053942
recall_macro: 0.7426233315575526
recall_weighted: 0.7791286307053942
recall_samples: 0.7781745571658615
f1_micro: 0.8656885005935846
f1_macro: 0.8415025022384366
f1_weighted: 0.8644996397455661
f1_samples: 0.787699638203986


In [98]:
# Score the logistic-regression predictions against the dev labels and print
# every entry of the returned mapping as "name: value".
metric_lr_dev = metrics.evaluate_performance(y_dev, pred_lr_dev)
for name, score in metric_lr_dev.items():
    print(f"{name}: {score!s}")

exact_match_ratio: 0.2731488638120266
hamming_score: 0.9832807811862692
precision_micro: 0.7268647477561127
precision_macro: 0.7246169115142638
precision_weighted: 0.71787201141928
precision_samples: 0.8530813010523941
recall_micro: 0.39440759089764044
recall_macro: 0.31646299059637334
recall_weighted: 0.39440759089764044
recall_samples: 0.4136780543103004
f1_micro: 0.5113494093952424
f1_macro: 0.41134598122975663
f1_weighted: 0.4990349950199903
f1_samples: 0.41422893045109405


### Model 3: Neural Network

In [93]:
# Create a DNNModel wrapper (learning rate 1e-4, reg 0.0) and fit it with the
# nn.MyNet network for 20 epochs, passing the dev split alongside the train split.
# NOTE(review): the saved run of this cell raised
# "TypeError: unsupported format string passed to tuple.__format__", and the
# prediction/evaluation cells after it carry lower execution counts, so their
# outputs predate this failed run. Fix the error and re-run them in order.
model_nn = nn.DNNModel(
    num_categories=len(data.categories),
    learning_rate=1e-4,
    reg=0.0,
)
model_nn.fit(nn.MyNet, X_train, y_train, X_dev, y_dev, epochs=20)

TypeError: unsupported format string passed to tuple.__format__

In [89]:
# Run the neural-network model on the train split, then on the dev split.
# NOTE(review): relies on `model_nn` coming from a successful fit; the saved
# fit cell shows an error, so re-run in order before trusting these predictions.
pred_nn_train, pred_nn_dev = (
    model_nn.predict(features) for features in (X_train, X_dev)
)

In [90]:
# Score the neural-network predictions against the train labels and print
# every entry of the returned mapping as "name: value".
metric_nn_train = metrics.evaluate_performance(y_train, pred_nn_train)
for name, score in metric_nn_train.items():
    print(f"{name}: {score!s}")

exact_match_ratio: 0.7337971014492753
hamming_score: 0.9946908794598684
precision_micro: 0.8958306441203047
precision_macro: 0.9063990462515596
precision_weighted: 0.9067004409559278
precision_samples: 0.9224912353347136
recall_micro: 0.8639004149377594
recall_macro: 0.7477845210964775
recall_weighted: 0.8639004149377594
recall_samples: 0.894112077294686
f1_micro: 0.8795758433493166
f1_macro: 0.7933051891740572
f1_weighted: 0.875876399563169
f1_samples: 0.8851302078490484


In [91]:
# Score the neural-network predictions against the dev labels and print
# every entry of the returned mapping as "name: value".
metric_nn_dev = metrics.evaluate_performance(y_dev, pred_nn_dev)
for name, score in metric_nn_dev.items():
    print(f"{name}: {score!s}")

exact_match_ratio: 0.30514762714484467
hamming_score: 0.9817908207260948
precision_micro: 0.6009852216748769
precision_macro: 0.6604618164519468
precision_weighted: 0.6310058768211979
precision_samples: 0.6636184292854567
recall_micro: 0.5327063565370728
recall_macro: 0.41715158266001395
recall_weighted: 0.5327063565370728
recall_samples: 0.5693048899881487
f1_micro: 0.5647896728243935
f1_macro: 0.46691600584627413
f1_weighted: 0.5512077467143873
f1_samples: 0.5392691350490083
