# Summary

This notebook is for evaluate the best model on testing dataset

- decision tree: gini + 200 depth
- KNN: 1 nearest neighboors with distance
- SVM: 1000 + linear
- Ada-boost: decision tree 10 depth + 100 trees
- NN: dense 64 layer with RELU

Get the test data

In [1]:
SEED = 123
PROJECT_PATH = ".."
from product_matcher.utils import get_config
from product_matcher.utils import loader, load_data
import os


cfg = get_config(overrides=['experiments=problem1/neuralnet'])
train, test, X_train, Y_train, X_test, Y_test = load_data(cfg, PROJECT_PATH)

2023-02-12 20:02:59.873375: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


1. decision tree

In [2]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=200, criterion='gini', random_state=SEED)
clf = clf.fit(X_train, Y_train)
clf_acc = clf.score(X_test, Y_test)

0.9507101673463648

In [4]:
test['clf_prediction'] = clf.predict(X_test)

2. SVM on 10,000 data

In [6]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc = svc.fit(X_train, Y_train)
svc_acc = svc.score(X_test, Y_test)

0.9620306567290113

In [14]:
test['svm_prediction'] = svc.predict(X_test)

3. AdaBoost

In [7]:
from sklearn.ensemble import AdaBoostClassifier
dt = tree.DecisionTreeClassifier(max_depth=10, criterion='gini', random_state=SEED)
ada = AdaBoostClassifier(dt, n_estimators=100)
ada = ada.fit(X_train, Y_train)
ada_acc = ada.score(X_test, Y_test)

0.9561946280410631

In [15]:
test['adaboost_prediction'] = ada.predict(X_test)

4. KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1, weights='distance')
knn = knn.fit(X_train, Y_train)
knn_acc = knn.score(X_test, Y_test)

0.9557727464491632

In [16]:
test['knn_prediction'] = knn.predict(X_test)

5. NeuralNet

In [17]:
import tensorflow as tf
x = tf.convert_to_tensor(X_train.todense())
x_test = tf.convert_to_tensor(X_test.todense())

2023-02-12 20:26:32.747759: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [63]:
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras import Sequential
from product_matcher.utils import timecallback

model = Sequential([
    InputLayer((x.shape[1], )),
    Dense(units=64, activation='relu'),
    Dense(units=1, activation='sigmoid')
])
earystop = tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', patience=4)
timer = timecallback()
# model compile
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             optimizer = "adam", 
             metrics = ["BinaryAccuracy", "Precision", "Recall"]
             )

history = model.fit(
    x=x, 
    y=Y_train, 
    batch_size=24,
    validation_split=.2,
    epochs = 10, 
    callbacks=[timer, earystop])

Epoch 1/10


  output, from_logits = _get_logits(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [65]:
_, nn_acc, _, _ = model.evaluate(x_test, Y_test)



In [70]:
import numpy as np
test['nn_prediction'] = (model.predict(x_test)>.5).astype("int32")



In [75]:
import pandas as pd

test_report = pd.DataFrame.from_dict(
    {'decision tree':clf_acc, 'SVM': svc_acc, 'Ada Boost': ada_acc, 'KNN': knn_acc, "NeuralNet": nn_acc},
    orient='index',columns=['Test Accuracy']
)

In [76]:
from sklearn.metrics import recall_score, precision_score, f1_score

test_report['test recall'] = [
    recall_score(test['clf_prediction'], test['label']),
    recall_score(test['svm_prediction'], test['label']),
    recall_score(test['adaboost_prediction'], test['label']),
    recall_score(test['knn_prediction'], test['label']),
    recall_score(test['nn_prediction'], test['label'])
]

test_report['test precision'] = [
    precision_score(test['clf_prediction'], test['label']),
    precision_score(test['svm_prediction'], test['label']),
    precision_score(test['adaboost_prediction'], test['label']),
    precision_score(test['knn_prediction'], test['label']),
    precision_score(test['nn_prediction'], test['label'])
]
test_report['test f1 score'] = [
    f1_score(test['clf_prediction'], test['label']),
    f1_score(test['svm_prediction'], test['label']),
    f1_score(test['adaboost_prediction'], test['label']),
    f1_score(test['knn_prediction'], test['label']),
    f1_score(test['nn_prediction'], test['label'])
]

In [77]:
print(test_report.to_latex(caption="Best Estimator On Testing Data"))

\begin{table}
\centering
\caption{Best Estimator On Testing Data}
\begin{tabular}{lrrrr}
\toprule
{} &  Test Accuracy &  test recall &  test precision &  test f1 score \\
\midrule
decision tree &       0.950710 &     0.954922 &        0.940321 &       0.947565 \\
SVM           &       0.962031 &     0.956865 &        0.963257 &       0.960050 \\
Ada Boost     &       0.956195 &     0.951811 &        0.955909 &       0.953855 \\
KNN           &       0.955773 &     0.967754 &        0.937871 &       0.952578 \\
NeuralNet     &       0.960800 &     0.955272 &        0.962292 &       0.958769 \\
\bottomrule
\end{tabular}
\end{table}



  print(test_report.to_latex(caption="Best Estimator On Testing Data"))
