# Summary
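This notebook runs hyperparameter experiments (cross-validated grid search over several training-set sizes) for the following classifiers: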
- Decision Tree
- SVM
- KNN
- Ada-Boost

In [1]:
# Notebook-wide settings.
# PROJECT_PATH is the base directory that artifact paths are joined onto.
PROJECT_PATH = ".."
# Seed value intended for the stochastic parts of the experiments.
SEED = 123

In [2]:
import os
import pickle

from product_matcher.utils import get_config
from product_matcher.utils import loader


In [79]:
cfg = get_config()


def _load_artifact(config, artifact_key):
    """Load the artifact registered under ``config['artifacts'][artifact_key]``.

    The entry's ``name`` is joined onto ``PROJECT_PATH`` and handed to
    ``loader`` together with the entry's ``format``.

    Parameters
    ----------
    config : mapping returned by ``get_config``.
    artifact_key : str
        Key below ``config['artifacts']`` (e.g. ``'training_data'``).

    Returns
    -------
    Whatever ``loader`` returns for that path/format pair.
    """
    spec = config['artifacts'][artifact_key]
    return loader(os.path.join(PROJECT_PATH, spec['name']), spec['format'])


# Raw train/test frames (they carry the 'label' column used below).
train = _load_artifact(cfg, 'training_data')
test = _load_artifact(cfg, 'testing_data')

# Pre-computed training features and the matching label vector.
X_train = _load_artifact(cfg, 'training_features')
Y_train = train['label'].values

# Persisted TF-IDF vectorizer, used later to transform the test titles.
vectorizer = _load_artifact(cfg, 'tfidf_vectorizer')

In [4]:
# Test labels come straight from the test frame; test features are produced by
# running the loaded vectorizer (transform only) over the cleaned titles.
Y_test = test['label'].values
test_titles = test['TITLE_CLEANED']
X_test = vectorizer.transform(test_titles)

In [5]:
# Sanity check: row counts of the frames next to the feature/label shapes.
(
    len(train), len(test),
    X_train.shape, X_test.shape,
    Y_train.shape, Y_test.shape,
)

(85332, 28444, (85332, 22966), (28444, 22966), (85332,), (28444,))

# Decision Tree

- Hyper parameters
    - Pruning
        - Pre-pruning
            - Max_depth
        - Post-pruning
            - CCP_alpha
    - Training size
        - [100, 500, 1000, 5000, 8000, 10000, 30000, 50000, 80000]

In [6]:
from product_matcher.utils import *

In [7]:
# Hand-written candidate grids for the decision tree.
# NOTE(review): the grid-search cell below builds `param_grid` from the config
# instead, so these two constants look unused -- confirm before removing.
CRITERION = [
    'gini',
    'entropy',
    'log_loss',
]
MAX_DEPTHS = [
    10,
    50,
    100,
    150,
    200,
]

In [80]:
# Switch `cfg` to the decision-tree experiment and derive the search grid from it.
cfg = get_config(overrides=['experiments=problem1/decisiontree'])
_tree_model_cfg = cfg['experiments']['model']
param_grid = {
    hyperparameter: _tree_model_cfg[hyperparameter]
    for hyperparameter in ('max_depth', 'criterion')
}
# NOTE(review): the SVM/KNN/boosting cells read their sizes from
# cfg['experiments']['data']['train_size']; this cell uses a different key --
# confirm that the difference is intentional.
TRAINING_SIZE = cfg['training_sizes']['problem1']

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.metrics import accuracy_score, f1_score

# Cross-validated grid search over `param_grid`, repeated once per training-set
# size. Each run's `cv_results_` is stored under its training size.
cv_results = dict()
for training_size in TRAINING_SIZE:
    print(f"training size: {training_size}")
    # presumably returns a (features, labels) subset of the requested size --
    # verify against product_matcher.utils.get_train
    x, y = get_train(X_train, Y_train, training_size=training_size)
    # Seed the tree: scikit-learn permutes the features at every split, so an
    # unseeded tree can give different scores on identical data between runs.
    clf = tree.DecisionTreeClassifier(random_state=SEED)
    gscv = GridSearchCV(clf,
                        param_grid=param_grid,
                        scoring=['accuracy', 'f1', 'recall', 'precision'],
                        refit=False,  # only the CV scores are needed, no final model
                        cv=10,
                        return_train_score=True)
    gscv.fit(x, y)
    # Shallow-copy before tagging so the search object's own `cv_results_`
    # attribute is not modified as a side effect.
    _report = dict(gscv.cv_results_)
    _report['training_size'] = training_size
    cv_results[training_size] = _report

training size: 100
training size: 500
training size: 1000
training size: 5000
training size: 8000
training size: 10000
training size: 30000
training size: 50000
training size: 80000


In [70]:
# Persist the collected grid-search results. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises part-way through.
study_path = os.path.join(PROJECT_PATH, cfg['artifacts']['hyperparameter_study']['name'])
with open(study_path, "wb") as study_file:
    pickle.dump(cv_results, study_file)

# SVM
- Support Vector Machines.
    - kernel functions

In [94]:
import warnings
warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"

In [95]:
from sklearn.svm import SVC
from sklearn.kernel_approximation import Nystroem

# Switch `cfg` to the SVM experiment; the grid only varies the kernel.
cfg = get_config(overrides=["experiments=problem1/svm"])
_svm_experiment = cfg['experiments']
param_grid = dict(kernel=_svm_experiment['model']['kernel'])
TRAINING_SIZE = _svm_experiment['data']['train_size']

# One 10-fold grid search per training-set size; results keyed by that size.
cv_results = {}
for training_size in TRAINING_SIZE:
    print(f"training size: {training_size}")
    x, y = get_train(X_train, Y_train, training_size=training_size)
    gscv = GridSearchCV(
        estimator=SVC(),
        param_grid=param_grid,
        scoring=['accuracy', 'f1', 'recall', 'precision'],
        refit=False,
        cv=10,
        return_train_score=True,
    )
    gscv.fit(x, y)
    # Tag the result dict with the size it belongs to, then file it.
    size_report = gscv.cv_results_
    size_report.update(training_size=training_size)
    cv_results[training_size] = size_report

training size: 100
training size: 500
training size: 1000
training size: 5000
training size: 8000
training size: 10000
training size: 30000


KeyboardInterrupt: 

In [96]:
# Display the configured output path of the hyperparameter-study artifact
# (with the SVM config active this resolves to the SVM study pickle).
cfg['artifacts']['hyperparameter_study']['name']

'artifacts/problem1/svm_hyperparameter_study.pickle'

In [102]:
# Persist the collected SVM grid-search results. The context manager guarantees
# the file handle is flushed and closed, even if pickling raises.
# NOTE(review): the recorded run of the SVM loop ended in a KeyboardInterrupt,
# so `cv_results` may only hold the training sizes that finished -- confirm
# before relying on this pickle for the larger sizes.
study_path = os.path.join(PROJECT_PATH, cfg['artifacts']['hyperparameter_study']['name'])
with open(study_path, "wb") as study_file:
    pickle.dump(cv_results, study_file)

**TODO**: Train on the larger sample sizes using kernel approximation (the Nystroem method) combined with an `SGDClassifier`.

# NeuralNet

# KNN

- k-Nearest Neighbors. 

    - Use different values of k.

In [103]:
from sklearn.neighbors import KNeighborsClassifier

# Switch `cfg` to the KNN experiment; the grid varies k and the weighting scheme.
cfg = get_config(overrides=["experiments=problem1/knn"])
_knn_experiment = cfg['experiments']
_knn_model_cfg = _knn_experiment['model']
param_grid = dict(
    n_neighbors=_knn_model_cfg['k'],
    weights=_knn_model_cfg['weights'],
)
TRAINING_SIZE = _knn_experiment['data']['train_size']

# One 10-fold grid search per training-set size; results keyed by that size.
cv_results = {}
for training_size in TRAINING_SIZE:
    print(f"training size: {training_size}")
    x, y = get_train(X_train, Y_train, training_size=training_size)
    gscv = GridSearchCV(
        estimator=KNeighborsClassifier(n_jobs=2),
        param_grid=param_grid,
        scoring=['accuracy', 'f1', 'recall', 'precision'],
        refit=False,
        cv=10,
        return_train_score=True,
    )
    gscv.fit(x, y)
    # Tag the result dict with the size it belongs to, then file it.
    size_report = gscv.cv_results_
    size_report.update(training_size=training_size)
    cv_results[training_size] = size_report

training size: 100
training size: 500
training size: 1000
training size: 5000
training size: 8000
training size: 10000
training size: 30000
training size: 50000
training size: 80000


In [108]:
# Load the KNN experiment config into `cfg`.
# NOTE(review): this repeats the get_config call already made in the KNN
# grid-search cell; presumably a manual re-run to refresh `cfg` -- confirm
# whether this cell is still needed.
cfg = get_config(overrides=["experiments=problem1/knn"])

In [109]:
# Display the configured output path of the hyperparameter-study artifact
# (with the KNN config active this resolves to the KNN study pickle).
cfg['artifacts']['hyperparameter_study']['name']

'artifacts/problem1/knn_hyperparameter_study.pickle'

In [110]:
# Persist the collected KNN grid-search results. The context manager guarantees
# the file handle is flushed and closed, even if pickling raises.
study_path = os.path.join(PROJECT_PATH, cfg['artifacts']['hyperparameter_study']['name'])
with open(study_path, "wb") as study_file:
    pickle.dump(cv_results, study_file)

# Boosting

- Boosting.
    - Different base estimators
        - DecisionTree
        - SVC
    - Pruning
    - Learning rate

In [129]:
from sklearn.ensemble import AdaBoostClassifier
from hydra.utils import instantiate

# Switch `cfg` to the boosting experiment. The grid varies the base estimator
# (objects built by hydra from the config) and the number of boosting rounds.
cfg = get_config(overrides=["experiments=problem1/boosting"])
param_grid = {
    'estimator': instantiate(cfg['experiments']['estimators']),
    'n_estimators': cfg['experiments']['n_estimators']
             }
TRAINING_SIZE = cfg['experiments']['data']['train_size']

# One 10-fold grid search per training-set size; results keyed by that size.
cv_results = dict()
for training_size in TRAINING_SIZE:
    print(f"training size: {training_size}")
    # presumably returns a (features, labels) subset of the requested size --
    # verify against product_matcher.utils.get_train
    x, y = get_train(X_train, Y_train, training_size=training_size)
    # Seed the ensemble: AdaBoost forwards its random_state to every base
    # estimator that exposes one, so an unseeded booster gives scores that can
    # change from run to run.
    adabooster = AdaBoostClassifier(random_state=SEED)
    gscv = GridSearchCV(adabooster,
                        param_grid=param_grid,
                        scoring=['accuracy', 'f1', 'recall', 'precision'],
                        refit=False,  # only the CV scores are needed, no final model
                        cv=10,
                        return_train_score=True)
    gscv.fit(x, y)
    # Shallow-copy before tagging so the search object's own `cv_results_`
    # attribute is not modified as a side effect.
    _report = dict(gscv.cv_results_)
    _report['training_size'] = training_size
    cv_results[training_size] = _report

training size: 100
training size: 500
training size: 1000
training size: 5000
training size: 8000
training size: 10000
training size: 30000
training size: 50000
training size: 80000


In [130]:
# Display the configured output path of the hyperparameter-study artifact
# (with the boosting config active this resolves to the AdaBoost study pickle).
cfg['artifacts']['hyperparameter_study']['name']

'artifacts/problem1/adaboosting_hyperparameter_study.pickle'

In [131]:
# Persist the collected AdaBoost grid-search results. The context manager
# guarantees the file handle is flushed and closed, even if pickling raises.
study_path = os.path.join(PROJECT_PATH, cfg['artifacts']['hyperparameter_study']['name'])
with open(study_path, "wb") as study_file:
    pickle.dump(cv_results, study_file)