In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
import tqdm
import random
import pandas as pd

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

In [5]:
from preprocess import preprocess

X_train, X_test, y_train = preprocess(df_train, df_test, use_custom_target_encoding=True, use_scaling=True, \
                                                     filter_features=False)

In [6]:
objects_num = 50000

X = X_train[:objects_num]
y = y_train[:objects_num]

### Hyperopt

In [7]:
from hyperopt.pyll import scope as ho_scope
from hyperopt import fmin, tpe, hp

### BernoulliNB

In [8]:
iter_num = 250

In [9]:
def f(space):
    model = BernoulliNB(alpha=space['alpha'], fit_prior=space['fit_prior'])
    
    rskf = StratifiedKFold(n_splits=3, random_state=1)
    rmse_scores = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    return np.mean(rmse_scores)


space = {
        'alpha': hp.loguniform('C', low=np.log(0.01), high=np.log(3)),
        'fit_prior':  hp.choice('fit_prior', [True, False]),
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=iter_num  # Perform 100 trials
)

print("Found minimum after %d trials:" %(iter_num))
print(best)

100%|██████████| 250/250 [00:51<00:00,  4.83it/s, best loss: 1.514992505200684]
Found minimum after 250 trials:
{'C': 1.5585468574616144, 'fit_prior': 1}


### Decision tree

In [10]:
def f(space):
    model = DecisionTreeClassifier(max_depth=space['max_depth'], max_features=space['max_features'],\
                                  criterion=space['criterion'], min_samples_split=space['min_samples_split'],\
                                  min_samples_leaf=space['min_samples_leaf'], min_weight_fraction_leaf = space['min_weight_fraction_leaf'])
    
    rskf = StratifiedKFold(n_splits=3, random_state=1)
    rmse_scores = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    return np.mean(rmse_scores)
    
space = {
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2']),
    'criterion': hp.choice('criterion', ["gini", "entropy"]),
    'min_samples_split': ho_scope.int(hp.quniform('min_samples_split', low=2, high=10, q=1)),
    'min_samples_leaf':  ho_scope.int(hp.quniform('min_samples_leaf', low=1, high=10, q=1)),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0, 0.5),
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=iter_num  # Perform 100 trials
)

print("Found minimum after %d trials:" %(iter_num))
print(best)

100%|██████████| 250/250 [00:44<00:00,  4.78it/s, best loss: 1.2444849111002736]
Found minimum after 250 trials:
{'criterion': 0, 'max_depth': 8, 'max_features': 0, 'min_samples_leaf': 6.0, 'min_samples_split': 9.0, 'min_weight_fraction_leaf': 0.0015932365030179585}


### Extra Tree Clasifier

In [11]:
def f(space):
    model = ExtraTreesClassifier(max_depth=space['max_depth'], max_features=space['max_features'],\
                                  criterion=space['criterion'], min_samples_split=space['min_samples_split'],\
                                  min_samples_leaf=space['min_samples_leaf'], min_weight_fraction_leaf = space['min_weight_fraction_leaf'])
    
    rskf = StratifiedKFold(n_splits=3, random_state=1)
    rmse_scores = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    return np.mean(rmse_scores)
    
space = {
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2']),
    'criterion': hp.choice('criterion', ["gini", "entropy"]),
    'min_samples_split': ho_scope.int(hp.quniform('min_samples_split', low=2, high=10, q=1)),
    'min_samples_leaf':  ho_scope.int(hp.quniform('min_samples_leaf', low=1, high=10, q=1)),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0, 0.5),
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=iter_num  # Perform 100 trials
)

print("Found minimum after %d trials:" %(iter_num))
print(best)

100%|██████████| 250/250 [01:18<00:00,  3.02it/s, best loss: 1.2959788757399178]
Found minimum after 250 trials:
{'criterion': 1, 'max_depth': 17, 'max_features': 1, 'min_samples_leaf': 7.0, 'min_samples_split': 8.0, 'min_weight_fraction_leaf': 0.0002173047975415648}


### Gaussian NB

In [12]:
def f(space):
    model = GaussianNB(var_smoothing=space['var_smoothing'])
    
    rskf = StratifiedKFold(n_splits=3, random_state=1)
    rmse_scores = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    return np.mean(rmse_scores)
    
space = {
    'var_smoothing': hp.loguniform('var_smoothing', low=np.log(1e-10), high=np.log(1e-7))
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=iter_num  # Perform 100 trials
)

print("Found minimum after %d trials:" %(iter_num))
print(best)

100%|██████████| 250/250 [00:39<00:00,  6.35it/s, best loss: 1.404756645801655]
Found minimum after 250 trials:
{'var_smoothing': 1.0002879974657864e-10}


### KNeighbors Classifier

In [13]:
objects_num = 10000

X = X_train[:objects_num]
y = y_train[:objects_num]

In [14]:
def f(space):
    model = KNeighborsClassifier(n_neighbors=space['n_neighbors'], weights=space['weights'],\
                                 algorithm=space['algorithm'], leaf_size=space['leaf_size'], p=space['p'])
    rskf = StratifiedKFold(n_splits=3, random_state=1)
    rmse_scores = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    return np.mean(rmse_scores)

space = {
        'n_neighbors': ho_scope.int(hp.quniform('n_neighbors', low=2, high=10, q=1)),
        'weights':  hp.choice('weights', ['uniform', 'distance']),
        'algorithm': hp.choice('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'leaf_size': ho_scope.int(hp.quniform('leaf_size', low=4, high=60, q=2)),
        'p': hp.choice('p', [1, 2])
}
    
best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=iter_num  # Perform 100 trials
)

print("Found minimum after %d trials:" %(iter_num))
print(best)

100%|██████████| 250/250 [18:43<00:00,  3.76s/it, best loss: 1.3102311357098289]
Found minimum after 250 trials:
{'algorithm': 2, 'leaf_size': 38.0, 'n_neighbors': 10.0, 'p': 0, 'weights': 1}


### RandomForest

In [15]:
objects_num = 50000

X = X_train[:objects_num]
y = y_train[:objects_num]

In [16]:
def f(space):
    model = RandomForestClassifier(max_depth=space['max_depth'], max_features=space['max_features'],\
                                  criterion=space['criterion'], min_samples_split=space['min_samples_split'],\
                                  min_samples_leaf=space['min_samples_leaf'], min_weight_fraction_leaf = space['min_weight_fraction_leaf'],\
                                  verbose=0)
    rskf = StratifiedKFold(n_splits=3, random_state=1)
    rmse_scores = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    return np.mean(rmse_scores)
    
space = {
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2']),
    'criterion': hp.choice('criterion', ["gini", "entropy"]),
    'min_samples_split': ho_scope.int(hp.quniform('min_samples_split', low=2, high=10, q=1)),
    'min_samples_leaf':  ho_scope.int(hp.quniform('min_samples_leaf', low=1, high=10, q=1)),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0, 0.5),
}

    
best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=iter_num  # Perform 100 trials
)

print("Found minimum after %d trials:" %(iter_num))
print(best)

100%|██████████| 250/250 [04:44<00:00,  1.77s/it, best loss: 1.2105835000743927]
Found minimum after 250 trials:
{'criterion': 1, 'max_depth': 17, 'max_features': 0, 'min_samples_leaf': 7.0, 'min_samples_split': 7.0, 'min_weight_fraction_leaf': 0.00036030358779944667}


### Passive Aggressive

In [17]:
objects_num = 50000

X = X_train[:objects_num]
y = y_train[:objects_num]

In [18]:
def f(space):
    model = PassiveAggressiveClassifier(C=space['C'], fit_intercept=space['fit_intercept'])
    
    rskf = StratifiedKFold(n_splits=3, random_state=1)
    rmse_scores = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    return np.mean(rmse_scores)
    
space = {
    'C': hp.loguniform('C', low=np.log(0.01), high=np.log(1)),
    'fit_intercept': hp.choice('fit_intercept', [True, False]),
}

    
best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=iter_num  # Perform 100 trials
)

print("Found minimum after %d trials:" %(iter_num))
print(best)

100%|██████████| 250/250 [06:08<00:00,  1.30s/it, best loss: 1.467211781761847] 
Found minimum after 250 trials:
{'C': 0.017342341509607916, 'fit_intercept': 1}


### SVC

In [19]:
objects_num = 5000

X = X_train[:objects_num]
y = y_train[:objects_num]

In [None]:
def f(space):
    model = SVC(C=space['C'], kernel=space['kernel'], degree=space['degree'], gamma=space['gamma'],\
               shrinking=space['shrinking'])
    
    rskf = StratifiedKFold(n_splits=3, random_state=1)
    rmse_scores = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    return np.mean(rmse_scores)
    
space = {
    'C': hp.loguniform('C', low=np.log(0.01), high=np.log(1)),
    'kernel': hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    'degree':  ho_scope.int(hp.quniform('degree', low=2, high=5, q=1)),
    'gamma':  hp.loguniform('gamma', low=np.log(0.001), high=np.log(100)),
    'shrinking': hp.choice('shrinking', [True, False])
}

    
best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=iter_num  # Perform 100 trials
)

print("Found minimum after %d trials:" %(iter_num))
print(best)

 12%|█▏        | 31/250 [02:45<17:19,  4.75s/it, best loss: 1.4201169340760789]