In [22]:
import os
import pandas as pd
import json
from time import time
import numpy as np

from model import num2cate_fit, num2cate_transform, generate_samples, generate_model_samples, ModelGene, findKeyAttrs, FindGroups, get_numAttrs, find_rules
from model.samples import DataGene
from model.data_encoder import DataEncoder
from joblib import dump, load

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier

import dill

import warnings
warnings.filterwarnings('ignore')

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


def search_model(clf,param_dist, dataset_name):
    
    
    
    if os.path.isfile('{}_mdlp.pkl'.format(dataset_name)):
        f=open('{}_mdlp.pkl'.format(dataset_name), 'rb')
        mdlp = dill.load(f)
    else:
        raise Exception('no mdlp exists, run init_samples first')
        
    dataset_path = '../server/{}.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)
    data = num2cate_transform(data, mdlp) 
    encoder = DataEncoder()
    encoder.fit(data)
    X, y = encoder.transform(data)
   

    # train model
    
    
    # run randomized search
    n_iter_search = 40
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=5)

    start = time()
    random_search.fit(X, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.cv_results_)
        
        

## try random forest

In [23]:
# random forest
clf = RandomForestClassifier( min_samples_leaf=1)
    
# specify parameters and distributions to sample from
param_dist = {"max_depth": sp_randint(4, 11),
          "n_estimators":[20, 50, 100, 200],
          "max_features": ['auto', 1, 3, 5],
          "min_samples_split": sp_randint(2, 11),
          "bootstrap": [True, False],
          "criterion": ["gini", "entropy"]}

search_model(clf, param_dist, 'academic')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  cats[column] = encoder.transform(cats[column])


RandomizedSearchCV took 13.60 seconds for 40 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.904 (std: 0.054)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 10, 'max_features': 5, 'min_samples_split': 9, 'n_estimators': 50}

Model with rank: 1
Mean validation score: 0.904 (std: 0.043)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 4, 'max_features': 3, 'min_samples_split': 2, 'n_estimators': 100}

Model with rank: 3
Mean validation score: 0.902 (std: 0.057)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 4, 'max_features': 5, 'min_samples_split': 7, 'n_estimators': 50}





## try lr

In [27]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
param_dist = {"solver" : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
          "penalty" : ['l2'], 
          "C": [0.1, 0.2, 0.5, 1]}

search_model(clf, param_dist, 'academic')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  cats[column] = encoder.transform(cats[column])


RandomizedSearchCV took 0.62 seconds for 40 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.904 (std: 0.034)
Parameters: {'solver': 'sag', 'penalty': 'l2', 'C': 0.5}

Model with rank: 1
Mean validation score: 0.904 (std: 0.034)
Parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 0.5}

Model with rank: 3
Mean validation score: 0.902 (std: 0.030)
Parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.1}

Model with rank: 3
Mean validation score: 0.902 (std: 0.037)
Parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.5}

Model with rank: 3
Mean validation score: 0.902 (std: 0.037)
Parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.5}



### try xgboost

In [30]:
from xgboost import XGBClassifier
clf = XGBClassifier(n_estimators=200, max_depth=5 )
param_dist = {"eta" : [0.1, 0.2, 0.5, 0.7 ,1 ], 
          "gamma" : [0, 0.1, 0.3, 0.6, 1, 4, 8, 16],
          "max_depth": sp_randint(2, 10),
          "tree_method": ["auto", "exact", "approx", "hist"]
             }

search_model(clf, param_dist, 'academic')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  cats[column] = encoder.transform(cats[column])


[19:39:49] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[19:39:49] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[19:39:50] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[19:39:50] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[19:39:50] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[19:39:52] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[19:39:52] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[19:39:52] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[19:39:52] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[19:39:52] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[19:39:54] Tree meth



In [34]:
from sklearn.svm import SVC
clf = SVC(kernel="rbf", gamma='scale')
param_dist = {
    "kernel":["linear", "poly", "rbf", "sigmoid"] ,
    "gamma": ["scale", "auto"]
}
search_model(clf, param_dist, 'academic')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  cats[column] = encoder.transform(cats[column])


RandomizedSearchCV took 0.15 seconds for 40 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.894 (std: 0.028)
Parameters: {'kernel': 'linear', 'gamma': 'scale'}

Model with rank: 1
Mean validation score: 0.894 (std: 0.028)
Parameters: {'kernel': 'linear', 'gamma': 'auto'}

Model with rank: 3
Mean validation score: 0.892 (std: 0.052)
Parameters: {'kernel': 'poly', 'gamma': 'scale'}

Model with rank: 3
Mean validation score: 0.892 (std: 0.052)
Parameters: {'kernel': 'poly', 'gamma': 'auto'}





In [35]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
param_dist = {
    "n_neighbors": sp_randint(2, 20),
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": sp_randint(2,40)
}
search_model(clf, param_dist, 'academic')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  cats[column] = encoder.transform(cats[column])


RandomizedSearchCV took 2.20 seconds for 40 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.890 (std: 0.063)
Parameters: {'algorithm': 'brute', 'leaf_size': 17, 'n_neighbors': 16, 'weights': 'distance'}

Model with rank: 1
Mean validation score: 0.890 (std: 0.063)
Parameters: {'algorithm': 'brute', 'leaf_size': 16, 'n_neighbors': 16, 'weights': 'distance'}

Model with rank: 3
Mean validation score: 0.887 (std: 0.074)
Parameters: {'algorithm': 'ball_tree', 'leaf_size': 29, 'n_neighbors': 12, 'weights': 'distance'}





In [38]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
param_dist = {
    "criterion": ["gini", "entropy"],
    "max_depth": sp_randint(2, 10),
    "min_samples_split": sp_randint(2,4)
}

search_model(clf, param_dist, 'academic')


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  cats[column] = encoder.transform(cats[column])


RandomizedSearchCV took 0.24 seconds for 40 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.898 (std: 0.050)
Parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 2}

Model with rank: 2
Mean validation score: 0.890 (std: 0.049)
Parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 3}

Model with rank: 2
Mean validation score: 0.890 (std: 0.064)
Parameters: {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 3}

Model with rank: 2
Mean validation score: 0.890 (std: 0.064)
Parameters: {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 2}

Model with rank: 2
Mean validation score: 0.890 (std: 0.055)
Parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 3}

Model with rank: 2
Mean validation score: 0.890 (std: 0.036)
Parameters: {'criterion': 'entropy', 'max_depth': 9, 'min_samples_split': 2}



