In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./kaggle/input'):
#for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./kaggle/input/Kannada-MNIST/train.csv
./kaggle/input/Kannada-MNIST/sample_submission.csv
./kaggle/input/Kannada-MNIST/test.csv
./kaggle/input/Kannada-MNIST/Dig-MNIST.csv


# Loading

In [2]:
train_data = pd.read_csv('./kaggle/input/Kannada-MNIST/train.csv')
valid_data = pd.read_csv('./kaggle/input/Kannada-MNIST/Dig-MNIST.csv')

In [3]:
import random
random.seed(32)
np.random.seed(42)
train_data = train_data.sample(frac=1,random_state=52).reset_index(drop=True)
valid_data = valid_data.sample(frac=1,random_state=62).reset_index(drop=True)
X_train, y_train = (train_data.drop(['label'], axis=1), train_data.label)
X_valid, y_valid = (valid_data.drop(['label'], axis=1), valid_data.label)
#X_train = X_train/255. # We don't have to normalize our data
#X_valid = X_valid/255. # because https://datascience.stackexchange.com/questions/60950/is-it-necessary-to-normalize-data-for-xgboost 

In [4]:
len(X_train), len(X_valid)

(60000, 10240)

In [5]:
import time

In [6]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [7]:
from sklearn.metrics import accuracy_score
import random

# Hyperparameter tuning

I used https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663

My idea is to use a multi-phase random training.

1. Train a lot of models on a small training set (random set of the original train.csv) ---> Evaluate them on the validation set (Dig-MNIST.csv) ---> save the best ones
2. These define a narrower range of parameters. We'll train a lot of models (but less then in the 1st step) on the medium-size dataset. ---> Evaluate them on the validation set (Dig-MNIST.csv) ---> save the best ones
3. These define a narrower range of parameters. We'll train some models on this parameter-set randomly again and choose the best model for the competition.


We'll tune the following parameters:

* `max_depth`
* `colsample_bytree`
* `n_estimators`
* `learning_rate`
* `subsample`
* `reg_lambda`

In [8]:
def train(train_set_size,max_depth,colsample_bytree,n_estimators,learning_rate,subsample,reg_lambda):
    """
    Train an XGBoost classifier with these parameters and returns the trained model
    """
    #start = time.time()
    if train_set_size<1.0:
        train_data_sampled = train_data.sample(frac=train_set_size).reset_index(drop=True)
    else:
        train_data_sampled = train_data.copy()
    #print(len(train_data_sampled))
    X_train_sampled, y_train_sampled = (train_data_sampled.drop(['label'], axis=1), train_data_sampled.label)
    
    clf = XGBClassifier(use_label_encoder = False,
                        eval_metric = 'mlogloss',
                        num_class = 10,
                        max_depth = max_depth, 
                        colsample_bytree = colsample_bytree,
                        n_estimators = n_estimators,
                        learning_rate = learning_rate,
                        subsample = subsample,
                        reg_lambda = reg_lambda,
                       )
    clf.fit(X_train_sampled,y_train_sampled)
    #end = time.time()
    #print(f"T: {end-start:.2f}s")
    return clf
    

In [9]:
start = time.time()
model = train(train_set_size = 1.0,
              max_depth=5,
              colsample_bytree = 1.0,
              n_estimators = 10,
              learning_rate = 0.2,
              subsample = 1.0,
              reg_lambda = 100)
end = time.time()
print(f"Time: {end-start:.2f}s")             

Time: 32.22s


In [10]:
start = time.time()
y_pred = model.predict(X_valid)
end = time.time()
print(f"Time: {end-start:.2f}s")

print(accuracy_score(y_valid, y_pred))

Time: 0.12s
0.57998046875


## 1st round

In [11]:
MAX_PARAM_NUM_1 = 20
MAX_PARAM_NUM_1_BEST = 5
parameters_1 = pd.DataFrame()
np.random.seed(112)
for i in range(MAX_PARAM_NUM_1):
    max_depth = np.random.randint(3,21)
    n_estimators = np.random.randint(20,200)
    learning_rate = np.random.rand()*(0.4-0.01)+0.01
    colsample_bytree = np.random.rand()*(1.0-0.1)+0.1
    subsample = np.random.rand()*(1.0-0.1)+0.1
    reg_lambda = np.random.rand()*100.0
    parameters_1 = parameters_1.append({"max_depth":max_depth,
                                        "n_estimators":n_estimators,
                                        "learning_rate":learning_rate,
                                        "colsample_bytree":colsample_bytree,
                                        "subsample":subsample,
                                        "reg_lambda":reg_lambda,
                                        "valid_acc":i},ignore_index=True)
parameters_1["max_depth"] = parameters_1["max_depth"].astype(int)
parameters_1["n_estimators"] = parameters_1["n_estimators"].astype(int)

In [12]:
valid_accs = []
start = time.time()
for i in range(MAX_PARAM_NUM_1):
    params = parameters_1.iloc[i]
    max_depth = int(params["max_depth"])
    n_estimators = int(params["n_estimators"])
    learning_rate = params["learning_rate"]
    colsample_bytree = params["colsample_bytree"]
    subsample = params["subsample"]
    reg_lambda = params["reg_lambda"]
    #print(f"{i+1}\n----\n")
    #print(f"max_depth {max_depth}")
    #print(f"n_estimators {n_estimators}")
    #print(f"learning_rate {learning_rate}")
    #print(f"colsample_bytree {colsample_bytree}")
    #print(f"subsample {subsample}")
    #print(f"reg_lambda {reg_lambda}")
    model = train(train_set_size = 0.05,
                  max_depth = max_depth,
                  colsample_bytree = colsample_bytree,
                  n_estimators = n_estimators,
                  learning_rate = learning_rate,
                  subsample = subsample,
                  reg_lambda = reg_lambda)
    y_pred = model.predict(X_valid)
    valid_accs.append(accuracy_score(y_valid, y_pred))
    end = time.time()
    print(f"[{i+1}/{MAX_PARAM_NUM_1}] VA: {accuracy_score(y_valid, y_pred):.4f} | ET: {end-start:.2f}s")
parameters_1["valid_acc"]=valid_accs

[1/20] VA: 0.5637 | ET: 19.24s
[2/20] VA: 0.5792 | ET: 32.63s
[3/20] VA: 0.5900 | ET: 39.32s
[4/20] VA: 0.6070 | ET: 50.20s
[5/20] VA: 0.5699 | ET: 79.64s
[6/20] VA: 0.5752 | ET: 93.11s
[7/20] VA: 0.5704 | ET: 106.92s
[8/20] VA: 0.4974 | ET: 108.68s
[9/20] VA: 0.5401 | ET: 112.82s
[10/20] VA: 0.5297 | ET: 120.60s
[11/20] VA: 0.5854 | ET: 124.95s
[12/20] VA: 0.5665 | ET: 129.77s
[13/20] VA: 0.5456 | ET: 131.85s
[14/20] VA: 0.5807 | ET: 141.84s
[15/20] VA: 0.5665 | ET: 145.40s
[16/20] VA: 0.5634 | ET: 153.33s
[17/20] VA: 0.5073 | ET: 157.34s
[18/20] VA: 0.5375 | ET: 162.40s
[19/20] VA: 0.5732 | ET: 188.17s
[20/20] VA: 0.5854 | ET: 197.34s


In [13]:
parameters_1

Unnamed: 0,max_depth,n_estimators,learning_rate,colsample_bytree,subsample,reg_lambda,valid_acc
0,7,191,0.259719,0.955015,0.168109,77.692966,0.563672
1,12,190,0.377405,0.525297,0.265613,50.927594,0.579199
2,19,71,0.310659,0.208061,0.90689,62.665927,0.590039
3,20,94,0.177366,0.454016,0.939993,7.362099,0.607031
4,19,116,0.10814,0.879084,0.589585,63.126719,0.569922
5,5,149,0.337292,0.738425,0.269561,98.566856,0.575195
6,16,59,0.20623,0.787177,0.715359,72.195738,0.57041
7,12,26,0.089378,0.535771,0.135331,56.502985,0.497363
8,16,127,0.041406,0.155487,0.195553,51.383707,0.540137
9,10,59,0.054733,0.546554,0.439515,97.906001,0.529687


In [14]:
min_parameters_1 = parameters_1.sort_values("valid_acc",ascending=False).head(MAX_PARAM_NUM_1_BEST).min()
max_parameters_1 = parameters_1.sort_values("valid_acc",ascending=False).head(MAX_PARAM_NUM_1_BEST).max()
parameters_1_domain = parameters_1.sort_values("valid_acc",ascending=False).head(MAX_PARAM_NUM_1_BEST)


In [15]:
min_parameters_1

max_depth            7.000000
n_estimators        71.000000
learning_rate        0.177366
colsample_bytree     0.110609
subsample            0.242747
reg_lambda           7.362099
valid_acc            0.580664
dtype: float64

In [16]:
max_parameters_1

max_depth            20.000000
n_estimators        179.000000
learning_rate         0.310659
colsample_bytree      0.587410
subsample             0.939993
reg_lambda           62.665927
valid_acc             0.607031
dtype: float64

In [17]:
parameters_1_domain

Unnamed: 0,max_depth,n_estimators,learning_rate,colsample_bytree,subsample,reg_lambda,valid_acc
3,20,94,0.177366,0.454016,0.939993,7.362099,0.607031
2,19,71,0.310659,0.208061,0.90689,62.665927,0.590039
10,7,120,0.282394,0.110609,0.906622,22.739025,0.585449
19,13,179,0.289427,0.173926,0.75122,44.074518,0.585449
13,8,128,0.309773,0.58741,0.242747,46.389986,0.580664


In [32]:
np.random.choice(parameters_1_domain["max_depth"].values)

8

3     20
2     19
10     7
19    13
13     8
Name: max_depth, dtype: int64

## 2nd round

In [35]:
MAX_PARAM_NUM_2 = 20
MAX_PARAM_NUM_2_BEST = 5
parameters_2 = pd.DataFrame()
for i in range(MAX_PARAM_NUM_2):
    
    
    max_depth = np.random.choice(parameters_1_domain["max_depth"].values)
    n_estimators = np.random.choice(parameters_1_domain["n_estimators"].values)
    learning_rate = np.random.choice(parameters_1_domain["learning_rate"].values)
    colsample_bytree = np.random.choice(parameters_1_domain["colsample_bytree"].values)
    subsample = np.random.choice(parameters_1_domain["subsample"].values)
    reg_lambda = np.random.choice(parameters_1_domain["reg_lambda"].values)
    
    parameters_2 = parameters_2.append({"max_depth":max_depth,
                                        "n_estimators":n_estimators,
                                        "learning_rate":learning_rate,
                                        "colsample_bytree":colsample_bytree,
                                        "subsample":subsample,
                                        "reg_lambda":reg_lambda,
                                        "valid_acc":i},ignore_index=True)
parameters_2["max_depth"] = parameters_2["max_depth"].astype(int)
parameters_2["n_estimators"] = parameters_2["n_estimators"].astype(int)

In [36]:
parameters_2

Unnamed: 0,max_depth,n_estimators,learning_rate,colsample_bytree,subsample,reg_lambda,valid_acc
0,13,179,0.289427,0.454016,0.242747,62.665927,0.0
1,8,128,0.282394,0.454016,0.90689,46.389986,1.0
2,7,179,0.310659,0.454016,0.75122,46.389986,2.0
3,20,120,0.282394,0.58741,0.90689,22.739025,3.0
4,20,94,0.310659,0.454016,0.90689,46.389986,4.0
5,19,94,0.309773,0.208061,0.242747,7.362099,5.0
6,7,120,0.309773,0.454016,0.939993,46.389986,6.0
7,8,94,0.310659,0.58741,0.75122,44.074518,7.0
8,7,71,0.309773,0.173926,0.242747,62.665927,8.0
9,8,179,0.309773,0.58741,0.906622,62.665927,9.0


In [37]:
valid_accs = []
start = time.time()
for i in range(MAX_PARAM_NUM_2):
    params = parameters_2.iloc[i]
    max_depth = int(params["max_depth"])
    n_estimators = int(params["n_estimators"])
    learning_rate = params["learning_rate"]
    colsample_bytree = params["colsample_bytree"]
    subsample = params["subsample"]
    reg_lambda = params["reg_lambda"]
    #print(f"{i+1}\n----\n")
    #print(f"max_depth {max_depth}")
    #print(f"n_estimators {n_estimators}")
    #print(f"learning_rate {learning_rate}")
    #print(f"colsample_bytree {colsample_bytree}")
    #print(f"subsample {subsample}")
    #print(f"reg_lambda {reg_lambda}")
    model = train(train_set_size = 0.15,
                  max_depth = max_depth,
                  colsample_bytree = colsample_bytree,
                  n_estimators = n_estimators,
                  learning_rate = learning_rate,
                  subsample = subsample,
                  reg_lambda = reg_lambda)
    y_pred = model.predict(X_valid)
    valid_accs.append(accuracy_score(y_valid, y_pred))
    end = time.time()
    print(f"[{i+1}/{MAX_PARAM_NUM_2}] VA: {accuracy_score(y_valid, y_pred):.4f} | ET: {end-start:.2f}s")
parameters_2["valid_acc"]=valid_accs

[1/20] VA: 0.6086 | ET: 45.43s
[2/20] VA: 0.6288 | ET: 95.39s
[3/20] VA: 0.6208 | ET: 153.17s
[4/20] VA: 0.6259 | ET: 214.70s
[5/20] VA: 0.6050 | ET: 254.88s
[6/20] VA: 0.5952 | ET: 265.61s
[7/20] VA: 0.6243 | ET: 301.41s
[8/20] VA: 0.6174 | ET: 341.37s
[9/20] VA: 0.5769 | ET: 348.53s
[10/20] VA: 0.6236 | ET: 420.42s
[11/20] VA: 0.6138 | ET: 452.58s
[12/20] VA: 0.6098 | ET: 470.25s
[13/20] VA: 0.6126 | ET: 487.35s
[14/20] VA: 0.5985 | ET: 498.87s
[15/20] VA: 0.5698 | ET: 505.42s
[16/20] VA: 0.6162 | ET: 550.66s
[17/20] VA: 0.6175 | ET: 586.95s
[18/20] VA: 0.6148 | ET: 639.78s
[19/20] VA: 0.6271 | ET: 681.88s
[20/20] VA: 0.6119 | ET: 709.75s


In [38]:
min_parameters_2 = parameters_2.sort_values("valid_acc",ascending=False).head(MAX_PARAM_NUM_2_BEST).min()
max_parameters_2 = parameters_2.sort_values("valid_acc",ascending=False).head(MAX_PARAM_NUM_2_BEST).max()

In [39]:
min_parameters_2

max_depth             7.000000
n_estimators        120.000000
learning_rate         0.282394
colsample_bytree      0.454016
subsample             0.751220
reg_lambda            7.362099
valid_acc             0.623633
dtype: float64

In [40]:
max_parameters_2

max_depth            20.000000
n_estimators        179.000000
learning_rate         0.309773
colsample_bytree      0.587410
subsample             0.939993
reg_lambda           62.665927
valid_acc             0.628809
dtype: float64

In [41]:
parameters_2_domain = parameters_2.sort_values("valid_acc",ascending=False).head(MAX_PARAM_NUM_2_BEST)


In [43]:
parameters_2_domain

Unnamed: 0,max_depth,n_estimators,learning_rate,colsample_bytree,subsample,reg_lambda,valid_acc
1,8,128,0.282394,0.454016,0.90689,46.389986,0.628809
18,7,179,0.309773,0.454016,0.75122,7.362099,0.627051
3,20,120,0.282394,0.58741,0.90689,22.739025,0.625879
6,7,120,0.309773,0.454016,0.939993,46.389986,0.624316
9,8,179,0.309773,0.58741,0.906622,62.665927,0.623633


## 3rd round

In [42]:
MAX_PARAM_NUM_3 = 10
MAX_PARAM_NUM_3_BEST = 10
parameters_3 = pd.DataFrame()
for i in range(MAX_PARAM_NUM_3):
    max_depth = np.random.choice(parameters_2_domain["max_depth"].values)
    n_estimators = np.random.choice(parameters_2_domain["n_estimators"].values)
    learning_rate = np.random.choice(parameters_2_domain["learning_rate"].values)
    colsample_bytree = np.random.choice(parameters_2_domain["colsample_bytree"].values)
    subsample = np.random.choice(parameters_2_domain["subsample"].values)
    reg_lambda = np.random.choice(parameters_2_domain["reg_lambda"].values)
    
    parameters_3 = parameters_3.append({"max_depth":max_depth,
                                        "n_estimators":n_estimators,
                                        "learning_rate":learning_rate,
                                        "colsample_bytree":colsample_bytree,
                                        "subsample":subsample,
                                        "reg_lambda":reg_lambda,
                                        "valid_acc":i},ignore_index=True)
parameters_3["max_depth"] = parameters_3["max_depth"].astype(int)
parameters_3["n_estimators"] = parameters_3["n_estimators"].astype(int)

In [None]:
parameters_3

In [None]:
valid_accs = []
start = time.time()
models = []
for i in range(MAX_PARAM_NUM_3):
    params = parameters_3.iloc[i]
    max_depth = int(params["max_depth"])
    n_estimators = int(params["n_estimators"])
    learning_rate = params["learning_rate"]
    colsample_bytree = params["colsample_bytree"]
    subsample = params["subsample"]
    reg_lambda = params["reg_lambda"]
    #print(f"{i+1}\n----\n")
    #print(f"max_depth {max_depth}")
    #print(f"n_estimators {n_estimators}")
    #print(f"learning_rate {learning_rate}")
    #print(f"colsample_bytree {colsample_bytree}")
    #print(f"subsample {subsample}")
    #print(f"reg_lambda {reg_lambda}")
    model = train(train_set_size = 1.0,
                  max_depth = max_depth,
                  colsample_bytree = colsample_bytree,
                  n_estimators = n_estimators,
                  learning_rate = learning_rate,
                  subsample = subsample,
                  reg_lambda = reg_lambda)
    models.append(model)
    y_pred = model.predict(X_valid)
    valid_accs.append(accuracy_score(y_valid, y_pred))
    end = time.time()
    print(f"[{i+1}/{MAX_PARAM_NUM_3}] VA: {accuracy_score(y_valid, y_pred):.4f} | ET: {end-start:.2f}s")
parameters_3["valid_acc"]=valid_accs

In [None]:
parameters_3.sort_values("valid_acc",ascending=False)

In [None]:
parameters_3

In [None]:
max_ind = parameters_3["valid_acc"].argmax()

In [None]:
clf = models[max_ind]

In [None]:
y_pred = clf.predict(X_valid)

In [None]:
accuracy_score(y_valid, y_pred)

# Submission

In [None]:
test_data = pd.read_csv('./kaggle/input/Kannada-MNIST/test.csv')
#test_data = pd.read_csv('/kaggle/input/Kannada-MNIST/test.csv')

In [None]:
ids, test_set = test_data.id, test_data.drop(['id'], axis=1)

In [None]:
final_preds = clf.predict(test_set)

In [None]:
pd.Series(final_preds, index=ids, name='label').to_csv('./kaggle/working/submission.csv')
#pd.Series(final_preds, index=ids, name='label').to_csv('/kaggle/working/submission.csv')