In [1]:
import os
import glob
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import KFold, GroupKFold, cross_val_score, train_test_split
import optuna

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
numFolds = 5
wiFiIDFeatureDir = "referencePublicNotebooks/waypt_WiFiID1000Feat/"
wiFiDtFeatureDir = "referencePublicNotebooks/waypt_WiFiDt1000Feat/"

In [4]:
# the metric used in this competition
def comp_metric(xhat, yhat, fhat, x, y, f):
    intermediate = np.sqrt(np.power(xhat - x,2) + np.power(yhat-y,2)) + 15 * np.abs(fhat-f)
    return intermediate.sum()/xhat.shape[0]

# get our train and test files
wiFiIDFeatureTrainFiles = sorted(glob.glob(os.path.join(wiFiIDFeatureDir, 'train/*_train.csv')))
wiFiDtFeatureTrainFiles = sorted(glob.glob(os.path.join(wiFiDtFeatureDir, 'train/*_train.csv')))

In [5]:
len(wiFiIDFeatureTrainFiles), len(wiFiDtFeatureTrainFiles)

(24, 24)

In [6]:
e = 0
wiFiIDdf = pd.read_csv(wiFiIDFeatureTrainFiles[e])
wiFiDtdf = pd.read_csv(wiFiDtFeatureTrainFiles[e])

In [7]:
#wiFiIDdf.head(3)
#wiFiDtdf.head(3)

In [8]:
wiFiIDFeature = wiFiIDdf.iloc[:,:-4].to_numpy()
wiFiDtFeature = wiFiDtdf.to_numpy()

# normalize to 1
wiFiDtFeature = wiFiDtFeature.astype(float) / 1000.0
wiFiIDFeature = wiFiIDFeature.astype(float) / 999.0
x_train = np.column_stack((wiFiIDFeature,wiFiDtFeature))
print(x_train.shape)

y_trainy = wiFiIDdf.iloc[:,-3].values
y_trainx = wiFiIDdf.iloc[:,-4].values
y_trainf = wiFiIDdf.iloc[:,-2].values
print(wiFiDtFeature.shape, wiFiIDFeature.shape, y_trainx.shape, y_trainy.shape, y_trainf.shape)

(1974, 1882)
(1974, 941) (1974, 941) (1974,) (1974,) (1974,)


In [9]:
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).

def objective(trial):        
    train_x, valid_x, train_y, valid_y = train_test_split(x_train, y_trainy, test_size=0.25)
    dtrain = lgb.Dataset(train_x, label=train_y)
    param = {
        "objective": "regression",
        "metric": "l2",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(valid_x)
    accuracy = sklearn.metrics.mean_squared_error(valid_y, preds)
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="minimize")
    
    ## increase n_trails to test more params
    study.optimize(objective, n_trials=100)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))


[32m[I 2021-03-28 15:10:00,945][0m A new study created in memory with name: no-name-36f3e848-bc5c-4a38-bf17-7fef200dd2c1[0m
[32m[I 2021-03-28 15:10:01,963][0m Trial 0 finished with value: 126.08038258224093 and parameters: {'lambda_l1': 5.942824626224296e-05, 'lambda_l2': 7.827627651264878e-05, 'n_estimators': 113, 'num_leaves': 38, 'feature_fraction': 0.6564770009785859, 'bagging_fraction': 0.43952028769020546, 'bagging_freq': 2, 'min_child_samples': 92}. Best is trial 0 with value: 126.08038258224093.[0m
[32m[I 2021-03-28 15:10:03,499][0m Trial 1 finished with value: 96.90642241766948 and parameters: {'lambda_l1': 3.6476037501311903e-07, 'lambda_l2': 2.3641936792361084e-08, 'n_estimators': 191, 'num_leaves': 167, 'feature_fraction': 0.5496740687344296, 'bagging_fraction': 0.4372940547247904, 'bagging_freq': 1, 'min_child_samples': 87}. Best is trial 1 with value: 96.90642241766948.[0m
[32m[I 2021-03-28 15:10:05,178][0m Trial 2 finished with value: 58.830684380230636 and pa

[32m[I 2021-03-28 15:10:59,779][0m Trial 13 finished with value: 40.22343677800098 and parameters: {'lambda_l1': 0.10406236562537848, 'lambda_l2': 0.15306812352881335, 'n_estimators': 139, 'num_leaves': 59, 'feature_fraction': 0.8986822417509279, 'bagging_fraction': 0.889994526173345, 'bagging_freq': 5, 'min_child_samples': 20}. Best is trial 3 with value: 34.676505385696004.[0m
[32m[I 2021-03-28 15:11:08,063][0m Trial 14 finished with value: 42.395807821452465 and parameters: {'lambda_l1': 0.009140861020077275, 'lambda_l2': 0.07905621213320396, 'n_estimators': 142, 'num_leaves': 39, 'feature_fraction': 0.9939308432950769, 'bagging_fraction': 0.9175147993358301, 'bagging_freq': 4, 'min_child_samples': 25}. Best is trial 3 with value: 34.676505385696004.[0m
[32m[I 2021-03-28 15:11:11,027][0m Trial 15 finished with value: 46.5943441827927 and parameters: {'lambda_l1': 9.484363562772606e-06, 'lambda_l2': 0.24271305996293363, 'n_estimators': 147, 'num_leaves': 11, 'feature_fraction

[32m[I 2021-03-28 15:12:13,289][0m Trial 26 finished with value: 43.39543209238769 and parameters: {'lambda_l1': 1.3596614179411656e-06, 'lambda_l2': 0.0006575447423491814, 'n_estimators': 126, 'num_leaves': 225, 'feature_fraction': 0.40802662768413694, 'bagging_fraction': 0.5358931659149582, 'bagging_freq': 7, 'min_child_samples': 10}. Best is trial 23 with value: 33.034451582100836.[0m
[32m[I 2021-03-28 15:12:28,719][0m Trial 27 finished with value: 48.20553083137827 and parameters: {'lambda_l1': 4.4709873896880765e-05, 'lambda_l2': 1.936360644367179e-05, 'n_estimators': 157, 'num_leaves': 192, 'feature_fraction': 0.6165835664504105, 'bagging_fraction': 0.6465525479918109, 'bagging_freq': 6, 'min_child_samples': 5}. Best is trial 23 with value: 33.034451582100836.[0m
[32m[I 2021-03-28 15:12:31,944][0m Trial 28 finished with value: 52.51392492188378 and parameters: {'lambda_l1': 1.4419347628773716e-08, 'lambda_l2': 0.0002018378205375628, 'n_estimators': 179, 'num_leaves': 105,

[32m[I 2021-03-28 15:13:33,621][0m Trial 39 finished with value: 125.22947395786073 and parameters: {'lambda_l1': 1.4422839025922627, 'lambda_l2': 1.1183594990915942e-06, 'n_estimators': 132, 'num_leaves': 161, 'feature_fraction': 0.7716364485539783, 'bagging_fraction': 0.46072940173811644, 'bagging_freq': 3, 'min_child_samples': 99}. Best is trial 23 with value: 33.034451582100836.[0m
[32m[I 2021-03-28 15:13:38,679][0m Trial 40 finished with value: 45.63035429561669 and parameters: {'lambda_l1': 1.9773358611003233e-05, 'lambda_l2': 1.1115400901901559e-08, 'n_estimators': 171, 'num_leaves': 235, 'feature_fraction': 0.6540002481517089, 'bagging_fraction': 0.8089611221163613, 'bagging_freq': 6, 'min_child_samples': 28}. Best is trial 23 with value: 33.034451582100836.[0m
[32m[I 2021-03-28 15:13:49,358][0m Trial 41 finished with value: 43.73942366448799 and parameters: {'lambda_l1': 0.0017827903895741026, 'lambda_l2': 1.0715529422427971e-06, 'n_estimators': 200, 'num_leaves': 134,

[32m[I 2021-03-28 15:15:32,112][0m Trial 52 finished with value: 53.79403087419138 and parameters: {'lambda_l1': 0.2977341539820253, 'lambda_l2': 1.650303994833352e-08, 'n_estimators': 171, 'num_leaves': 158, 'feature_fraction': 0.6451780803916141, 'bagging_fraction': 0.77928333401577, 'bagging_freq': 4, 'min_child_samples': 14}. Best is trial 23 with value: 33.034451582100836.[0m
[32m[I 2021-03-28 15:15:36,632][0m Trial 53 finished with value: 36.273307626617836 and parameters: {'lambda_l1': 9.225001704152328, 'lambda_l2': 0.00010945711089180251, 'n_estimators': 186, 'num_leaves': 132, 'feature_fraction': 0.6798042132585849, 'bagging_fraction': 0.84867908710731, 'bagging_freq': 2, 'min_child_samples': 18}. Best is trial 23 with value: 33.034451582100836.[0m
[32m[I 2021-03-28 15:15:41,017][0m Trial 54 finished with value: 42.83405708455499 and parameters: {'lambda_l1': 8.3737390773295, 'lambda_l2': 0.0005907426699982897, 'n_estimators': 188, 'num_leaves': 116, 'feature_fraction

[32m[I 2021-03-28 15:16:27,948][0m Trial 65 finished with value: 31.162408715775484 and parameters: {'lambda_l1': 9.95282546248911, 'lambda_l2': 0.8986085629363542, 'n_estimators': 159, 'num_leaves': 219, 'feature_fraction': 0.4991243487416595, 'bagging_fraction': 0.9722361152706888, 'bagging_freq': 4, 'min_child_samples': 22}. Best is trial 64 with value: 27.878290494803295.[0m
[32m[I 2021-03-28 15:16:31,443][0m Trial 66 finished with value: 43.616442431774516 and parameters: {'lambda_l1': 5.412405636460718, 'lambda_l2': 8.966513832464823, 'n_estimators': 158, 'num_leaves': 224, 'feature_fraction': 0.4946444343041586, 'bagging_fraction': 0.9704757536001017, 'bagging_freq': 4, 'min_child_samples': 20}. Best is trial 64 with value: 27.878290494803295.[0m
[32m[I 2021-03-28 15:16:35,309][0m Trial 67 finished with value: 34.92793287200234 and parameters: {'lambda_l1': 0.3896284270587631, 'lambda_l2': 0.9942592265026853, 'n_estimators': 147, 'num_leaves': 202, 'feature_fraction': 0.

[32m[I 2021-03-28 15:17:17,644][0m Trial 78 finished with value: 41.538618986485986 and parameters: {'lambda_l1': 8.982882453091175, 'lambda_l2': 0.6589483304724981, 'n_estimators': 121, 'num_leaves': 230, 'feature_fraction': 0.43251784880700267, 'bagging_fraction': 0.9623798011143436, 'bagging_freq': 3, 'min_child_samples': 23}. Best is trial 64 with value: 27.878290494803295.[0m
[32m[I 2021-03-28 15:17:21,446][0m Trial 79 finished with value: 39.199445360013854 and parameters: {'lambda_l1': 0.866442346329931, 'lambda_l2': 0.06279017674546021, 'n_estimators': 161, 'num_leaves': 197, 'feature_fraction': 0.41688755632177715, 'bagging_fraction': 0.9414252805838078, 'bagging_freq': 5, 'min_child_samples': 29}. Best is trial 64 with value: 27.878290494803295.[0m
[32m[I 2021-03-28 15:17:26,219][0m Trial 80 finished with value: 33.74844154691891 and parameters: {'lambda_l1': 0.4465614400849016, 'lambda_l2': 6.11350908159038, 'n_estimators': 178, 'num_leaves': 181, 'feature_fraction':

[32m[I 2021-03-28 15:18:59,143][0m Trial 91 finished with value: 36.451981044982865 and parameters: {'lambda_l1': 0.5087731031959953, 'lambda_l2': 0.029498360978572054, 'n_estimators': 175, 'num_leaves': 207, 'feature_fraction': 0.4570593257676514, 'bagging_fraction': 0.9663929648263105, 'bagging_freq': 3, 'min_child_samples': 13}. Best is trial 64 with value: 27.878290494803295.[0m
[32m[I 2021-03-28 15:19:05,818][0m Trial 92 finished with value: 37.840404871866895 and parameters: {'lambda_l1': 0.2406432823103384, 'lambda_l2': 0.06426443275405115, 'n_estimators': 184, 'num_leaves': 186, 'feature_fraction': 0.5470785745547022, 'bagging_fraction': 0.5134148574590351, 'bagging_freq': 4, 'min_child_samples': 11}. Best is trial 64 with value: 27.878290494803295.[0m
[32m[I 2021-03-28 15:19:12,061][0m Trial 93 finished with value: 34.89061061359665 and parameters: {'lambda_l1': 0.9470659622520361, 'lambda_l2': 0.13135425153113908, 'n_estimators': 189, 'num_leaves': 229, 'feature_fract

Number of finished trials: 100
Best trial:
  Value: 27.878290494803295
  Params: 
    lambda_l1: 3.24217057813122
    lambda_l2: 9.190559740695685
    n_estimators: 169
    num_leaves: 195
    feature_fraction: 0.4255054464173141
    bagging_fraction: 0.9902779974923297
    bagging_freq: 4
    min_child_samples: 14
