In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [2]:
def comp_metric(xhat, yhat, fhat, x, y, f):
    intermediate = np.sqrt(np.power(xhat - x,2) + np.power(yhat-y,2)) + 15 * np.abs(fhat-f)
    return intermediate.sum()/xhat.shape[0]

In [3]:
ORG_DIR = '/kaggle/input/indoor-location-navigation'
DATA_DIR = '/kaggle/input/indoor-competition-dataset'

sub = pd.read_csv(ORG_DIR+"/sample_submission.csv")
train_files = [i for i in os.listdir(DATA_DIR) if "train" in i]
test_files = [i for i in os.listdir(DATA_DIR) if "test" in i]

all_sites = sub["site_path_timestamp"].apply(lambda x: x.split("_")[0]).tolist()

In [4]:
lgbm_params = {'objective': 'binary', 'metric': 'rmse', 'boosting_type': 'gbdt', 
               'tree_learner': 'serial', 'learning_rate': 0.01, "num_leaves": 10, 
               'random_seed':44,'max_depth': 5, 'verbose': -1}

#for site in all_sites:
site = all_sites[0]
train_data = pd.read_csv(DATA_DIR+"/"+site+str("_train.csv"), index_col="path")
test_data = pd.read_csv(DATA_DIR+"/"+site+str("_test.csv"))

train_paths = train_data.index.unique()
non_targets = list(train_data.iloc[:,:-4].columns)

# predict floor first
X = train_data[non_targets+["f"]]
X_test = test_data[non_targets]

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=224)
valid = np.array([])
label = np.array([])
test_floor_pred = np.zeros(len(test_data))
for fold, (tr_group_idx, val_group_idx) in enumerate(kf.split(train_paths)):
    print("Fold:", fold+1)
    tr_paths, val_paths = train_paths[tr_group_idx], train_paths[val_group_idx]
    
    X_train = X[X.index.isin(tr_paths)].drop("f", axis=1).reset_index(drop=True)
    X_valid = X[X.index.isin(val_paths)].drop("f", axis=1).reset_index(drop=True)
    y_train = X[X.index.isin(tr_paths)]["f"].reset_index(drop=True)
    y_valid = X[X.index.isin(val_paths)]["f"].reset_index(drop=True)
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
    model = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
                num_boost_round=10000,early_stopping_rounds=20,verbose_eval = 1000)
    
    valid_pred = model.predict(X_valid, num_iteration = model.best_iteration)
    valid = np.concatenate([valid, valid_pred])
    label = np.concatenate([label, y_valid])
    
    test_floor_pred += model.predict(X_test, num_iteration = model.best_iteration) / n_folds
    
print(np.sqrt(mean_squared_error(label, valid)))

# predict x, y based on floor information


Fold: 1
Training until validation scores don't improve for 20 rounds
[1000]	training's rmse: 1.15807	valid_1's rmse: 1.06482
Early stopping, best iteration is:
[1868]	training's rmse: 1.15805	valid_1's rmse: 1.06475
Fold: 2
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[823]	training's rmse: 1.15257	valid_1's rmse: 1.08642
Fold: 3
Training until validation scores don't improve for 20 rounds
[1000]	training's rmse: 1.13661	valid_1's rmse: 1.14437
Early stopping, best iteration is:
[1198]	training's rmse: 1.13659	valid_1's rmse: 1.14433
Fold: 4
Training until validation scores don't improve for 20 rounds
[1000]	training's rmse: 1.08408	valid_1's rmse: 1.32305
Early stopping, best iteration is:
[1046]	training's rmse: 1.08407	valid_1's rmse: 1.32305
Fold: 5
Training until validation scores don't improve for 20 rounds
[1000]	training's rmse: 1.15682	valid_1's rmse: 1.05572
Early stopping, best iteration is:
[1503]	training's rmse: 1.15681	v

# submission