- renew dataset (version 5: bssid more than 100 and timestamp, additional 25 files)
- change fold to 10

In [1]:
import os
import warnings
import numpy as np 
import pandas as pd
import lightgbm as lgb
from pathlib import Path
import scipy.stats as stats
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, accuracy_score

warnings.filterwarnings("ignore")

In [2]:
def comp_metric(xhat, yhat, fhat, x, y, f):
    intermediate = np.sqrt(np.power(xhat - x,2) + np.power(yhat-y,2)) + 15 * np.abs(fhat-f)
    return intermediate.sum() / xhat.shape[0]

In [3]:
ORG_DIR = '/kaggle/input/indoor-location-navigation'
DATA_DIR = '/kaggle/input/indoor-competition-dataset'

sub = pd.read_csv(ORG_DIR+"/sample_submission.csv")
train_files = [i for i in os.listdir(DATA_DIR) if "train" in i]
test_files = [i for i in os.listdir(DATA_DIR) if "test" in i]

all_sites = sorted(set(sub["site_path_timestamp"].apply(lambda x: x.split("_")[0])))
sub = sub.set_index("site_path_timestamp")

# modelling

In [4]:
LOG_PATH = Path("./log/")
LOG_PATH.mkdir(parents=True, exist_ok=True)

def score_log(df: pd.DataFrame, num_files: int, nam_file: str, data_shape: tuple, n_fold: int, seed: int, mpe: float):
    score_dict = {'n_files': num_files, 'file_name': nam_file, 'shape': data_shape, 'fold': n_fold, 'seed': seed, 'score': mpe}
    # noinspection PyTypeChecker
    df = pd.concat([df, pd.DataFrame.from_dict([score_dict])])
    df.to_csv(LOG_PATH / f"log_score.csv", index=False)
    return df

#score_df = score_log(score_df, n_files, os.path.basename(file), data.shape, fold, SEED, score)

In [5]:
lgbm_params = {'objective': 'regression', 'metric': 'rmse', 'boosting_type': 'gbdt', 
               'tree_learner': 'serial', 'learning_rate': 0.1, "num_leaves": 30, 
               'random_seed':44, 'verbose': -1}

lgbm_params_f = {'objective': 'multiclass', 'metric': 'multi_logloss',
                     'boosting_type': 'gbdt','tree_learner': 'serial',
                     'learning_rate': 0.1, "num_leaves": 30, 
               'random_seed':44, 'n_estimators': 5000, 'verbose': -1}
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=224)
predictions = list()

all_f_pred = np.array([])
all_x_pred = np.array([])
all_y_pred = np.array([])
all_f_label = np.array([])
all_x_label = np.array([])
all_y_label = np.array([])

for site in tqdm(all_sites):
    train_data = pd.read_csv(DATA_DIR+"/"+site+str("_train.csv"), index_col="path")
    test_data = pd.read_csv(DATA_DIR+"/"+site+str("_test.csv"))

    train_paths = train_data.index.unique()
    #non_targets = list(train_data.iloc[:,:-3].columns)
    non_targets = [i for i in train_data.columns if i not in ["timestamp", "x", "y", "f"]]

    # predict floor first
    X = train_data[non_targets+["f"]]
    X_test = test_data[non_targets]
    print(X.shape)

    label_f = np.array([])
    valid_f_pred = np.array([])
    test_f_pred = np.zeros([len(test_data), n_folds])
    for fold, (tr_group_idx, val_group_idx) in enumerate(kf.split(train_paths)):
        tr_paths, val_paths = train_paths[tr_group_idx], train_paths[val_group_idx]
    
        X_train = X[X.index.isin(tr_paths)].drop("f", axis=1).reset_index(drop=True)
        X_valid = X[X.index.isin(val_paths)].drop("f", axis=1).reset_index(drop=True)
        y_train = X[X.index.isin(tr_paths)]["f"].reset_index(drop=True)
        y_valid = X[X.index.isin(val_paths)]["f"].reset_index(drop=True)
    
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
        model = lgb.LGBMClassifier(**lgbm_params_f)
        model.fit(X_train, y_train,
                       eval_set=[(X_valid, y_valid)],
                       eval_metric='multi_logloss',
                       verbose=False,
                       early_stopping_rounds=20
                       )
    
        tmp_pred = model.predict(X_valid).astype(int)
        valid_f_pred = np.concatenate([valid_f_pred, tmp_pred])
        label_f = np.concatenate([label_f, y_valid])
    
        test_f_pred[:, fold] = model.predict(X_test).astype(int)

    preds_f_mode = stats.mode(test_f_pred, axis=1)
    test_f_pred = preds_f_mode[0].astype(int).reshape(-1)
    accuracy = accuracy_score(label_f, valid_f_pred)

    # predict x based on floor information
    #non_targets = list(train_data.iloc[:,:-4].columns)
    #non_targets = [i for i in train_data.columns if i not in ["timestamp", "x", "y", "f"]]
    X = train_data[non_targets+["x"]]
    X_test = test_data[non_targets]
    X["f_pred"] = valid_f_pred
    X_test["f_pred"] = test_f_pred 

    label_x = np.array([])
    valid_x_pred = np.array([])
    test_x_pred = np.zeros(len(test_data))
    for fold, (tr_group_idx, val_group_idx) in enumerate(kf.split(train_paths)):
        tr_paths, val_paths = train_paths[tr_group_idx], train_paths[val_group_idx]
    
        X_train = X[X.index.isin(tr_paths)].drop("x", axis=1).reset_index(drop=True)
        X_valid = X[X.index.isin(val_paths)].drop("x", axis=1).reset_index(drop=True)
        y_train = X[X.index.isin(tr_paths)]["x"].reset_index(drop=True)
        y_valid = X[X.index.isin(val_paths)]["x"].reset_index(drop=True)
    
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
        modelx = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
                num_boost_round=10000,early_stopping_rounds=20,verbose_eval = 1000)
    
        tmp_pred = modelx.predict(X_valid, num_iteration = modelx.best_iteration)
        valid_x_pred = np.concatenate([valid_x_pred, tmp_pred])
        label_x = np.concatenate([label_x, y_valid])
    
        test_x_pred += modelx.predict(X_test, num_iteration = modelx.best_iteration) / n_folds
    
    score_x = np.sqrt(mean_squared_error(label_x, valid_x_pred))

    # predict x based on floor information
    X = train_data[non_targets+["y"]]
    X_test = test_data[non_targets]
    X["f_pred"] = valid_f_pred
    X_test["f_pred"] = test_f_pred

    label_y = np.array([])
    valid_y_pred = np.array([])
    test_y_pred = np.zeros(len(test_data))
    for fold, (tr_group_idx, val_group_idx) in enumerate(kf.split(train_paths)):
        tr_paths, val_paths = train_paths[tr_group_idx], train_paths[val_group_idx]
    
        X_train = X[X.index.isin(tr_paths)].drop("y", axis=1).reset_index(drop=True)
        X_valid = X[X.index.isin(val_paths)].drop("y", axis=1).reset_index(drop=True)
        y_train = X[X.index.isin(tr_paths)]["y"].reset_index(drop=True)
        y_valid = X[X.index.isin(val_paths)]["y"].reset_index(drop=True)
    
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
        modely = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
                num_boost_round=10000,early_stopping_rounds=20,verbose_eval = 1000)
    
        tmp_pred = modely.predict(X_valid, num_iteration = modely.best_iteration)
        valid_y_pred = np.concatenate([valid_y_pred, tmp_pred])
        label_y = np.concatenate([label_y, y_valid])
    
        test_y_pred += modely.predict(X_test, num_iteration = modely.best_iteration) / n_folds
    
    score_y = np.sqrt(mean_squared_error(label_y, valid_y_pred))
    print("each score:", accuracy, score_x, score_y)
    print("site:", site, comp_metric(valid_x_pred, valid_y_pred, valid_f_pred, label_x, label_y, label_f))

    test_preds = pd.DataFrame(np.stack((test_f_pred, test_x_pred, test_y_pred))).T
    test_preds.columns = sub.columns
    test_preds.index = test_data["site_path_timestamp"]
    test_preds["floor"] = test_preds["floor"].astype(int)
    predictions.append(test_preds)
    
    all_x_pred = np.concatenate([all_x_pred, valid_x_pred])
    all_x_label = np.concatenate([all_x_label, label_x])
    all_y_pred = np.concatenate([all_y_pred, valid_y_pred])
    all_y_label = np.concatenate([all_y_label, label_y])
    all_f_pred = np.concatenate([all_f_pred, valid_f_pred])
    all_f_label = np.concatenate([all_f_label, label_f])
print("overall: ", comp_metric(all_x_pred, all_y_pred, all_f_pred, all_x_label, all_y_label, all_f_label))

  0%|          | 0/24 [00:00<?, ?it/s]

(9296, 2599)
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[416]	training's rmse: 1.47383	valid_1's rmse: 14.1296
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[225]	training's rmse: 2.28133	valid_1's rmse: 7.3995
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[296]	training's rmse: 1.89637	valid_1's rmse: 12.1707
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[478]	training's rmse: 1.38637	valid_1's rmse: 10.6443
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[427]	training's rmse: 1.48313	valid_1's rmse: 12.7298
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[203]	training's rmse: 2.37495	valid_1's rmse: 12.7114
Training until validation scores don't improve for 20 rounds
Early stopping, best 

# postprocess

# submission

In [6]:
all_preds = pd.concat(predictions)
all_preds = all_preds.sort_values("site_path_timestamp")
all_preds.to_csv('submission.csv')