- renew dataset (version 5: bssid more than 100 and timestamp, additional 25 files)
- cancel timestamp in some cases * (go back to version 11)
- add predictions for train-only sites *
- modify mistakes *

In [1]:
import os
import warnings
import numpy as np 
import pandas as pd
import multiprocessing
import lightgbm as lgb
from pathlib import Path
import scipy.stats as stats
import scipy.sparse
import scipy.interpolate
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, accuracy_score

warnings.filterwarnings("ignore")

In [2]:
VERSION = "013" # 実験番号

from pathlib import Path
from logging import getLogger, Formatter, FileHandler, StreamHandler, INFO, DEBUG

def create_logger(exp_version):
    log_file = ("{}.log".format(exp_version))

    # logger
    logger_ = getLogger(exp_version)
    logger_.setLevel(DEBUG)

    # formatter
    fmr = Formatter("[%(levelname)s] %(asctime)s >>\t%(message)s")

    # file handler
    fh = FileHandler(log_file)
    fh.setLevel(DEBUG)
    fh.setFormatter(fmr)

    # stream handler
    ch = StreamHandler()
    ch.setLevel(INFO)
    ch.setFormatter(fmr)

    logger_.addHandler(fh)
    logger_.addHandler(ch)


def get_logger(exp_version):
    return getLogger(exp_version)

create_logger(VERSION)
#get_logger(VERSION).info("what you want to save") 

In [3]:
def comp_metric(xhat, yhat, fhat, x, y, f):
    intermediate = np.sqrt(np.power(xhat - x,2) + np.power(yhat-y,2)) + 15 * np.abs(fhat-f)
    return intermediate.sum() / xhat.shape[0]

In [4]:
ORG_DIR = '/kaggle/input/indoor-location-navigation'
DATA_DIR = '/kaggle/input/indoor-competition-dataset'

sub = pd.read_csv(ORG_DIR+"/sample_submission.csv")
train_files = [i for i in os.listdir(DATA_DIR) if "train" in i]
test_files = [i for i in os.listdir(DATA_DIR) if "test" in i]

all_sites = sorted(set(sub["site_path_timestamp"].apply(lambda x: x.split("_")[0])))
sub = sub.set_index("site_path_timestamp")

In [5]:
train_only_files = [i for i in os.listdir(DATA_DIR) if "train" in i and i.split("_")[0] not in all_sites]

# modelling

In [6]:
lgbm_params_f = {'objective': 'multiclass', 'metric': 'multi_logloss',
                     'boosting_type': 'gbdt','tree_learner': 'serial',
                     'learning_rate': 0.1, "num_leaves": 30, 
               'random_seed':44, 'n_estimators': 5000, 'verbose': -1}
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=224)
predictions = list()

all_f_pred = np.array([])
all_f_label = np.array([])

all_valid_f_df = []
all_feat_imp_df = []

for site in tqdm(all_sites):
    get_logger(VERSION).info(site)
    train_data = pd.read_csv(DATA_DIR+"/"+site+str("_train.csv"), index_col="path")
    test_data = pd.read_csv(DATA_DIR+"/"+site+str("_test.csv"))

    train_paths = train_data.index.unique()
    
    non_targets = [i for i in train_data.columns if i not in ["timestamp", "x", "y", "f"]]

    # predict floor first
    X = train_data[non_targets+["f"]]
    X_test = test_data[non_targets]
    get_logger(VERSION).info(X.shape)
    
    label_f = np.zeros([len(X)])
    valid_f_pred = np.zeros([len(X)])
    valid_f_path = ["tmp" for i in range(len(X))]
    test_f_pred = np.zeros([len(test_data), n_folds])
    feature_importance_df = pd.DataFrame(non_targets, columns=["Feature"])
    for fold, (tr_group_idx, val_group_idx) in enumerate(kf.split(train_paths)):
        tr_paths, val_paths = train_paths[tr_group_idx], train_paths[val_group_idx]
    
        tmp = X.reset_index(drop=False)
        val_index = tmp[tmp["path"].isin(val_paths)].index
        val_path = tmp.loc[val_index]["path"].values
        
        X_train = X[X.index.isin(tr_paths)].drop("f", axis=1).reset_index(drop=True)
        X_valid = X[X.index.isin(val_paths)].drop("f", axis=1).reset_index(drop=True)
        y_train = X[X.index.isin(tr_paths)]["f"].reset_index(drop=True)
        y_valid = X[X.index.isin(val_paths)]["f"].reset_index(drop=True)
    
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
        model = lgb.LGBMClassifier(**lgbm_params_f)
        model.fit(X_train, y_train,
                       eval_set=[(X_valid, y_valid)],
                       eval_metric='multi_logloss',
                       verbose=False,
                       early_stopping_rounds=20
                       )
    
        valid_f_pred[val_index] = model.predict(X_valid).astype(int)
        for i, ind in enumerate(val_index):
            valid_f_path[ind] = val_path[i]
        
        feature_importance_df["Fold_"+str(fold+1)] = model.feature_importances_

        label_f[val_index] = y_valid
    
        test_f_pred[:, fold] = model.predict(X_test).astype(int)
        
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    feature_importance_df["site"] = site
        
    all_f_pred = np.concatenate([all_f_pred, valid_f_pred])
    all_f_label = np.concatenate([all_f_label, label_f])
        
    accuracy = accuracy_score(label_f, valid_f_pred)
    get_logger(VERSION).info(accuracy)
    valid_f_pred = pd.DataFrame(valid_f_pred)
    valid_f_pred.columns = ["floor"]
    valid_f_pred["path"] = valid_f_path
    valid_f_pred["site"] = site
    valid_f_pred["truth"] = label_f
        
    preds_f_mode = stats.mode(test_f_pred, axis=1)
    test_f_pred = preds_f_mode[0].astype(int).reshape(-1)
    test_preds = pd.DataFrame(test_f_pred)
    test_preds.columns = ["floor"]
    test_preds.index = test_data["site_path_timestamp"]
    test_preds["floor"] = test_preds["floor"].astype(int)
    predictions.append(test_preds)
    
    all_valid_f_df.append(valid_f_pred)
    all_feat_imp_df.append(feature_importance_df)
    
all_valid_f_df = pd.concat(all_valid_f_df)
all_feat_imp_df = pd.concat(all_feat_imp_df)

print("overall: ", accuracy_score(all_valid_f_df.floor.astype(int).values, 
                                  all_valid_f_df.truth.astype(int).values))

all_valid_f_df.to_csv("floor_prediction_in_train.csv")
all_feat_imp_df.to_csv("floor_feature_importance.csv")

  0%|          | 0/24 [00:00<?, ?it/s]

[INFO] 2021-04-12 00:09:33,516 >>	5a0546857ecc773753327266
[INFO] 2021-04-12 00:09:39,960 >>	(9296, 2599)
[INFO] 2021-04-12 00:11:20,549 >>	0.9813898450946644
[INFO] 2021-04-12 00:11:20,571 >>	5c3c44b80379370013e0fd2b
[INFO] 2021-04-12 00:11:26,099 >>	(9737, 2351)
[INFO] 2021-04-12 00:12:50,792 >>	0.9731950292697956
[INFO] 2021-04-12 00:12:50,810 >>	5d27075f03f801723c2e360f
[INFO] 2021-04-12 00:13:26,996 >>	(23666, 5053)
[INFO] 2021-04-12 00:19:38,657 >>	0.9892673033043184
[INFO] 2021-04-12 00:19:38,675 >>	5d27096c03f801723c31e5e0
[INFO] 2021-04-12 00:19:40,167 >>	(9100, 734)
[INFO] 2021-04-12 00:20:13,169 >>	0.9903296703296703
[INFO] 2021-04-12 00:20:13,210 >>	5d27097f03f801723c320d97
[INFO] 2021-04-12 00:20:16,292 >>	(10507, 1402)
[INFO] 2021-04-12 00:21:01,590 >>	0.9808698962596364
[INFO] 2021-04-12 00:21:01,616 >>	5d27099f03f801723c32511d
[INFO] 2021-04-12 00:21:02,268 >>	(4251, 630)
[INFO] 2021-04-12 00:21:21,761 >>	0.988238061632557
[INFO] 2021-04-12 00:21:21,775 >>	5d2709a003f80

overall:  0.9814585956416465


In [7]:
lgbm_params_f = {'objective': 'multiclass', 'metric': 'multi_logloss',
                     'boosting_type': 'gbdt','tree_learner': 'serial',
                     'learning_rate': 0.1, "num_leaves": 30, 
               'random_seed':44, 'n_estimators': 5000, 'verbose': -1}
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=224)
predictions = list()

all_f_pred = np.array([])
all_f_label = np.array([])

all_valid_f_df = []
all_feat_imp_df = []

for file in tqdm(train_only_files):
    get_logger(VERSION).info(file)
    try:
        train_data = pd.read_csv(DATA_DIR+"/" +file, index_col="path")

        train_paths = train_data.index.unique()
        non_targets = [i for i in train_data.columns if i not in ["timestamp", "x", "y", "f"]]

        # predict floor first
        X = train_data[non_targets+["f"]]
        get_logger(VERSION).info(X.shape)
    
        label_f = np.zeros([len(X)])
        valid_f_pred = np.zeros([len(X)])
        valid_f_path = ["tmp" for i in range(len(X))]
        feature_importance_df = pd.DataFrame(non_targets, columns=["Feature"])
        for fold, (tr_group_idx, val_group_idx) in enumerate(kf.split(train_paths)):
            tr_paths, val_paths = train_paths[tr_group_idx], train_paths[val_group_idx]
    
            tmp = X.reset_index(drop=False)
            val_index = tmp[tmp["path"].isin(val_paths)].index
            val_path = tmp.loc[val_index]["path"].values
        
            X_train = X[X.index.isin(tr_paths)].drop("f", axis=1).reset_index(drop=True)
            X_valid = X[X.index.isin(val_paths)].drop("f", axis=1).reset_index(drop=True)
            y_train = X[X.index.isin(tr_paths)]["f"].reset_index(drop=True)
            y_valid = X[X.index.isin(val_paths)]["f"].reset_index(drop=True)
    
            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
            model = lgb.LGBMClassifier(**lgbm_params_f)
            model.fit(X_train, y_train,
                       eval_set=[(X_valid, y_valid)],
                       eval_metric='multi_logloss',
                       verbose=False,
                       early_stopping_rounds=20
                       )
    
            valid_f_pred[val_index] = model.predict(X_valid).astype(int)
            for i, ind in enumerate(val_index):
                valid_f_path[ind] = val_path[i]
        
            #feature_importance_df["Fold_"+str(fold+1)] = model.feature_importances_

            label_f[val_index] = y_valid
            
        #feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
        #feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
        #feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
        #feature_importance_df["site"] = file.split("_")[0]

        accuracy = accuracy_score(label_f, valid_f_pred)
        get_logger(VERSION).info(accuracy)
        
    except:
        valid_f_pred[:] = X["f"].values
        label_f[:] = X["f"].values
        get_logger(VERSION).info("pass")
        
    all_f_pred = np.concatenate([all_f_pred, valid_f_pred])
    #all_f_label = np.concatenate([all_f_label, label_f])
        
    valid_f_pred = pd.DataFrame(valid_f_pred)
    valid_f_pred.columns = ["floor"]
    valid_f_pred["path"] = valid_f_path
    valid_f_pred["site"] = file.split("_")[0]
    valid_f_pred["truth"] = label_f
            
    all_valid_f_df.append(valid_f_pred)
    #all_feat_imp_df.append(feature_importance_df)
    
all_valid_f_df = pd.concat(all_valid_f_df)
#all_feat_imp_df = pd.concat(all_feat_imp_df)

print("overall: ", accuracy_score(all_valid_f_df.floor.astype(int).values, 
                                  all_valid_f_df.truth.astype(int).values))

all_valid_f_df.to_csv("floor_prediction_in_train_only.csv")
#all_feat_imp_df.to_csv("floor_feature_importance_train_only.csv")

  0%|          | 0/25 [00:00<?, ?it/s]

[INFO] 2021-04-12 00:51:51,774 >>	5cd56b6fe2acfd2d33b5a386_train.csv
[INFO] 2021-04-12 00:51:52,023 >>	(3466, 297)
[INFO] 2021-04-12 00:51:58,763 >>	0.9422965954991345
[INFO] 2021-04-12 00:51:58,777 >>	5cd56b76e2acfd2d33b5b0be_train.csv
[INFO] 2021-04-12 00:51:59,201 >>	(4557, 437)
[INFO] 2021-04-12 00:52:05,526 >>	0.8512179065174457
[INFO] 2021-04-12 00:52:05,540 >>	5cd56b5ae2acfd2d33b58548_train.csv
[INFO] 2021-04-12 00:52:05,600 >>	(939, 140)
[INFO] 2021-04-12 00:52:08,119 >>	0.979765708200213
[INFO] 2021-04-12 00:52:08,132 >>	5cd56b67e2acfd2d33b596bd_train.csv
[INFO] 2021-04-12 00:52:08,380 >>	(5229, 201)
[INFO] 2021-04-12 00:52:17,222 >>	0.9544846050870147
[INFO] 2021-04-12 00:52:17,232 >>	5cd56b79e2acfd2d33b5b74e_train.csv
[INFO] 2021-04-12 00:52:17,311 >>	(1283, 164)
[INFO] 2021-04-12 00:52:19,089 >>	pass
[INFO] 2021-04-12 00:52:19,104 >>	5cd56b5ae2acfd2d33b58546_train.csv
[INFO] 2021-04-12 00:52:19,591 >>	(4996, 459)
[INFO] 2021-04-12 00:52:27,578 >>	0.8847077662129704
[INFO] 2

overall:  0.9317694641051567


# postprocess

In [8]:
all_preds = pd.concat(predictions)

ValueError: No objects to concatenate

# submission

In [9]:
all_preds = all_preds.sort_values("site_path_timestamp")
all_preds.to_csv('submission.csv')

NameError: name 'all_preds' is not defined