In [1]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
import numpy as np
import glob
from tqdm.notebook import tqdm
import pickle

In [2]:
# Root directory of the project data; the input CSVs, extracted features,
# pickled models and the final submission are all resolved relative to it.
data_path = 'data2'

In [3]:
# Load the sample submission and break its composite key
# "<site>_<path>_<timestamp>" into one column per component.
ssubm_df = pd.read_csv(os.path.join(data_path, 'input', 'sample_submission.csv'))
# Vectorised split: avoids building one pd.Series per row through apply().
ssubm_df = ssubm_df['site_path_timestamp'].str.split('_', expand=True)
ssubm_df.columns = ['site', 'path', 'timestamp']

In [4]:
# Collect every per-site training feature file and read each one into a frame.
# feauter_dfs[k] holds the contents of site_train_paths[k].
train_feature_pattern = os.path.join(data_path, 'output', 'features', 'train', '*')
site_train_paths = glob.glob(train_feature_pattern)

feauter_dfs = [pd.read_csv(feature_csv) for feature_csv in tqdm(site_train_paths)]

  0%|          | 0/24 [00:00<?, ?it/s]

In [5]:
def create_train_floor_df(feauter_df):
    """Build the training arrays for the floor classifier.

    Parameters
    ----------
    feauter_df : pandas.DataFrame
        Feature frame containing at least the columns 'x', 'y', 'floor',
        'path' and 'sys_ts'. The frame itself is not modified.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_np, Y_np)``: X_np holds every column except the five listed
        above, Y_np holds the values of the 'floor' column.
    """
    non_feature_columns = ['x', 'y', 'floor', 'path', 'sys_ts']

    # Copy the target so the returned array never aliases the caller's frame.
    targets = feauter_df['floor'].to_numpy(copy=True)
    # drop() returns a new frame, so the input is left untouched.
    features = feauter_df.drop(non_feature_columns, axis=1).to_numpy()

    return features, targets

In [9]:
def create_train_position_df(feauter_df):
    """Build the training arrays for the (x, y) position regressor.

    Parameters
    ----------
    feauter_df : pandas.DataFrame
        Feature frame containing at least the columns 'x', 'y', 'floor',
        'path' and 'sys_ts'. The frame itself is not modified.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_np, Y_np)``: X_np holds every column except the five listed
        above, Y_np is a two-column array with the 'x' and 'y' values.
    """
    non_feature_columns = ['x', 'y', 'floor', 'path', 'sys_ts']

    # Selecting with a list yields a new frame, so the targets are independent
    # of the caller's data.
    targets = feauter_df[['x', 'y']].to_numpy()
    # drop() returns a new frame, so the input is left untouched.
    features = feauter_df.drop(non_feature_columns, axis=1).to_numpy()

    return features, targets

In [8]:
# Train one floor classifier per site, keyed by the site id (the feature
# file name without its extension).
floor_models = dict()

for site_train_path, site_feauter_df in tqdm(zip(site_train_paths, feauter_dfs),
                                             total=len(site_train_paths)):
    # basename/splitext instead of split('/')[-1][:-4]: works with any path
    # separator and does not assume a 4-character extension.
    site = os.path.splitext(os.path.basename(site_train_path))[0]

    X_np, Y_np = create_train_floor_df(site_feauter_df)

    floor_model = RandomForestClassifier()
    floor_model.fit(X_np, Y_np)

    floor_models[site] = floor_model

  0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
# Persist every per-site floor classifier as <site>.sav.
floor_model_dir = os.path.join(data_path, 'models', 'floor')
# Create the target directory up front so the first dump cannot fail on a
# missing folder.
os.makedirs(floor_model_dir, exist_ok=True)

for index in tqdm(floor_models):
    model = floor_models[index]
    # The context manager closes (and flushes) the file even if pickling fails;
    # the bare open() used before leaked one handle per model.
    with open(os.path.join(floor_model_dir, index+'.sav'), 'wb') as model_file:
        pickle.dump(model, model_file)

In [10]:
# Train one (x, y) position regressor per site and floor. Each model is
# pickled as <site>_<floor>.sav right after fitting, so a long run that gets
# interrupted still keeps the models finished so far.
position_models = dict()

position_model_dir = os.path.join(data_path, 'models', 'position3')
# Create the target directory up front so the first dump cannot fail on a
# missing folder.
os.makedirs(position_model_dir, exist_ok=True)

for i in tqdm(range(len(site_train_paths))):
    # basename/splitext instead of split('/')[-1][:-4]: works with any path
    # separator and does not assume a 4-character extension.
    site = os.path.splitext(os.path.basename(site_train_paths[i]))[0]
    site_model = dict()
    # Group by the bare column name so the group key is the floor value itself
    # (a list of keys yields 1-tuples on newer pandas).
    for floor, floor_df in tqdm(feauter_dfs[i].groupby('floor')):
        X_np, Y_np = create_train_position_df(floor_df)

        # One boosted ExtraTrees ensemble per output coordinate.
        # The split criterion is left at its default (squared error): the
        # explicit 'mse' spelling is rejected by newer scikit-learn releases.
        position_model = MultiOutputRegressor(AdaBoostRegressor(
            ExtraTreesRegressor(n_estimators=20, n_jobs=-1, max_features='sqrt'), n_estimators=20))
        position_model.fit(X_np, Y_np)

        # The context manager closes the file handle; the bare open() used
        # before leaked one handle per model.
        model_file_path = os.path.join(position_model_dir, site+'_'+str(floor)+'.sav')
        with open(model_file_path, 'wb') as model_file:
            pickle.dump(position_model, model_file)

        site_model[floor] = position_model

    position_models[site] = site_model

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [4]:
# Read every per-site test feature file into a dict keyed by site id
# (the file name without its extension).
site_test_paths = glob.glob(os.path.join(data_path, 'output', 'features', 'test', '*'))

test_feauter_dfs = dict()
for site_test_path in tqdm(site_test_paths):
    # basename/splitext instead of split('/')[-1][:-4]: works with any path
    # separator and does not assume a 4-character extension.
    site = os.path.splitext(os.path.basename(site_test_path))[0]
    test_feauter_dfs[site] = pd.read_csv(site_test_path)

  0%|          | 0/24 [00:00<?, ?it/s]

In [5]:
# Reload the pickled floor classifiers, keyed by site id (file name without
# its extension).
floor_models = dict()
floor_model_paths = glob.glob(os.path.join(data_path, 'models', 'floor', '*'))

for floor_model_path in tqdm(floor_model_paths):
    # basename/splitext instead of split('/')[-1][:-4]: works with any path
    # separator and does not assume a 4-character extension.
    site = os.path.splitext(os.path.basename(floor_model_path))[0]
    # NOTE: pickle.load can execute arbitrary code; only load model files that
    # were produced by this notebook.
    with open(floor_model_path, 'rb') as model_file:
        floor_models[site] = pickle.load(model_file)

  0%|          | 0/24 [00:00<?, ?it/s]

In [6]:
# Reload the pickled position regressors into position_models[site][floor].
# File names follow the "<site>_<floor>.sav" pattern.
position_models = dict()
position_model_paths = glob.glob(os.path.join(data_path, 'models', 'position3', '*'))

# Seed an entry for every site that has a floor model, so such a site maps to
# an (possibly empty) dict even when no position model was saved for it.
for floor_model_path in floor_model_paths:
    site = os.path.splitext(os.path.basename(floor_model_path))[0]
    position_models[site] = dict()

for position_model_path in tqdm(position_model_paths):
    # basename/splitext instead of split('/')[-1][:-4]: works with any path
    # separator and does not assume a 4-character extension.
    model_name_parts = os.path.splitext(os.path.basename(position_model_path))[0].split('_')
    site = model_name_parts[0]
    floor = int(model_name_parts[1])
    # NOTE: pickle.load can execute arbitrary code; only load model files that
    # were produced by this notebook.
    with open(position_model_path, 'rb') as model_file:
        # setdefault: tolerate a site that has position models but no floor model.
        position_models.setdefault(site, dict())[floor] = pickle.load(model_file)

  0%|          | 0/139 [00:00<?, ?it/s]

In [23]:
# Force single-threaded prediction for every loaded position model.
for site in position_models:
    for floor in position_models[site]:
        position_model = position_models[site][floor]
        # Template estimator: only affects future fits, kept in sync so that
        # inspecting it reflects the setting.
        position_model.estimator.base_estimator.n_jobs = 1
        # predict() runs on the fitted clones stored in estimators_ (one boosted
        # ensemble per output, each holding its own fitted tree ensembles), so
        # n_jobs has to be changed on those for the override to take effect.
        for boosted_model in position_model.estimators_:
            for tree_ensemble in boosted_model.estimators_:
                tree_ensemble.n_jobs = 1

In [24]:
# Spot check: show the base estimator configuration stored on the floor-0
# position model of one site.
# NOTE(review): this looks like the wrapper's template estimator rather than
# a fitted member - confirm before relying on it to verify n_jobs.
position_models['5d2709bb03f801723c32852c'][0].estimator.base_estimator

ExtraTreesRegressor(max_features='sqrt', n_estimators=20, n_jobs=1)

In [65]:
# First-pass prediction: for every submission row, predict the floor and the
# (x, y) position from the feature rows of the same path that are closest in
# time to the requested timestamp.
for index, row in tqdm(ssubm_df.iterrows(), total=ssubm_df.shape[0]):
    test_site_feauter_df = test_feauter_dfs[row.site]
    test_path_feauter_df = test_site_feauter_df[test_site_feauter_df.path == row.path]
    # TODO: estimate the floor over the entire path

    # Absolute time distance of every feature row to the requested timestamp.
    dists = np.abs(test_path_feauter_df['sys_ts'].to_numpy().astype(np.int64) - int(row.timestamp))

    # Pick the (up to) two closest rows. argpartition needs kth < len(dists), so
    # kth = n - 1 also works for paths with only one or two feature rows, and it
    # guarantees that the first picked row is the closest one.
    n_nearest = min(2, len(dists))
    nearests = test_path_feauter_df.iloc[np.argpartition(dists, n_nearest - 1)[:n_nearest]]
    # TODO: look at the rows before and after the timestamp

    # Drop the trailing two columns (presumably 'path' and 'sys_ts' - verify
    # against the feature file layout) so only model inputs remain.
    X_np = nearests.to_numpy()[:,:-2]

    # The floor is taken from the prediction for the closest row.
    floor_model = floor_models[row.site]
    floor_pred = floor_model.predict(X_np)[0]
    ssubm_df.loc[index, ['floor']] = floor_pred

    position_model = position_models[row.site][floor_pred]
    position_pred = position_model.predict(X_np)

    # Average the predicted coordinates over the selected rows.
    ssubm_df.loc[index, ['x']] = np.mean(position_pred[:,:1])
    ssubm_df.loc[index, ['y']] = np.mean(position_pred[:,1:])
    

  0%|          | 0/10133 [00:00<?, ?it/s]

In [29]:
# Floor per path by majority vote: classify every feature row of a path and
# assign the most frequent predicted floor to all submission rows of that path.
# The vote depends only on (site, path), so it is computed once per path
# instead of once per submission row.
path_row_labels = ssubm_df.groupby(['site', 'path']).groups

for (site, path), row_labels in tqdm(path_row_labels.items(), total=len(path_row_labels)):
    test_site_feauter_df = test_feauter_dfs[site]
    test_path_feauter_df = test_site_feauter_df[test_site_feauter_df.path == path]

    # Drop the trailing two columns (presumably 'path' and 'sys_ts' - verify
    # against the feature file layout) so only model inputs remain.
    X_np = test_path_feauter_df.to_numpy()[:,:-2]

    floor_model = floor_models[site]
    floor_pred = floor_model.predict(X_np)

    # np.unique returns sorted values, so ties resolve to the lowest floor.
    values, counts = np.unique(floor_pred, return_counts=True)
    ssubm_df.loc[row_labels, 'floor'] = values[np.argmax(counts)]

  0%|          | 0/10133 [00:00<?, ?it/s]

In [101]:
# Position per submission row: using the floor already assigned to the path,
# predict (x, y) from the feature rows closest in time to each timestamp and
# average the predictions.
for (site, path), g in tqdm(ssubm_df.groupby(['site','path'])):
    floor = g.floor.tolist()[0]

    df = test_feauter_dfs[site]
    df = df[df.path == path]

    # Loop-invariant per path: model inputs (trailing two columns dropped,
    # presumably 'path' and 'sys_ts' - verify against the feature file layout)
    # and the timestamps used for the nearest-row lookup.
    path_features = df.to_numpy()[:,:-2]
    path_sys_ts = df['sys_ts'].to_numpy()
    # argpartition needs kth < number of rows, so cap the neighbour count for
    # paths with fewer than two feature rows.
    n_nearest = min(2, len(path_sys_ts))

    position_model = position_models[site][floor]
    # predict() runs on the fitted clones in estimators_, not on the template
    # held in .estimator, so n_jobs must be set on those to take effect.
    for boosted_model in position_model.estimators_:
        for tree_ensemble in boosted_model.estimators_:
            tree_ensemble.n_jobs = 1

    for index, row in g.iterrows():
        time_dists = np.abs(path_sys_ts - int(row.timestamp))
        nearest_rows = np.argpartition(time_dists, n_nearest - 1)[:n_nearest]
        X_np = path_features[nearest_rows]

        position_pred = position_model.predict(X_np)

        ssubm_df.loc[index, ['x']] = np.mean(position_pred[:,:1])
        ssubm_df.loc[index, ['y']] = np.mean(position_pred[:,1:])

  0%|          | 0/626 [00:00<?, ?it/s]

In [102]:
# Re-assemble the composite submission key from its three parts, remove the
# helper columns and place the key in the first column.
submission_key = ssubm_df['site'] + '_' + ssubm_df['path'] + '_' + ssubm_df['timestamp']
ssubm_df = ssubm_df.drop(columns=['site', 'path', 'timestamp'])
ssubm_df.insert(0, 'site_path_timestamp', submission_key)

In [103]:
# Display the assembled submission frame as a visual sanity check before export.
ssubm_df

Unnamed: 0,site_path_timestamp,floor,x,y
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,0.0,91.640722,103.054010
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,0.0,90.580822,103.903818
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,0.0,88.216817,106.228281
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,0.0,88.581595,106.891557
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,0.0,90.012558,109.159755
...,...,...,...,...
10128,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5.0,216.569671,96.761062
10129,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5.0,215.548870,99.691792
10130,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5.0,210.649710,103.600256
10131,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5.0,204.563648,107.302695


In [104]:
# Write the submission without the row index column.
submission_csv_path = os.path.join(data_path, 'output', 'submission_5_2.csv')
ssubm_df.to_csv(submission_csv_path, index=False)

RandomForestClassifier(n_estimators=20)