In [11]:
import os
import pandas as pd
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor,GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor,VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import Ridge
import json
import numpy as np
import glob
from tqdm.notebook import tqdm
from texttable import Texttable

In [2]:
site_data_paths = glob.glob(os.path.join('data2', 'output', 'features', 'train', '*'))

In [3]:
feauter_dfs = list()
for data_path in tqdm(site_data_paths):
    feauter_dfs.append(pd.read_csv(data_path))

  0%|          | 0/24 [00:00<?, ?it/s]

In [4]:
def create_df_split(feauter_df):
    df = feauter_df.copy()
    Y_np = df[['x', 'y']].to_numpy()

    df.drop(['x', 'y', 'floor', 'path', 'sys_ts'], axis=1, inplace=True)
    X_np = df.to_numpy()
    
    X_train, X_val, Y_train, Y_val = train_test_split(X_np, Y_np, test_size = 0.2)
    del df
    return X_train, X_val, Y_train, Y_val

In [13]:
clf_data = [
    ('RandomForest 20', MultiOutputRegressor(RandomForestRegressor(n_estimators=20)), {}),
    #('RandomForest 40', MultiOutputRegressor(RandomForestRegressor(n_estimators=40)), {}),
    #('RandomForest 60', MultiOutputRegressor(RandomForestRegressor(n_estimators=60)), {}),
    ('ExtraTrees 20', MultiOutputRegressor(ExtraTreesRegressor(n_estimators=20)), {}),
    #('ExtraTrees 40', MultiOutputRegressor(ExtraTreesRegressor(n_estimators=40)), {}),
    #('ExtraTrees 50', MultiOutputRegressor(ExtraTreesRegressor(n_estimators=50)), {}),
    ('AdaBoost ExtraTrees 20', MultiOutputRegressor(AdaBoostRegressor(
        ExtraTreesRegressor(n_estimators=20, n_jobs=-1, max_features='sqrt', criterion='mse')
        , n_estimators=20)), {}),
    ('AdaBoost ExtraTrees 20', MultiOutputRegressor(AdaBoostRegressor(
        DecisionTreeRegressor(), n_estimators=20)), {}),
]

In [15]:
id = 3
feauter_df = feauter_dfs[id]

print(f"Site: {site_data_paths[id].split('/')[-1][:-4]}")

for gid, g in tqdm(feauter_df.groupby(['floor'])):
    X_train, X_val, Y_train, Y_val = create_df_split(g)
    
    print(f"Floor:\t{g['floor'].iloc[0]}")
    print(f"BSSIDs num:\t\t{X_train.shape[1]}")
    print(f"Mesurments num:\t\t{X_train.shape[0] + X_val.shape[0]}")
    
    cv_acc = list()
    val_acc = list()
    
    #clf = MultiOutputRegressor(Ridge(random_state=123))
    for clf_name, clf_class, clf_params in tqdm(clf_data):
        clf = clf_class#(**clf_params)
        clf.estimator.n_jobs = -1
        
        clf.fit(X_train, Y_train)
        cv_acc.append(np.mean(cross_val_score(clf, X_train, Y_train, cv=5)))
    
        val_acc.append(clf.score(X_val, Y_val))
    
    t = Texttable()
    t.add_rows([[x[0] for x in clf_data], cv_acc, val_acc])
    print(t.draw())
    print()
    break

Site: 5dc8cea7659e181adb076a3f


  0%|          | 0/8 [00:00<?, ?it/s]

Floor:	-1
BSSIDs num:		562
Mesurments num:		1420


  0%|          | 0/4 [00:00<?, ?it/s]

+-----------------+---------------+----------------------+---------------------+
| RandomForest 20 | ExtraTrees 20 | AdaBoost ExtraTrees  | AdaBoost ExtraTrees |
|                 |               |          20          |         20          |
| 0.905           | 0.913         | 0.908                | 0.899               |
+-----------------+---------------+----------------------+---------------------+
| 0.928           | 0.938         | 0.930                | 0.910               |
+-----------------+---------------+----------------------+---------------------+



In [6]:
feauter_df = feauter_dfs[3]

for gid, g in tqdm(feauter_df.groupby(['floor'])):
    X_train, X_val, Y_train, Y_val = create_df_split(g)
    
    #clf = MultiOutputRegressor(ExtraTreesRegressor(n_estimators=60, n_jobs=-1, max_features='sqrt', criterion='mse'))
    clf =  MultiOutputRegressor(AdaBoostRegressor(
        ExtraTreesRegressor(n_estimators=20, n_jobs=-1, max_features='sqrt', criterion='mse'), n_estimators=20))
    
    clf.fit(X_train, Y_train)
    val_accuracy = clf.score(X_val, Y_val)
    val_mean_error = getError(clf, X_val, Y_val)
    print(f'{val_accuracy}\t{val_mean_error[0]}\t{val_mean_error[1]}')
    #break

  0%|          | 0/8 [00:00<?, ?it/s]

0.9213692177427233	7.193184087983233	10.403398743371666
0.9914302378412576	4.336856651305459	3.377616130247799
0.9797876884566667	5.22511500057106	5.612436937844031
0.951604492827415	7.92417253793891	16.766812575532906
0.9918321207628628	3.9501280134233023	3.2769634716280494
0.9919214746364708	4.15385754585256	2.8479798347588585
0.990450021444059	4.58972841928333	4.128110180515435
0.8393038646006035	9.461138982799147	17.370315046277177


In [5]:
def getError(clf, X_val, Y_val):
    Y_pred = clf.predict(X_val)
    Y_real = Y_val

    diff = list()
    for i in range(Y_val.shape[0]):
        x_diff = Y_pred[i][0] - Y_real[i][0]
        y_diff = Y_pred[i][1] - Y_real[i][1]

        diff.append(np.sqrt(x_diff*x_diff + y_diff*y_diff))

    return (np.average(diff), np.std(diff))

In [90]:
Y_pred[:10]

array([[ 63.35098433,  74.67622875],
       [ 84.03833015,  65.78547357],
       [ 93.29825033,  61.393729  ],
       [ 84.03833015,  65.78547357],
       [ 84.03833015,  65.78547357],
       [ 68.72928827,  24.99608868],
       [134.241425  ,  80.852006  ],
       [114.28474083,  75.05464483],
       [ 63.6223795 ,  58.82946594],
       [134.241425  ,  80.852006  ]])

In [91]:
Y_real[:10]

array([[ 65.76824 ,  76.10711 ],
       [ 63.69822 ,  74.158295],
       [ 89.867424,  67.48126 ],
       [135.29718 ,  81.42091 ],
       [ 64.4487  ,  67.89565 ],
       [ 68.1448  ,  25.845957],
       [132.13501 ,  81.08364 ],
       [124.38364 ,  78.994514],
       [ 64.4487  ,  67.89565 ],
       [140.56067 ,  80.157104]])

In [96]:
diff[:10]

array([ 2.80901159, 21.99600462,  6.98774663, 53.59045214, 19.70295546,
        1.03145659,  2.11911266, 10.84021833,  9.1037629 ,  6.35733798])

In [12]:
feauter_dfs[0]

Unnamed: 0,2e5e319377bfd4678407b9060b825ada614f13f9,9d3d4b885a2477d90d3e54a6d2408637ea21fc09,a02f4615742c700b40222d78c416e5c61544513f,5bc24158fff7ca051eed8c0aed6408fa6b9d0271,575aee593cec75c5bcf809861e03064182c4c680,8ef36b90e7940363abc71814f9e572414f593a76,c4cb0c76011c72c848846607e71aa4e89956bae5,9c0d38e71d25bd4d899b2cd5f8e06ae8665e1eac,058c29e68ef40b50f3e6a0d26f43391d3b4ac315,3bdcad01934cd09e8cffe34e8a2468ed0fd1f43a,...,b76481fd7021b8ad9ca94ea59a0d6f974adec108,78fb989d22ece16d9887cc9a7687c35011703bc2,97d337bb5e21a20f10505d9b1d2b5a16bfbf6151,2f5d69532fc1dbe2964bde18edad2748983eaeb3,4e29706f6e2c281477bf89f2376e0da14bb9f04d,x,y,floor,path,sys_ts
0,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,186.17432,32.11038,1,5dc92809efaf870006bad708,1573462234097
1,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,183.51108,30.73431,1,5dc92809efaf870006bad708,1573462235991
2,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,183.51108,30.73431,1,5dc92809efaf870006bad708,1573462237879
3,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,183.51108,30.73431,1,5dc92809efaf870006bad708,1573462239786
4,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999,...,-999,-999,-999,-999,-999,183.51108,30.73431,1,5dc92809efaf870006bad708,1573462241672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17198,-75,-79,-77,-74,-88,-87,-79,-83,-79,-84,...,-999,-999,-999,-999,-999,157.48116,246.42839,-1,5ddccefd29dff70006248e00,1574751981224
17199,-68,-76,-78,-68,-87,-86,-76,-83,-77,-91,...,-999,-999,-999,-999,-90,164.04048,249.67061,-1,5ddccefd29dff70006248e00,1574751983181
17200,-71,-72,-73,-70,-88,-88,-71,-83,-71,-91,...,-999,-999,-999,-999,-90,164.04048,249.67061,-1,5ddccefd29dff70006248e00,1574751985146
17201,-74,-78,-75,-76,-89,-90,-82,-83,-80,-91,...,-999,-999,-999,-999,-90,164.04048,249.67061,-1,5ddccefd29dff70006248e00,1574751987149
