# blend001

In [58]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)
from math import * 

In [59]:
import ipynb_path

def get_nb_name():
    """Return the current notebook's file name without its ``.ipynb`` suffix.

    The notebook path is obtained from ``ipynb_path.get()``. Only the last
    path component is used, so a bare file name (no directory part) is
    accepted, and only a trailing ``.ipynb`` is removed rather than every
    occurrence of that text inside the name.

    Returns:
        str: Notebook name, e.g. ``'blend001'`` for ``.../blend001.ipynb``.
    """
    nb_path = ipynb_path.get()
    # basename() copes with a separator-less path, where
    # ``rsplit('/', 1)[1]`` would raise IndexError.
    nb_file = os.path.basename(nb_path)
    suffix = '.ipynb'
    if nb_file.endswith(suffix):
        nb_file = nb_file[:-len(suffix)]
    return nb_file

In [60]:
# Directory layout: competition data is read from INPUT; this notebook's own
# output folder is ../output/<notebook name> and is created if missing.
nb_name = get_nb_name()
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = f'../output/{nb_name}'
os.makedirs(OUTPUT, exist_ok=True)

# utils

In [61]:
def get_train_score(df, gt):
    """Compute the mean per-phone score of predictions ``df`` against ``gt``.

    Predictions and ground truth are inner-joined on
    (collectionName, phoneName, millisSinceGpsEpoch). For every phone
    (``collectionName + '_' + phoneName``) the 50th and 95th percentile of the
    haversine error are averaged, and the mean of those per-phone values is
    returned.

    Args:
        df: Predictions with the three key columns plus ``latDeg``/``lngDeg``.
        gt: Ground truth with the three key columns plus ``latDeg``/``lngDeg``.

    Returns:
        The mean over phones of ``(p50 + p95) / 2``.
    """
    join_keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    truth = gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
    merged = df.merge(truth, on=join_keys, how='inner')

    # Per-row distance between ground truth and prediction.
    merged['err'] = calc_haversine(
        merged['latDeg_gt'], merged['lngDeg_gt'], merged['latDeg'], merged['lngDeg']
    )

    # Per-phone percentiles; the column is named p50_p90_mean although it
    # averages the 50th and 95th percentile.
    merged['phone'] = merged['collectionName'] + '_' + merged['phoneName']
    per_phone = merged.groupby('phone')['err'].agg([percentile50, percentile95])
    per_phone['p50_p90_mean'] = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone['p50_p90_mean'].mean()

In [62]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points on a sphere (haversine formula).

    Inputs are array-like (or scalars) given in decimal degrees. The result
    is expressed in the unit of the sphere radius used here, 6_367_000
    (presumably metres).
    """
    RADIUS = 6_367_000
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    return 2 * RADIUS * np.arcsin(hav**0.5)

In [63]:
def visualize_trafic(df, center, zoom=9):
    """Show the points in ``df`` on an interactive Mapbox scatter plot.

    Args:
        df: DataFrame with ``latDeg``, ``lngDeg`` and ``phoneName`` columns;
            ``phoneName`` decides the colour of each series.
        center: Dict with ``lat`` and ``lon`` keys giving the map centre.
        zoom: Initial zoom level of the map (default 9).

    Returns:
        None. The figure is displayed through ``fig.show()``.
    """
    fig = px.scatter_mapbox(
        df,
        # Point coordinates.
        lat="latDeg",
        lon="lngDeg",
        # One colour per distinct phoneName value.
        color="phoneName",
        # `labels` is documented as a dict mapping column names to display
        # labels; the identity mapping keeps the legend title as "phoneName".
        labels={"phoneName": "phoneName"},
        zoom=zoom,
        center=center,
        height=600,
        width=800,
    )
    fig.update_layout(
        mapbox_style='stamen-terrain',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()
    
def visualize_collection(df, collection):
    """Plot the rows of ``df`` belonging to ``collection``.

    The map is centred on the mean ``latDeg``/``lngDeg`` of the selected rows
    and drawn with ``visualize_trafic``. ``df`` itself is not modified.
    """
    subset = df.loc[df['collectionName'] == collection].copy()
    center = {"lat": subset['latDeg'].mean(), "lon": subset['lngDeg'].mean()}
    visualize_trafic(subset, center)
    
def vis(df, phone):
    """Plot the predictions for one phone together with its ground truth.

    Rows of ``df`` whose ``phone`` equals ``phone`` are relabelled as series
    ``'train'``; the matching ground-truth rows (loaded through
    ``get_ground_truth``) are relabelled ``'gt'``. Both are stacked, given the
    collection name ``'all'`` and drawn with ``visualize_trafic`` centred on
    their mean position.
    """
    truth = get_ground_truth()
    truth['phone'] = truth['collectionName'] + '_' + truth['phoneName']

    # phoneName is overwritten because visualize_trafic colours by that column.
    predicted = df.loc[df['phone'] == phone].assign(phoneName='train')
    reference = truth.loc[truth['phone'] == phone].assign(phoneName='gt')

    combined = pd.concat([predicted, reference])
    combined['collectionName'] = 'all'

    center = {"lat": combined['latDeg'].mean(), "lon": combined['lngDeg'].mean()}
    visualize_trafic(combined, center)

In [64]:
# ground_truth
def get_ground_truth():
    """Load and stack every ``ground_truth.csv`` found under ``INPUT/train``.

    Files are discovered with the glob ``train/*/*/ground_truth.csv`` and
    concatenated in discovery order; the original per-file row indices are
    kept.

    Returns:
        pandas.DataFrame: All ground-truth rows in a single frame.
    """
    input_root = pathlib.Path(INPUT)
    frames = [
        pd.read_csv(csv_path)
        for csv_path in input_root.glob('train/*/*/ground_truth.csv')
    ]
    return pd.concat(frames)

In [65]:
def percentile50(x):
    """Return the 50th percentile (median) of ``x`` using ``np.percentile``."""
    return np.percentile(x, q=50)
def percentile95(x):
    """Return the 95th percentile of ``x`` using ``np.percentile``."""
    return np.percentile(x, q=95)

In [66]:
class train_result:
    """Score a frame of predicted positions against the training ground truth.

    On construction the predictions are inner-joined with the ground truth on
    (collectionName, phoneName, millisSinceGpsEpoch), a per-row haversine
    error ``err`` is computed, and that error is summarised per ``phone``
    (``collectionName + '_' + phoneName``), per ``collectionName`` and per
    ``phoneName``.

    Note: the summary column is called ``p50_p90_mean`` although it holds the
    mean of the 50th and 95th percentile; the name is kept so existing users
    of the result tables keep working.
    """

    def __init__(self, df):
        """Join ``df`` with the ground truth and precompute the error tables.

        Args:
            df: Predictions with ``collectionName``, ``phoneName``,
                ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg`` columns.
        """
        self.df = df
        # Both reference tables are read from disk on every instantiation.
        self.gt = get_ground_truth()
        self.bl = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')

        self.gt = self.gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
        self.df = self.df.merge(self.gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
        self.df['phone'] = self.df['collectionName'] + '_' + self.df['phoneName']
        self.df['err'] = calc_haversine(self.df['latDeg_gt'], self.df['lngDeg_gt'], self.df['latDeg'], self.df['lngDeg'])

        self.phone_res = self.calc_err('phone')
        self.clc_res = self.calc_err('collectionName')
        self.phonename_res = self.calc_err('phoneName')

    def calc_err(self, by):
        """Summarise ``err`` grouped by column ``by``.

        Returns:
            DataFrame indexed by ``by`` with columns ``percentile50``,
            ``percentile95`` and ``p50_p90_mean`` (the mean of the two).
        """
        res = self.df.groupby(by)['err'].agg([percentile50, percentile95])
        res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
        return res

    @property
    def score(self):
        """Mean of the per-phone ``p50_p90_mean`` values."""
        return self.phone_res['p50_p90_mean'].mean()

    @property
    def raw_data(self):
        """Merged prediction/ground-truth frame including ``phone`` and ``err``."""
        return self.df

    @property
    def err(self):
        """Error summary per phone."""
        return self.phone_res

    @property
    def collection_err(self):
        """Error summary per collectionName."""
        return self.clc_res

    @property
    def phonename_err(self):
        """Error summary per phoneName."""
        return self.phonename_res

    def viz_map(self, collection, show_gt=True, show_bl=True):
        """Plot the predictions of one collection on a map.

        Args:
            collection: Value of ``collectionName`` to plot.
            show_gt: Also plot ground truth, as series ``<phoneName>_GT``.
            show_bl: Also plot the baseline, as series ``<phoneName>_BL``.
        """
        cols = ['collectionName', 'phoneName', 'latDeg', 'lngDeg']
        in_collection = self.df[self.df['collectionName'] == collection]

        layers = [in_collection[cols]]
        if show_gt:
            gt_layer = in_collection[['collectionName', 'phoneName', 'latDeg_gt', 'lngDeg_gt']]
            gt_layer = gt_layer.rename(columns={'latDeg_gt':'latDeg', 'lngDeg_gt':'lngDeg'})
            gt_layer['phoneName'] = gt_layer['phoneName'] + '_GT'
            layers.append(gt_layer)
        if show_bl:
            # Explicit copy: the suffix must not be written into self.bl.
            bl_layer = self.bl.loc[self.bl['collectionName'] == collection, cols].copy()
            bl_layer['phoneName'] = bl_layer['phoneName'] + '_BL'
            layers.append(bl_layer)

        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
        # layers in the same order (predictions, ground truth, baseline).
        visualize_collection(pd.concat(layers), collection)

In [73]:
def get_data():
    """Load the competition tables used by this notebook.

    Returns:
        tuple: ``(base_train, base_test, sample_sub, ground_truth)`` where the
        first three are read from CSV files directly under ``INPUT`` and the
        last comes from ``get_ground_truth()``.
    """
    def _read(filename):
        # Same path construction for every table: INPUT + '/' + file name.
        return pd.read_csv(INPUT + '/' + filename)

    base_train = _read('baseline_locations_train.csv')
    base_test = _read('baseline_locations_test.csv')
    sample_sub = _read('sample_submission.csv')
    return base_train, base_test, sample_sub, get_ground_truth()

In [74]:
# Load only the identifying columns; per-submission predictions are merged
# onto these frames by (phone, millisSinceGpsEpoch).
train = pd.read_csv(
    f'{INPUT}/baseline_locations_train.csv',
    usecols=['collectionName', 'phoneName', 'phone', 'millisSinceGpsEpoch'],
)
sub = pd.read_csv(
    f'{INPUT}/sample_submission.csv',
    usecols=['phone', 'millisSinceGpsEpoch'],
)

In [75]:
# Submissions to blend; position i in this list becomes the
# latDeg_pred{i} / lngDeg_pred{i} columns and is weighted by w{i}.
sub_names = [
    'sub035',
    'sub038',
    'sub039',
    'sub040',
    'sub042',
]

In [76]:
# Attach every submission's train and test predictions to the key frames as
# latDeg_pred{i} / lngDeg_pred{i}. The merges use pandas' default inner join.
for i, sub_name in enumerate(sub_names):
    wanted_cols = ['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']
    join_keys = ['phone', 'millisSinceGpsEpoch']
    pred_names = {'latDeg': f'latDeg_pred{i}', 'lngDeg': f'lngDeg_pred{i}'}

    # Training-set predictions of this submission.
    sub_tmp = pd.read_csv(f'../output/{sub_name}/{sub_name}_train.csv', usecols=wanted_cols)
    train = train.merge(sub_tmp.rename(columns=pred_names), on=join_keys)

    # Test-set predictions of this submission.
    sub_tmp = pd.read_csv(f'../output/{sub_name}/{sub_name}_sub.csv', usecols=wanted_cols)
    sub_tmp = sub_tmp.rename(columns=pred_names)
    sub = sub.merge(sub_tmp, on=join_keys)

In [77]:
def objective(trial):
    """Optuna objective: training score of a weighted blend of the predictions.

    One weight ``w{i}`` in [0.01, 1] is sampled for every
    ``latDeg_pred{i}`` / ``lngDeg_pred{i}`` column pair found in the global
    ``train`` frame. The weights are normalised to sum to 1, the blended
    ``latDeg`` / ``lngDeg`` are computed on a copy of ``train`` and scored
    with ``train_result``.

    Args:
        trial: Optuna trial used to sample the weights.

    Returns:
        The blend's ``train_result(...).score``.

    Raises:
        ValueError: If ``train`` has no ``latDeg_pred*`` columns.
    """
    # Derive the number of models from the data instead of hard-coding five,
    # so the objective follows whatever submissions were merged into `train`.
    n_models = sum(str(col).startswith('latDeg_pred') for col in train.columns)
    if n_models == 0:
        raise ValueError("train has no 'latDeg_pred*' columns to blend")

    # suggest_float replaces the deprecated suggest_uniform (same range).
    weights = [trial.suggest_float(f'w{i}', 0.01, 1) for i in range(n_models)]
    w_sum = sum(weights)

    train_ = train.copy()
    for coord in ('latDeg', 'lngDeg'):
        train_[coord] = 0
        for i, weight in enumerate(weights):
            train_[coord] += train_[f'{coord}_pred{i}'] * (weight / w_sum)

    # train_result reloads the ground truth for every trial.
    return train_result(train_).score

In [79]:
# Search for the blend weights that minimise the training score.
# The sampler is seeded so a Restart & Run All reproduces the same weights;
# 'minimize' and the TPE sampler are Optuna's defaults, written out here.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=1000)
print(study.best_params)

[32m[I 2021-08-04 04:16:33,190][0m A new study created in memory with name: no-name-26f6e0bc-9cc6-4f69-89dc-74994b62eceb[0m
[32m[I 2021-08-04 04:16:33,861][0m Trial 0 finished with value: 2.3818180119589534 and parameters: {'w0': 0.03864766724806854, 'w1': 0.018660526926107795, 'w2': 0.05629950171032416, 'w3': 0.6534223418742111, 'w4': 0.8652927318990361}. Best is trial 0 with value: 2.3818180119589534.[0m
[32m[I 2021-08-04 04:16:34,411][0m Trial 1 finished with value: 2.4305056259842743 and parameters: {'w0': 0.8938683932606668, 'w1': 0.5235513330465345, 'w2': 0.8120526023177573, 'w3': 0.385695444874787, 'w4': 0.7353777316197019}. Best is trial 0 with value: 2.3818180119589534.[0m
[32m[I 2021-08-04 04:16:34,964][0m Trial 2 finished with value: 2.4414528830752453 and parameters: {'w0': 0.366387203680525, 'w1': 0.06494125349596089, 'w2': 0.19036863455701736, 'w3': 0.32341706507436996, 'w4': 0.1473367065476829}. Best is trial 0 with value: 2.3818180119589534.[0m
[32m[I 2021-

{'w0': 0.0102184003284836, 'w1': 0.047504437577437726, 'w2': 0.011005201894501654, 'w3': 0.15071042769149892, 'w4': 0.7398554979310139}


In [37]:
# Inspect the training frame: one latDeg_pred{i} / lngDeg_pred{i} column pair per merged submission.
train

Unnamed: 0,millisSinceGpsEpoch,phone,latDeg_pred0,lngDeg_pred0,latDeg_pred1,lngDeg_pred1,latDeg_pred2,lngDeg_pred2,latDeg_pred3,lngDeg_pred3,latDeg_pred4,lngDeg_pred4
0,1273529463442,2020-05-14-US-MTV-1_Pixel4,37.423582,-122.094122,37.423582,-122.094122,37.423582,-122.094122,37.423582,-122.094122,37.423537,-122.093985
1,1273529464442,2020-05-14-US-MTV-1_Pixel4,37.423582,-122.094122,37.423582,-122.094122,37.423582,-122.094122,37.423582,-122.094122,37.423537,-122.093985
2,1273529465442,2020-05-14-US-MTV-1_Pixel4,37.423586,-122.094122,37.423586,-122.094122,37.423586,-122.094122,37.423586,-122.094122,37.423537,-122.093985
3,1273529466442,2020-05-14-US-MTV-1_Pixel4,37.423582,-122.094122,37.423582,-122.094122,37.423582,-122.094122,37.423582,-122.094122,37.423535,-122.093985
4,1273529467442,2020-05-14-US-MTV-1_Pixel4,37.423583,-122.094119,37.423583,-122.094119,37.423583,-122.094119,37.423583,-122.094119,37.423553,-122.094038
...,...,...,...,...,...,...,...,...,...,...,...,...
131337,1303760315000,2021-04-29-US-SJC-2_SamsungS20Ultra,37.334470,-121.899610,37.334471,-121.899612,37.334471,-121.899614,37.334472,-121.899610,37.334469,-121.899612
131338,1303760316000,2021-04-29-US-SJC-2_SamsungS20Ultra,37.334470,-121.899610,37.334471,-121.899612,37.334471,-121.899614,37.334472,-121.899610,37.334469,-121.899612
131339,1303760317000,2021-04-29-US-SJC-2_SamsungS20Ultra,37.334470,-121.899610,37.334471,-121.899612,37.334471,-121.899614,37.334472,-121.899610,37.334469,-121.899612
131340,1303760318000,2021-04-29-US-SJC-2_SamsungS20Ultra,37.334470,-121.899610,37.334471,-121.899612,37.334471,-121.899614,37.334472,-121.899610,37.334469,-121.899612


In [39]:
# Inspect the submission frame: one latDeg_pred{i} / lngDeg_pred{i} column pair per merged submission.
sub

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg_pred0,lngDeg_pred0,latDeg_pred1,lngDeg_pred1,latDeg_pred2,lngDeg_pred2,latDeg_pred3,lngDeg_pred3,latDeg_pred4,lngDeg_pred4
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416609,-122.082028
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416608,-122.082024
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416609,-122.082028
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416609,-122.082028
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416604,-122.082044,37.416604,-122.082044,37.416604,-122.082044,37.416604,-122.082044,37.416607,-122.082028
...,...,...,...,...,...,...,...,...,...,...,...,...
91481,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763185000,37.334580,-121.899395,37.334579,-121.899396,37.334589,-121.899402,37.334589,-121.899401,37.334594,-121.899401
91482,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763186000,37.334580,-121.899395,37.334579,-121.899394,37.334589,-121.899403,37.334589,-121.899401,37.334594,-121.899401
91483,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763187000,37.334580,-121.899395,37.334580,-121.899396,37.334589,-121.899402,37.334589,-121.899401,37.334594,-121.899401
91484,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763188000,37.334580,-121.899395,37.334580,-121.899395,37.334588,-121.899403,37.334589,-121.899401,37.334594,-121.899401


In [13]:
# NOTE(review): `gs` is not defined in any cell visible in this notebook;
# it must already exist in the kernel (presumably a list of output-folder
# names) - confirm and define it explicitly before this cell.
sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv')

# Read every part first and concatenate once: DataFrame.append was removed
# in pandas 2.0 and re-copied the accumulated frame on every iteration.
train_parts = []
test_parts = []
for g in gs:
    train_parts.append(pd.read_csv(f'../output/{g}/{g}_train.csv'))
    test_parts.append(pd.read_csv(f'../output/{g}/{g}_test.csv'))

# An empty `gs` yields empty frames, as the append-based loop did.
train = pd.concat(train_parts) if train_parts else pd.DataFrame()
test = pd.concat(test_parts) if test_parts else pd.DataFrame()

print(len(train))
print(len(test))

131342
91486


In [14]:
# Score the concatenated training predictions and show the per-phone error table.
result = train_result(train)
print(result.score)
display(result.err)

2.35624345289089


Unnamed: 0_level_0,percentile50,percentile95,p50_p90_mean
phone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-14-US-MTV-1_Pixel4,1.049598,1.930156,1.489877
2020-05-14-US-MTV-1_Pixel4XLModded,0.968049,1.835652,1.401851
2020-05-14-US-MTV-2_Pixel4,1.799122,2.583508,2.191315
2020-05-14-US-MTV-2_Pixel4XLModded,1.755462,2.853909,2.304685
2020-05-21-US-MTV-1_Pixel4,1.953191,3.616279,2.784735
2020-05-21-US-MTV-2_Pixel4,0.628295,1.973055,1.300675
2020-05-21-US-MTV-2_Pixel4XL,0.580768,1.936618,1.258693
2020-05-29-US-MTV-1_Pixel4,1.726346,2.585113,2.15573
2020-05-29-US-MTV-1_Pixel4XL,1.708389,2.575766,2.142078
2020-05-29-US-MTV-1_Pixel4XLModded,1.711571,2.587238,2.149404


In [15]:
# Remove the template's coordinate columns so they can be replaced by the
# predictions merged in below.
sub = sub.drop(columns=['latDeg', 'lngDeg'])

In [16]:
# Bring the predicted coordinates from `test` into the submission template;
# the left join keeps every template row even when `test` has no match.
sub = sub.merge(
    test[['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']],
    on=['phone', 'millisSinceGpsEpoch'],
    how='left',
)

In [17]:
# Write the submission to this notebook's output folder as <nb_name>_sub.csv, without the index column.
sub.to_csv(f'{OUTPUT}/{nb_name}_sub.csv', index=False)