# blend002

In [1]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)
from math import * 

In [2]:
import ipynb_path

def get_nb_name():
    nb_path = ipynb_path.get()
    nb_name = nb_path.rsplit('/',1)[1].replace('.ipynb','')
    return nb_name

In [3]:
# directory setting
nb_name = get_nb_name()
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = '../output/' + nb_name
os.makedirs(OUTPUT, exist_ok=True)

# utils

In [4]:
def get_train_score(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score

In [5]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

In [6]:
def visualize_trafic(df, center, zoom=9):
    fig = px.scatter_mapbox(df,
                            
                            # Here, plotly gets, (x,y) coordinates
                            lat="latDeg",
                            lon="lngDeg",
                            
                            #Here, plotly detects color of series
                            color="phoneName",
                            labels="phoneName",
                            
                            zoom=zoom,
                            center=center,
                            height=600,
                            width=800)
    fig.update_layout(mapbox_style='stamen-terrain')
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_layout(title_text="GPS trafic")
    fig.show()
    
def visualize_collection(df, collection):
    target_df = df[df['collectionName']==collection].copy()
    lat_center = target_df['latDeg'].mean()
    lng_center = target_df['lngDeg'].mean()
    center = {"lat":lat_center, "lon":lng_center}
    
    visualize_trafic(target_df, center)
    
def vis(df, phone):
    gt = get_ground_truth()
    gt['phone'] = gt['collectionName'] + '_' + gt['phoneName']
    tmp1 = df[df['phone']==phone].copy()
    tmp2 = gt[gt['phone']==phone].copy()
    
    tmp1['phoneName'] = 'train'
    tmp2['phoneName'] = 'gt'
    
    tmp = pd.concat([tmp1, tmp2])
    tmp['collectionName'] = 'all'
    
    lat_center = tmp['latDeg'].mean()
    lng_center = tmp['lngDeg'].mean()
    center = {"lat":lat_center, "lon":lng_center}
        
    visualize_trafic(tmp, center)

In [7]:
# ground_truth
def get_ground_truth():
    p = pathlib.Path(INPUT)
    gt_files = list(p.glob('train/*/*/ground_truth.csv'))

    gts = []
    for gt_file in gt_files:
        gts.append(pd.read_csv(gt_file))
    ground_truth = pd.concat(gts)

    return ground_truth

In [8]:
def percentile50(x):
    return np.percentile(x, 50)
def percentile95(x):
    return np.percentile(x, 95)

In [9]:
class train_result:
    def __init__(self, df):
        self.df = df
        self.gt = get_ground_truth()
        self.bl = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
        
        self.gt = self.gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
        self.df = self.df.merge(self.gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
        self.df['phone'] = self.df['collectionName'] + '_' + self.df['phoneName']
        self.df['err'] =  calc_haversine(self.df['latDeg_gt'], self.df['lngDeg_gt'], self.df['latDeg'], self.df['lngDeg'])
        
        self.phone_res = self.calc_err('phone')
        self.clc_res = self.calc_err('collectionName')
        self.phonename_res = self.calc_err('phoneName')
        
    def calc_err(self, by):
        res = self.df.groupby(by)['err'].agg([percentile50, percentile95])
        res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
        return res
    
    @property
    def score(self):
        return self.phone_res['p50_p90_mean'].mean()
    @property
    def raw_data(self):
        return self.df
    @property
    def err(self):
        return self.phone_res
    @property
    def collection_err(self):
        return self.clc_res
    @property
    def phonename_err(self):
        return self.phonename_res
    
    def viz_map(self, collection, show_gt=True, show_bl=True):
        tmp = self.df[self.df['collectionName']==collection][['collectionName', 'phoneName', 'latDeg', 'lngDeg']]
        tmp2 = self.df[self.df['collectionName']==collection][['collectionName', 'phoneName', 'latDeg_gt', 'lngDeg_gt']]
        tmp2 = tmp2.rename(columns={'latDeg_gt':'latDeg', 'lngDeg_gt':'lngDeg'})
        tmp2['phoneName'] = tmp2['phoneName'] + '_GT'
        tmp3 = self.bl[self.bl['collectionName']==collection][['collectionName', 'phoneName', 'latDeg', 'lngDeg']]
        tmp3['phoneName'] = tmp3['phoneName'] + '_BL'
        
        if show_gt:
            tmp = tmp.append(tmp2)
        if show_bl:
            tmp = tmp.append(tmp3)
        visualize_collection(tmp, collection)

In [10]:
def get_data():
    base_train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
    base_test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')
    sample_sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv')
    ground_truth = get_ground_truth()
    return base_train, base_test, sample_sub, ground_truth

In [11]:
train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv', usecols=['collectionName', 'phoneName', 'phone', 'millisSinceGpsEpoch'])
sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv', usecols=['phone', 'millisSinceGpsEpoch'])

In [12]:
sub_names = ['sub035', 'sub038', 'sub039', 'sub040', 'sub042', 'sub044']

In [13]:
for i, sub_name in enumerate(sub_names):
    sub_tmp = pd.read_csv(f'../output/{sub_name}/{sub_name}_train.csv', usecols=['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg'])
    sub_tmp = sub_tmp.rename(columns={'latDeg':f'latDeg_pred{i}', 'lngDeg':f'lngDeg_pred{i}'})
    train = train.merge(sub_tmp, on=['phone', 'millisSinceGpsEpoch'])
    
    sub_tmp = pd.read_csv(f'../output/{sub_name}/{sub_name}_sub.csv', usecols=['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg'])
    sub_tmp = sub_tmp.rename(columns={'latDeg':f'latDeg_pred{i}', 'lngDeg':f'lngDeg_pred{i}'})
    sub = sub.merge(sub_tmp, on=['phone', 'millisSinceGpsEpoch'])

In [14]:
def objective(trial):
    w0 = trial.suggest_uniform('w0', 0, 1)
    w1 = trial.suggest_uniform('w1', 0, 1)
    w2 = trial.suggest_uniform('w2', 0, 1)
    w3 = trial.suggest_uniform('w3', 0, 1)
    w4 = trial.suggest_uniform('w4', 0, 1)
    w5 = trial.suggest_uniform('w5', 0, 1)
    train_ = train.copy()
    
    w_sum = w0 + w1 + w2 + w3 + w4 + w5
    train_['latDeg'] = 0
    train_['latDeg'] += train_['latDeg_pred0'] * (w0/w_sum)
    train_['latDeg'] += train_['latDeg_pred1'] * (w1/w_sum)
    train_['latDeg'] += train_['latDeg_pred2'] * (w2/w_sum)
    train_['latDeg'] += train_['latDeg_pred3'] * (w3/w_sum)
    train_['latDeg'] += train_['latDeg_pred4'] * (w4/w_sum)
    train_['latDeg'] += train_['latDeg_pred5'] * (w5/w_sum)
    
    train_['lngDeg'] = 0
    train_['lngDeg'] += train_['lngDeg_pred0'] * (w0/w_sum)
    train_['lngDeg'] += train_['lngDeg_pred1'] * (w1/w_sum)
    train_['lngDeg'] += train_['lngDeg_pred2'] * (w2/w_sum)
    train_['lngDeg'] += train_['lngDeg_pred3'] * (w3/w_sum)
    train_['lngDeg'] += train_['lngDeg_pred4'] * (w4/w_sum)
    train_['lngDeg'] += train_['lngDeg_pred5'] * (w5/w_sum)
    
    score = train_result(train_).score
    return score

In [15]:
study = optuna.create_study()
study.optimize(objective, n_trials=1000)
print(study.best_params)

[32m[I 2021-08-04 16:27:32,379][0m A new study created in memory with name: no-name-c3be7f1c-fbfb-48a1-bac4-aff24a82293c[0m
[32m[I 2021-08-04 16:27:32,828][0m Trial 0 finished with value: 2.397473098745392 and parameters: {'w0': 0.1453677637704195, 'w1': 0.06650710581468045, 'w2': 0.342587844127056, 'w3': 0.8367055450339466, 'w4': 0.8173251695302299, 'w5': 0.0408539498841598}. Best is trial 0 with value: 2.397473098745392.[0m
[32m[I 2021-08-04 16:27:33,231][0m Trial 1 finished with value: 2.389993910471203 and parameters: {'w0': 0.7726270364839491, 'w1': 0.9495392413436631, 'w2': 0.11781044614306457, 'w3': 0.7951499324468178, 'w4': 0.8475592140630135, 'w5': 0.9624821554725786}. Best is trial 1 with value: 2.389993910471203.[0m
[32m[I 2021-08-04 16:27:33,623][0m Trial 2 finished with value: 2.406693829132556 and parameters: {'w0': 0.7312768499268049, 'w1': 0.6177092146629319, 'w2': 0.5146337994485557, 'w3': 0.23687362626154462, 'w4': 0.9182113873352423, 'w5': 0.22200873915698

{'w0': 0.0003615576500455347, 'w1': 0.0007327893508879604, 'w2': 0.0010857889024742686, 'w3': 0.03821026951541812, 'w4': 0.08095866105139875, 'w5': 0.8795197520664624}


In [16]:
w0 = study.best_params['w0']
w1 = study.best_params['w1']
w2 = study.best_params['w2']
w3 = study.best_params['w3']
w4 = study.best_params['w4']
w5 = study.best_params['w5']

w_sum = w0 + w1 + w2 + w3 + w4 + w5
train['latDeg'] = 0
train['latDeg'] += train['latDeg_pred0'] * (w0/w_sum)
train['latDeg'] += train['latDeg_pred1'] * (w1/w_sum)
train['latDeg'] += train['latDeg_pred2'] * (w2/w_sum)
train['latDeg'] += train['latDeg_pred3'] * (w3/w_sum)
train['latDeg'] += train['latDeg_pred4'] * (w4/w_sum)
train['latDeg'] += train['latDeg_pred5'] * (w5/w_sum)

train['lngDeg'] = 0
train['lngDeg'] += train['lngDeg_pred0'] * (w0/w_sum)
train['lngDeg'] += train['lngDeg_pred1'] * (w1/w_sum)
train['lngDeg'] += train['lngDeg_pred2'] * (w2/w_sum)
train['lngDeg'] += train['lngDeg_pred3'] * (w3/w_sum)
train['lngDeg'] += train['lngDeg_pred4'] * (w4/w_sum)
train['lngDeg'] += train['lngDeg_pred5'] * (w5/w_sum)

sub['latDeg'] = 0
sub['latDeg'] += sub['latDeg_pred0'] * (w0/w_sum)
sub['latDeg'] += sub['latDeg_pred1'] * (w1/w_sum)
sub['latDeg'] += sub['latDeg_pred2'] * (w2/w_sum)
sub['latDeg'] += sub['latDeg_pred3'] * (w3/w_sum)
sub['latDeg'] += sub['latDeg_pred4'] * (w4/w_sum)
sub['latDeg'] += sub['latDeg_pred5'] * (w5/w_sum)

sub['lngDeg'] = 0
sub['lngDeg'] += sub['lngDeg_pred0'] * (w0/w_sum)
sub['lngDeg'] += sub['lngDeg_pred1'] * (w1/w_sum)
sub['lngDeg'] += sub['lngDeg_pred2'] * (w2/w_sum)
sub['lngDeg'] += sub['lngDeg_pred3'] * (w3/w_sum)
sub['lngDeg'] += sub['lngDeg_pred4'] * (w4/w_sum)
sub['lngDeg'] += sub['lngDeg_pred5'] * (w5/w_sum)

In [17]:
result = train_result(train)
print(result.score)
display(result.err)

2.3287450623485606


Unnamed: 0_level_0,percentile50,percentile95,p50_p90_mean
phone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-14-US-MTV-1_Pixel4,1.066827,1.712843,1.389835
2020-05-14-US-MTV-1_Pixel4XLModded,0.981949,1.577993,1.279971
2020-05-14-US-MTV-2_Pixel4,1.753144,2.550732,2.151938
2020-05-14-US-MTV-2_Pixel4XLModded,1.721399,2.869122,2.29526
2020-05-21-US-MTV-1_Pixel4,1.929294,3.408669,2.668982
2020-05-21-US-MTV-2_Pixel4,0.634299,1.728228,1.181264
2020-05-21-US-MTV-2_Pixel4XL,0.576495,1.701215,1.138855
2020-05-29-US-MTV-1_Pixel4,1.793598,2.629031,2.211314
2020-05-29-US-MTV-1_Pixel4XL,1.786322,2.623479,2.204901
2020-05-29-US-MTV-1_Pixel4XLModded,1.782933,2.639939,2.211436


In [18]:
sub

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg_pred0,lngDeg_pred0,latDeg_pred1,lngDeg_pred1,latDeg_pred2,lngDeg_pred2,latDeg_pred3,lngDeg_pred3,latDeg_pred4,lngDeg_pred4,latDeg_pred5,lngDeg_pred5,latDeg,lngDeg
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416609,-122.082028,37.416608,-122.082029,37.416608,-122.082030
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416608,-122.082024,37.416608,-122.082029,37.416608,-122.082029
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416609,-122.082028,37.416608,-122.082029,37.416608,-122.082030
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416605,-122.082039,37.416609,-122.082028,37.416608,-122.082029,37.416608,-122.082030
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416604,-122.082044,37.416604,-122.082044,37.416604,-122.082044,37.416604,-122.082044,37.416607,-122.082028,37.416608,-122.082026,37.416608,-122.082027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91481,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763185000,37.334580,-121.899395,37.334579,-121.899396,37.334589,-121.899402,37.334589,-121.899401,37.334594,-121.899401,37.334593,-121.899400,37.334593,-121.899400
91482,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763186000,37.334580,-121.899395,37.334579,-121.899394,37.334589,-121.899403,37.334589,-121.899401,37.334594,-121.899401,37.334593,-121.899400,37.334593,-121.899400
91483,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763187000,37.334580,-121.899395,37.334580,-121.899396,37.334589,-121.899402,37.334589,-121.899401,37.334594,-121.899401,37.334593,-121.899400,37.334593,-121.899400
91484,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763188000,37.334580,-121.899395,37.334580,-121.899395,37.334588,-121.899403,37.334589,-121.899401,37.334594,-121.899401,37.334593,-121.899400,37.334593,-121.899400


In [19]:
sub_final = pd.read_csv(INPUT + '/' + 'sample_submission.csv')
del sub_final['latDeg']
del sub_final['lngDeg']

In [20]:
sub_final = sub_final.merge(sub[['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']], on=['phone', 'millisSinceGpsEpoch'], how='left')

In [21]:
sub_final.to_csv(f'{OUTPUT}/{nb_name}_sub.csv', index=False)

In [22]:
sub_final

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416608,-122.082030
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416608,-122.082029
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416608,-122.082030
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.416608,-122.082030
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416608,-122.082027
...,...,...,...,...
91481,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763185000,37.334593,-121.899400
91482,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763186000,37.334593,-121.899400
91483,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763187000,37.334593,-121.899400
91484,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763188000,37.334593,-121.899400
