# exp131
cost_minimization検討

In [1]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)
import scipy.interpolate
import scipy.sparse

In [2]:
# Collection names partitioned into five groups (g1..g5).
# In the visible cells only g4 is referenced (to choose which phones receive
# cost minimization); the other groups are defined but not used here.
# NOTE(review): the criterion used to form the groups is not shown in this notebook — confirm.
g1 = ['2020-05-14-US-MTV-1', '2020-05-14-US-MTV-2', '2020-05-21-US-MTV-1', '2020-05-21-US-MTV-2',
      '2020-05-29-US-MTV-1', '2020-05-29-US-MTV-2', '2020-06-04-US-MTV-1', '2020-06-05-US-MTV-1',
      '2020-06-05-US-MTV-2', '2020-06-11-US-MTV-1', '2020-07-08-US-MTV-1', '2020-07-17-US-MTV-1',
      '2020-07-17-US-MTV-2', '2020-08-03-US-MTV-1', '2020-08-06-US-MTV-2', '2020-09-04-US-SF-1',
      '2020-09-04-US-SF-2',  '2021-01-04-US-RWC-1', '2021-01-04-US-RWC-2',
      '2020-05-15-US-MTV-1', '2020-05-28-US-MTV-1', '2020-05-28-US-MTV-2', '2020-06-04-US-MTV-2',
      '2020-06-10-US-MTV-1', '2020-06-10-US-MTV-2', '2020-08-03-US-MTV-2', '2020-08-13-US-MTV-1',
      '2021-03-16-US-MTV-2']

g2 = ['2021-01-05-US-SVL-1', '2021-01-05-US-SVL-2', '2021-04-15-US-MTV-1', 
      '2021-03-25-US-PAO-1', '2021-04-02-US-SJC-1', '2021-04-08-US-MTV-1']

g3 = ['2021-03-10-US-SVL-1', '2021-04-26-US-SVL-1', '2021-04-26-US-SVL-2']

# g4: the only group used below; phones in these collections are smoothed.
g4 = ['2021-04-28-US-MTV-1', '2021-04-29-US-MTV-1', 
      '2021-03-16-US-RWC-2', '2021-04-21-US-MTV-1', '2021-04-28-US-MTV-2', '2021-04-29-US-MTV-2']

g5 = ['2021-04-22-US-SJC-1', '2021-04-28-US-SJC-1', '2021-04-29-US-SJC-2', 
      '2021-04-22-US-SJC-2', '2021-04-29-US-SJC-3']

In [3]:
# Name of the upstream experiment whose predictions are the input of this notebook:
# they are read from ../output/{g}/{g}_train.csv and ../output/{g}/{g}_test.csv.
g = 'g1-4_v015'

In [4]:
import ipynb_path

def get_nb_name():
    """Return the current notebook's file name without the ``.ipynb`` extension.

    The notebook path comes from ``ipynb_path.get()``. Only the last path
    component is kept, so the result does not depend on the path containing
    a ``/`` separator.

    Returns:
        str: notebook base name, e.g. ``'exp131'`` for ``.../exp131.ipynb``.
    """
    nb_path = ipynb_path.get()
    # PurePath handles a bare file name as well as a full path; the previous
    # rsplit('/', 1)[1] raised IndexError when no '/' was present.
    file_name = pathlib.PurePath(str(nb_path)).name
    # Strip only a trailing '.ipynb' so that the same text elsewhere in the
    # name is left untouched.
    suffix = '.ipynb'
    if file_name.endswith(suffix):
        file_name = file_name[:-len(suffix)]
    return file_name

In [5]:
# directory setting
# The output folder is named after this notebook, so results are written to
# ../output/<notebook name>/.
nb_name = get_nb_name()
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = '../output/' + nb_name
# exist_ok=True lets the notebook be re-run when the folder already exists.
os.makedirs(OUTPUT, exist_ok=True)

# utils

In [6]:
def get_train_score(df, gt):
    """Score predictions against ground truth.

    ``df`` and ``gt`` are inner-joined on collection, phone name and epoch.
    For each ``collectionName_phoneName`` the 50th and 95th percentile of the
    distance error are averaged, and the mean of those per-phone values is
    returned.
    """
    join_keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    truth = gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
    merged = df.merge(truth, on=join_keys, how='inner')

    # Distance between the true and the predicted position of every matched row.
    merged['err'] = calc_haversine(
        merged['latDeg_gt'], merged['lngDeg_gt'], merged['latDeg'], merged['lngDeg']
    )
    merged['phone'] = merged['collectionName'] + '_' + merged['phoneName']

    # One row per phone with its p50 and p95 error.
    per_phone = merged.groupby('phone')['err'].agg([percentile50, percentile95])
    per_phone_score = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone_score.mean()

In [7]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    All four inputs are array-like and given in decimal degrees. The result
    has the unit of the radius constant used below (6,367,000 — presumably
    metres).
    """
    earth_radius = 6_367_000
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2
    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * earth_radius * np.arcsin(hav ** 0.5)

In [8]:
def visualize_trafic(df, center, zoom=9, mapbox_style='stamen-terrain'):
    """Show the points of ``df`` on an interactive scatter map, coloured by phone.

    Args:
        df: DataFrame with ``latDeg``, ``lngDeg`` and ``phoneName`` columns.
        center: dict with ``lat`` and ``lon`` keys giving the map centre.
        zoom: initial zoom level forwarded to plotly.
        mapbox_style: base-map style forwarded to ``fig.update_layout``.
            Defaults to the previously hard-coded ``'stamen-terrain'``.
            NOTE(review): depending on the installed plotly version this tile
            set may need an API key — pass e.g. ``'open-street-map'`` if the
            map renders blank.
    """
    fig = px.scatter_mapbox(df,

                            # Point coordinates
                            lat="latDeg",
                            lon="lngDeg",

                            # One colour per phone
                            color="phoneName",
                            # plotly expects a dict {column: display label};
                            # a bare string is not a valid value here.
                            labels={"phoneName": "phoneName"},

                            zoom=zoom,
                            center=center,
                            height=600,
                            width=800)
    fig.update_layout(mapbox_style=mapbox_style)
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_layout(title_text="GPS trafic")
    fig.show()
    
def visualize_collection(df, collection):
    """Plot the rows of ``df`` that belong to ``collection``.

    The map is centred on the mean latitude/longitude of the selected rows.
    """
    rows = df.loc[df['collectionName'] == collection].copy()
    map_center = {"lat": rows['latDeg'].mean(), "lon": rows['lngDeg'].mean()}
    visualize_trafic(rows, map_center)

In [9]:
# ground_truth
def get_ground_truth():
    """Load and concatenate every ``ground_truth.csv`` below ``INPUT/train``.

    Files matching ``train/*/*/ground_truth.csv`` are read in sorted path
    order, so the row order of the result is the same on every run and every
    file system.

    Returns:
        pandas.DataFrame: all ground-truth rows stacked; each file's own row
        index is kept (not reset).

    Raises:
        ValueError: if no ground-truth file is found under ``INPUT``.
    """
    input_root = pathlib.Path(INPUT)
    # Path.glob yields entries in file-system order; sort for reproducibility.
    gt_files = sorted(input_root.glob('train/*/*/ground_truth.csv'))
    if not gt_files:
        # pd.concat([]) would raise the same exception type with a message
        # that does not mention the missing files.
        raise ValueError(
            f"No ground_truth.csv files found under '{input_root}' "
            "(pattern: train/*/*/ground_truth.csv)"
        )

    ground_truth = pd.concat([pd.read_csv(gt_file) for gt_file in gt_files])

    return ground_truth

In [10]:
def percentile50(x):
    """Return the 50th percentile (median) of ``x``."""
    return np.percentile(x, q=50)


def percentile95(x):
    """Return the 95th percentile of ``x``."""
    return np.percentile(x, q=95)

In [11]:
class train_result:
    """Join predictions with ground truth and expose error statistics.

    On construction the predictions are inner-joined with the ground truth
    (on collection, phone name and epoch), a per-row distance error ``err``
    is computed, and percentile summaries are built per phone, per collection
    and per phone model.

    Note: the summary column is called ``p50_p90_mean`` but it is the mean of
    the 50th and the 95th percentile; the name is kept for compatibility.
    """

    def __init__(self, df):
        """Build the joined frame and the three error summaries.

        Args:
            df: predictions with ``collectionName``, ``phoneName``,
                ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg`` columns.
        """
        self.df = df
        self.gt = get_ground_truth()
        self.bl = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')

        self.gt = self.gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
        self.df = self.df.merge(self.gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
        self.df['phone'] = self.df['collectionName'] + '_' + self.df['phoneName']
        self.df['err'] =  calc_haversine(self.df['latDeg_gt'], self.df['lngDeg_gt'], self.df['latDeg'], self.df['lngDeg'])

        self.phone_res = self.calc_err('phone')
        self.clc_res = self.calc_err('collectionName')
        self.phonename_res = self.calc_err('phoneName')

    def calc_err(self, by):
        """Return p50, p95 and their mean of ``err`` grouped by column ``by``."""
        res = self.df.groupby(by)['err'].agg([percentile50, percentile95])
        res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
        return res

    @property
    def score(self):
        """Mean over phones of the per-phone (p50 + p95) / 2 error."""
        return self.phone_res['p50_p90_mean'].mean()

    @property
    def raw_data(self):
        """Predictions joined with ground truth, including ``phone`` and ``err``."""
        return self.df

    @property
    def err(self):
        """Error summary per ``collectionName_phoneName``."""
        return self.phone_res

    @property
    def collection_err(self):
        """Error summary per collection."""
        return self.clc_res

    @property
    def phonename_err(self):
        """Error summary per phone model."""
        return self.phonename_res

    def viz_map(self, collection, show_gt=True, show_bl=True):
        """Plot predictions of one collection, optionally with ground truth and baseline.

        Ground-truth points are labelled ``<phoneName>_GT`` and baseline points
        ``<phoneName>_BL`` so that each gets its own colour on the map.

        Args:
            collection: value of ``collectionName`` to plot.
            show_gt: also plot the ground-truth positions.
            show_bl: also plot the baseline positions.
        """
        plot_cols = ['collectionName', 'phoneName', 'latDeg', 'lngDeg']
        in_collection = self.df[self.df['collectionName'] == collection]
        frames = [in_collection[plot_cols]]

        if show_gt:
            # rename() returns a new frame, so the label edit below does not
            # touch self.df.
            gt_points = in_collection[['collectionName', 'phoneName', 'latDeg_gt', 'lngDeg_gt']]
            gt_points = gt_points.rename(columns={'latDeg_gt':'latDeg', 'lngDeg_gt':'lngDeg'})
            gt_points['phoneName'] = gt_points['phoneName'] + '_GT'
            frames.append(gt_points)

        if show_bl:
            # .copy() avoids assigning into a slice of self.bl.
            bl_points = self.bl.loc[self.bl['collectionName'] == collection, plot_cols].copy()
            bl_points['phoneName'] = bl_points['phoneName'] + '_BL'
            frames.append(bl_points)

        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
        # frames the same way (original row indices are kept).
        visualize_collection(pd.concat(frames), collection)

In [12]:
def get_data():
    """Load the competition tables.

    Returns a 4-tuple: baseline train locations, baseline test locations,
    the sample submission, and the concatenated ground truth.
    """
    def _read(file_name):
        # All three tables live directly under INPUT.
        return pd.read_csv(INPUT + '/' + file_name)

    base_train = _read('baseline_locations_train.csv')
    base_test = _read('baseline_locations_test.csv')
    sample_sub = _read('sample_submission.csv')
    return base_train, base_test, sample_sub, get_ground_truth()

In [13]:
def update_baseline(df, rb):
    """Replace positions in ``df`` by those of ``rb`` wherever ``rb`` has a match.

    Rows are matched on ``millisSinceGpsEpoch`` and ``phone`` with a left join.
    Where ``rb`` supplies a non-null latitude, both ``latDeg`` and ``lngDeg``
    are overwritten; all other rows keep their values. A new frame is
    returned and ``df`` itself is left unchanged.
    """
    join_keys = ['millisSinceGpsEpoch', 'phone']
    replacement = rb.rename(columns={'latDeg': 'latDeg_rb', 'lngDeg': 'lngDeg_rb'})
    merged = df.merge(replacement[join_keys + ['latDeg_rb', 'lngDeg_rb']], on=join_keys, how='left')

    # The latitude column alone decides which rows are replaced.
    has_replacement = merged['latDeg_rb'].notnull()
    for target in ('latDeg', 'lngDeg'):
        merged.loc[has_replacement, target] = merged.loc[has_replacement, target + '_rb']

    return merged.drop(columns=['latDeg_rb', 'lngDeg_rb'])

In [14]:
# Load the train/test predictions written by the upstream experiment named in `g`.
train = pd.read_csv(f'../output/{g}/{g}_train.csv')
test = pd.read_csv(f'../output/{g}/{g}_test.csv')

In [15]:
# Reference evaluation of the loaded predictions, before any cost minimization:
# overall score first, then the per-phone error table.
result = train_result(train)
print(result.score)
display(result.err)

2.328042818978987


Unnamed: 0_level_0,percentile50,percentile95,p50_p90_mean
phone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-14-US-MTV-1_Pixel4,0.930336,1.643508,1.286922
2020-05-14-US-MTV-1_Pixel4XLModded,0.856939,1.612388,1.234663
2020-05-14-US-MTV-2_Pixel4,1.873732,2.917014,2.395373
2020-05-14-US-MTV-2_Pixel4XLModded,1.797067,3.344879,2.570973
2020-05-21-US-MTV-1_Pixel4,2.020283,3.567715,2.793999
2020-05-21-US-MTV-2_Pixel4,0.690734,2.480687,1.585711
2020-05-21-US-MTV-2_Pixel4XL,0.628629,2.201404,1.415017
2020-05-29-US-MTV-1_Pixel4,1.76289,2.623871,2.19338
2020-05-29-US-MTV-1_Pixel4XL,1.761999,2.618125,2.190062
2020-05-29-US-MTV-1_Pixel4XLModded,1.760563,2.63316,2.196861


# cost_minimization

In [16]:
def cost_minimization(df, alpha=0.01, beta=0.04):
    """Smooth a track by balancing absolute positions against relative motion.

    Solves for the positions ``x`` that minimise

        sum_i alpha_i * |x_i - xy_hat_i|^2
        + sum_i beta_i * |(x_{i+1} - x_i) - delta_xy_hat_i|^2

    where ``xy_hat`` are the ``latDeg``/``lngDeg`` columns and
    ``delta_xy_hat`` are the ``lat_rel``/``lng_rel`` columns (forward-filled,
    first row dropped). The minimiser is the solution of the sparse linear
    system ``(A + D^T B D) x = A xy_hat + D^T B delta_xy_hat``.

    Args:
        df: rows of a single track in time order, with ``latDeg``, ``lngDeg``,
            ``lat_rel`` and ``lng_rel`` columns.
            NOTE(review): ``lat_rel``/``lng_rel`` are used as the displacement
            from the previous row in the same units as ``latDeg``/``lngDeg`` —
            confirm against the doppler-processing output.
        alpha: weight of the absolute-position term (scalar or length-N array).
        beta: weight of the relative-motion term (scalar or length-(N-1) array).

    Returns:
        numpy.ndarray of shape (N, 2) with the smoothed ``[latDeg, lngDeg]``.
        With fewer than two rows there is nothing to smooth and the input
        positions are returned unchanged.
    """
    # Imported explicitly: `import scipy.sparse` alone does not guarantee that
    # the `linalg` submodule is loaded.
    from scipy.sparse.linalg import spsolve

    xy_hat = df[['latDeg', 'lngDeg']].to_numpy(dtype=float)
    n_points = xy_hat.shape[0]
    if n_points < 2:
        return xy_hat.copy()

    delta_xy_hat = df[['lat_rel', 'lng_rel']].ffill().to_numpy(dtype=float)[1:]

    # Forward fill cannot fill leading gaps. A NaN displacement would turn the
    # whole solution into NaN, so such steps get zero weight (no motion
    # constraint) instead.
    valid_step = np.isfinite(delta_xy_hat).all(axis=1)
    delta_xy_hat = np.where(valid_step[:, None], delta_xy_hat, 0.0)

    alpha_w = np.full(n_points, alpha, dtype=float)
    beta_w = np.full(n_points - 1, beta, dtype=float) * valid_step

    A = scipy.sparse.diags(alpha_w, 0, format='csc')
    B = scipy.sparse.diags(beta_w, 0, format='csc')
    # First-difference operator: (D @ x)[i] = x[i+1] - x[i].
    D = scipy.sparse.diags(
        [-np.ones(n_points - 1), np.ones(n_points - 1)],
        [0, 1],
        shape=(n_points - 1, n_points),
        format='csc',
    )

    # CSC is the format spsolve works on without a conversion warning.
    Q = (A + (D.T @ B @ D)).tocsc()
    c = (A @ xy_hat) + (D.T @ (B @ delta_xy_hat))
    xy_star = spsolve(Q, c)

    return xy_star

In [17]:
# Load the doppler-processing results; their lat_rel / lng_rel columns are
# merged into train/test in the next cell and used by cost_minimization.
# NOTE(review): presumably per-epoch relative displacement in degrees — confirm
# against the doppler_processing_v001 notebook.
dp_train = pd.read_csv('../output/prep/doppler_processing_v001/train_result.csv')
dp_test = pd.read_csv('../output/prep/doppler_processing_v001/test_result.csv')

In [18]:
# Attach lat_rel / lng_rel to every prediction row. The left join keeps all
# rows; rows without a doppler match get NaN in the new columns.
# NOTE(review): not idempotent — running this cell twice merges the columns
# again and pandas renames them with _x/_y suffixes, so 'lat_rel'/'lng_rel'
# would no longer exist for cost_minimization.
train = train.merge(dp_train[['millisSinceGpsEpoch', 'phone', 'lat_rel', 'lng_rel']], on=['millisSinceGpsEpoch', 'phone'], how='left')
test = test.merge(dp_test[['millisSinceGpsEpoch', 'phone', 'lat_rel', 'lng_rel']], on=['millisSinceGpsEpoch', 'phone'], how='left')

In [19]:
# Pass 1: apply cost_minimization phone by phone to the g4 collections,
# skipping SamsungS20Ultra.
# `train` is overwritten in place, so every re-run of this cell applies one
# more smoothing pass on top of the previous result.
phones = train[(train['collectionName'].isin(g4))&(train['phoneName']!='SamsungS20Ultra')]['phone'].unique()
for phone in phones:
    # Row labels of this phone; the smoothed (N, 2) array is written back to them.
    idx = train[train['phone']==phone].index
    train.loc[idx, ['latDeg', 'lngDeg']] = cost_minimization(train.loc[idx])

In [20]:
# Re-score `train` after the first smoothing pass.
result = train_result(train)
print(result.score)
display(result.err)

2.3142003585748268


Unnamed: 0_level_0,percentile50,percentile95,p50_p90_mean
phone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-14-US-MTV-1_Pixel4,0.930336,1.643508,1.286922
2020-05-14-US-MTV-1_Pixel4XLModded,0.856939,1.612388,1.234663
2020-05-14-US-MTV-2_Pixel4,1.873732,2.917014,2.395373
2020-05-14-US-MTV-2_Pixel4XLModded,1.797067,3.344879,2.570973
2020-05-21-US-MTV-1_Pixel4,2.020283,3.567715,2.793999
2020-05-21-US-MTV-2_Pixel4,0.690734,2.480687,1.585711
2020-05-21-US-MTV-2_Pixel4XL,0.628629,2.201404,1.415017
2020-05-29-US-MTV-1_Pixel4,1.76289,2.623871,2.19338
2020-05-29-US-MTV-1_Pixel4XL,1.761999,2.618125,2.190062
2020-05-29-US-MTV-1_Pixel4XLModded,1.760563,2.63316,2.196861


In [21]:
# Pass 2: same selection and smoothing as before, applied to the already
# smoothed `train` (in-place, cumulative).
phones = train[(train['collectionName'].isin(g4))&(train['phoneName']!='SamsungS20Ultra')]['phone'].unique()
for phone in phones:
    idx = train[train['phone']==phone].index
    train.loc[idx, ['latDeg', 'lngDeg']] = cost_minimization(train.loc[idx])

In [22]:
# Re-score `train` after the second smoothing pass.
result = train_result(train)
print(result.score)
display(result.err)

2.3101625859198296


Unnamed: 0_level_0,percentile50,percentile95,p50_p90_mean
phone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-14-US-MTV-1_Pixel4,0.930336,1.643508,1.286922
2020-05-14-US-MTV-1_Pixel4XLModded,0.856939,1.612388,1.234663
2020-05-14-US-MTV-2_Pixel4,1.873732,2.917014,2.395373
2020-05-14-US-MTV-2_Pixel4XLModded,1.797067,3.344879,2.570973
2020-05-21-US-MTV-1_Pixel4,2.020283,3.567715,2.793999
2020-05-21-US-MTV-2_Pixel4,0.690734,2.480687,1.585711
2020-05-21-US-MTV-2_Pixel4XL,0.628629,2.201404,1.415017
2020-05-29-US-MTV-1_Pixel4,1.76289,2.623871,2.19338
2020-05-29-US-MTV-1_Pixel4XL,1.761999,2.618125,2.190062
2020-05-29-US-MTV-1_Pixel4XLModded,1.760563,2.63316,2.196861


In [23]:
# Pass 3: smooth the g4 phones (except SamsungS20Ultra) once more on top of
# the previous passes, then re-score in the same cell.
phones = train[(train['collectionName'].isin(g4))&(train['phoneName']!='SamsungS20Ultra')]['phone'].unique()
for phone in phones:
    idx = train[train['phone']==phone].index
    train.loc[idx, ['latDeg', 'lngDeg']] = cost_minimization(train.loc[idx])
    
result = train_result(train)
print(result.score)
display(result.err)

2.309298394426832


Unnamed: 0_level_0,percentile50,percentile95,p50_p90_mean
phone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-14-US-MTV-1_Pixel4,0.930336,1.643508,1.286922
2020-05-14-US-MTV-1_Pixel4XLModded,0.856939,1.612388,1.234663
2020-05-14-US-MTV-2_Pixel4,1.873732,2.917014,2.395373
2020-05-14-US-MTV-2_Pixel4XLModded,1.797067,3.344879,2.570973
2020-05-21-US-MTV-1_Pixel4,2.020283,3.567715,2.793999
2020-05-21-US-MTV-2_Pixel4,0.690734,2.480687,1.585711
2020-05-21-US-MTV-2_Pixel4XL,0.628629,2.201404,1.415017
2020-05-29-US-MTV-1_Pixel4,1.76289,2.623871,2.19338
2020-05-29-US-MTV-1_Pixel4XL,1.761999,2.618125,2.190062
2020-05-29-US-MTV-1_Pixel4XLModded,1.760563,2.63316,2.196861


In [24]:
# Pass 4: one more cumulative smoothing pass followed by re-scoring.
# The score printed below (2.31016) is higher than after pass 3 (2.30930),
# i.e. repeating the pass stops helping at this point.
phones = train[(train['collectionName'].isin(g4))&(train['phoneName']!='SamsungS20Ultra')]['phone'].unique()
for phone in phones:
    idx = train[train['phone']==phone].index
    train.loc[idx, ['latDeg', 'lngDeg']] = cost_minimization(train.loc[idx])
    
result = train_result(train)
print(result.score)
display(result.err)

2.3101623900698285


Unnamed: 0_level_0,percentile50,percentile95,p50_p90_mean
phone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-14-US-MTV-1_Pixel4,0.930336,1.643508,1.286922
2020-05-14-US-MTV-1_Pixel4XLModded,0.856939,1.612388,1.234663
2020-05-14-US-MTV-2_Pixel4,1.873732,2.917014,2.395373
2020-05-14-US-MTV-2_Pixel4XLModded,1.797067,3.344879,2.570973
2020-05-21-US-MTV-1_Pixel4,2.020283,3.567715,2.793999
2020-05-21-US-MTV-2_Pixel4,0.690734,2.480687,1.585711
2020-05-21-US-MTV-2_Pixel4XL,0.628629,2.201404,1.415017
2020-05-29-US-MTV-1_Pixel4,1.76289,2.623871,2.19338
2020-05-29-US-MTV-1_Pixel4XL,1.761999,2.618125,2.190062
2020-05-29-US-MTV-1_Pixel4XLModded,1.760563,2.63316,2.196861


In [21]:
# Apply the same per-phone smoothing to the test predictions (a single pass).
# NOTE(review): `g4` contains only the collection names listed at the top of
# the notebook — confirm that test collections are expected to match it;
# otherwise `phones` is empty and `test` is left unchanged.
# NOTE(review): this cell's execution count (In [21]) repeats one used by a
# train pass above, so cells were run out of order — confirm how many passes
# the saved `train` received compared with the single pass applied here.
phones = test[(test['collectionName'].isin(g4))&(test['phoneName']!='SamsungS20Ultra')]['phone'].unique()
for phone in phones:
    idx = test[test['phone']==phone].index
    test.loc[idx, ['latDeg', 'lngDeg']] = cost_minimization(test.loc[idx])

In [22]:
# Save the smoothed train predictions (including the merged lat_rel / lng_rel
# columns) to this notebook's output folder.
train.to_csv(OUTPUT + f'/{nb_name}_train.csv', index=False)

In [23]:
# Save the smoothed test predictions to this notebook's output folder.
test.to_csv(OUTPUT + f'/{nb_name}_test.csv', index=False)