# exp065
相対位置で補正を掛けていく要素検証 sp0のdist_predを0にする処理のバグ修正

In [1]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)
from math import * 

In [2]:
import ipynb_path

def get_nb_name():
    """Return the running notebook's file name without directory or ``.ipynb`` suffix.

    The notebook path is obtained from ``ipynb_path.get()``.  The directory part
    is removed with ``os.path.basename`` so a path without any ``/`` no longer
    raises ``IndexError``, and only a trailing ``.ipynb`` is stripped (the
    substring is left alone if it appears elsewhere in the name).
    """
    nb_path = str(ipynb_path.get())
    file_name = os.path.basename(nb_path)
    suffix = '.ipynb'
    if file_name.endswith(suffix):
        file_name = file_name[:-len(suffix)]
    return file_name

In [3]:
# directory setting
# The output folder is named after this notebook and is created if missing.
nb_name = get_nb_name()
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = f'../output/{nb_name}'
os.makedirs(OUTPUT, exist_ok=True)

# utils

In [4]:
def get_train_score(df, gt):
    """Score predicted positions in ``df`` against the ground truth ``gt``.

    Both frames are inner-joined on collectionName / phoneName /
    millisSinceGpsEpoch.  The haversine error of every matched row is grouped
    per phone (collectionName + '_' + phoneName); for each phone the 50th and
    95th percentile errors are averaged, and the mean of those per-phone values
    is returned.  Note that the intermediate column is called 'p50_p90_mean'
    although it is built from the 95th percentile.
    """
    join_keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    truth = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    merged = df.merge(truth, on=join_keys, how='inner')

    # per-row distance error between ground truth and prediction
    merged['err'] = calc_haversine(
        merged['latDeg_gt'], merged['lngDeg_gt'],
        merged['latDeg'], merged['lngDeg'],
    )

    # per-phone percentile summary
    merged['phone'] = merged['collectionName'] + '_' + merged['phoneName']
    per_phone = merged.groupby('phone')['err'].agg([percentile50, percentile95])
    per_phone['p50_p90_mean'] = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone['p50_p90_mean'].mean()

In [5]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.

    The result is ``2 * RADIUS * arcsin(sqrt(a))`` with ``RADIUS = 6_367_000``,
    where ``a`` is the haversine term.  ``a`` is clamped to [0, 1] before the
    square root so that floating-point rounding (possible for nearly antipodal
    points) cannot push ``arcsin`` outside its domain and produce NaN.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Guard against a value marginally outside [0, 1] caused by rounding.
    a = np.clip(a, 0.0, 1.0)
    dist = 2 * RADIUS * np.arcsin(np.sqrt(a))
    return dist

In [6]:
def visualize_trafic(df, center, zoom=9, mapbox_style='stamen-terrain'):
    """Show the rows of ``df`` as points on an interactive map, coloured by phone.

    Parameters
    ----------
    df : DataFrame with 'latDeg', 'lngDeg' and 'phoneName' columns.
    center : dict with 'lat' and 'lon' keys, passed to plotly as the map centre.
    zoom : initial zoom level handed to plotly.
    mapbox_style : base-map style applied to the figure layout.  Defaults to the
        value that was previously hard-coded; pass another style if these tiles
        are not available in your environment.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # `labels` is intentionally not passed: plotly express expects a dict that
    # maps column names to display names, and the bare string given before had
    # no effect on the figure.
    fig = px.scatter_mapbox(df,
                            # point coordinates
                            lat="latDeg",
                            lon="lngDeg",
                            # one colour per phone
                            color="phoneName",
                            zoom=zoom,
                            center=center,
                            height=600,
                            width=800)
    fig.update_layout(
        mapbox_style=mapbox_style,
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()
    
def visualize_collection(df, collection):
    """Plot every row of ``df`` that belongs to ``collection``.

    The map is centred on the mean latitude / longitude of the selected rows
    and drawn with ``visualize_trafic`` at its default zoom.
    """
    in_collection = df['collectionName'] == collection
    target_df = df[in_collection].copy()
    center = {
        "lat": target_df['latDeg'].mean(),
        "lon": target_df['lngDeg'].mean(),
    }
    visualize_trafic(target_df, center)

In [7]:
# ground_truth
def get_ground_truth():
    """Read every ``train/*/*/ground_truth.csv`` under ``INPUT`` into one DataFrame.

    The files are concatenated in the order the glob yields them; the original
    per-file row indices are kept.
    """
    input_root = pathlib.Path(INPUT)
    frames = [
        pd.read_csv(csv_path)
        for csv_path in input_root.glob('train/*/*/ground_truth.csv')
    ]
    return pd.concat(frames)

In [8]:
def percentile50(x):
    """Return the 50th percentile of ``x`` (used as a named groupby aggregator)."""
    p50 = np.percentile(x, q=50)
    return p50
def percentile95(x):
    """Return the 95th percentile of ``x`` (used as a named groupby aggregator)."""
    p95 = np.percentile(x, q=95)
    return p95

In [9]:
class train_result:
    """Evaluate predicted positions against the training ground truth.

    On construction the ground truth (via ``get_ground_truth()``) and the
    baseline training locations are loaded, the predictions are inner-joined
    with the ground truth on collectionName / phoneName / millisSinceGpsEpoch,
    and the haversine error of each matched row is stored in column 'err'.
    Percentile summaries are then pre-computed per phone, per collection and
    per phone model.

    The summary column is named 'p50_p90_mean' although it averages the 50th
    and the 95th percentile.
    """

    def __init__(self, df):
        # df: predictions with collectionName, phoneName, millisSinceGpsEpoch,
        # latDeg and lngDeg columns (those are the columns read below).
        self.df = df
        self.gt = get_ground_truth()
        self.bl = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')

        self.gt = self.gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
        self.df = self.df.merge(self.gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
        self.df['phone'] = self.df['collectionName'] + '_' + self.df['phoneName']
        self.df['err'] =  calc_haversine(self.df['latDeg_gt'], self.df['lngDeg_gt'], self.df['latDeg'], self.df['lngDeg'])

        self.phone_res = self.calc_err('phone')
        self.clc_res = self.calc_err('collectionName')
        self.phonename_res = self.calc_err('phoneName')

    def calc_err(self, by):
        """Return the 50th/95th percentile error and their mean, grouped by ``by``."""
        res = self.df.groupby(by)['err'].agg([percentile50, percentile95])
        res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
        return res

    @property
    def score(self):
        """Mean of the per-phone 'p50_p90_mean' values."""
        return self.phone_res['p50_p90_mean'].mean()

    @property
    def raw_data(self):
        """Merged prediction / ground-truth frame including the 'err' column."""
        return self.df

    @property
    def err(self):
        """Error summary per phone (collection + phone model)."""
        return self.phone_res

    @property
    def collection_err(self):
        """Error summary per collection."""
        return self.clc_res

    @property
    def phonename_err(self):
        """Error summary per phone model."""
        return self.phonename_res

    def viz_map(self, collection, show_gt=True, show_bl=True):
        """Plot the predictions of ``collection`` on a map.

        With ``show_gt`` the ground-truth track is added with '_GT' appended to
        the phone name; with ``show_bl`` the baseline track is added with
        '_BL' appended.
        """
        coord_cols = ['collectionName', 'phoneName', 'latDeg', 'lngDeg']
        in_collection = self.df[self.df['collectionName']==collection]
        layers = [in_collection[coord_cols]]

        if show_gt:
            gt_layer = in_collection[['collectionName', 'phoneName', 'latDeg_gt', 'lngDeg_gt']]
            gt_layer = gt_layer.rename(columns={'latDeg_gt':'latDeg', 'lngDeg_gt':'lngDeg'})
            gt_layer['phoneName'] = gt_layer['phoneName'] + '_GT'
            layers.append(gt_layer)
        if show_bl:
            # explicit copy: the slice is modified on the next line
            bl_layer = self.bl[self.bl['collectionName']==collection][coord_cols].copy()
            bl_layer['phoneName'] = bl_layer['phoneName'] + '_BL'
            layers.append(bl_layer)

        # DataFrame.append no longer exists in pandas 2.x; concat keeps the
        # same row order and indices as the chained appends did.
        visualize_collection(pd.concat(layers), collection)

In [10]:
def get_data():
    """Load the competition inputs.

    Returns a 4-tuple: baseline train locations, baseline test locations,
    the sample submission, and the concatenated ground truth.
    """
    def _read(file_name):
        # every file sits directly under INPUT
        return pd.read_csv(INPUT + '/' + file_name)

    return (
        _read('baseline_locations_train.csv'),
        _read('baseline_locations_test.csv'),
        _read('sample_submission.csv'),
        get_ground_truth(),
    )

# データ取得

In [11]:
# Train-side inputs: exp045 positions, ground truth, and the per-row model outputs
# (heading, travelled distance, speed-0 flag).
prep_dir = '../output/prep'
train = pd.read_csv('../output/exp045/train_ro_rog_kf_sp0_pm_ps.csv')
gt = pd.read_csv(f'{INPUT}/prep/ground_truth_train.csv')
degree = pd.read_csv(f'{prep_dir}/degree_pred_v002/train_degree_pred.csv')
dist = pd.read_csv(f'{prep_dir}/distance_pred_v002/train_distance_pred.csv')
sp0 = pd.read_csv(f'{prep_dir}/speed0_pred_v001/train_sp0_pred.csv')
# same 'phone' key format as used elsewhere: collectionName + '_' + phoneName
gt['phone'] = gt['collectionName'] + '_' + gt['phoneName']

In [12]:
# Rename the distance model's output column so it is unambiguous after merging.
dist = dist.rename({'pred': 'dist_pred'}, axis='columns')

In [13]:
# Left-join each per-row feature onto the training positions by (phone, timestamp).
merge_keys = ['phone', 'millisSinceGpsEpoch']
for feature_df, feature_col in ((degree, 'calc_deg'), (dist, 'dist_pred'), (sp0, 'isSpeed0')):
    train = train.merge(feature_df[merge_keys + [feature_col]], on=merge_keys, how='left')

In [14]:
# Rows flagged as speed 0 get a predicted travel distance of zero.
is_stopped = train['isSpeed0'] == 1
train.loc[is_stopped, 'dist_pred'] = 0

In [15]:
# Display the training frame after the feature merges and the speed-0 override.
train

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,phone,heightAboveWgs84EllipsoidM,x,y,z,...,ydiff,zp,zdiff,dist,xnew,ynew,znew,calc_deg,dist_pred,isSpeed0
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423582,-122.094127,2020-05-14-US-MTV-1_Pixel4,63.5,-2.694608e+06,-4.296551e+06,3.854871e+06,...,,,,,,,,0.981589,0.0,1
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423582,-122.094127,2020-05-14-US-MTV-1_Pixel4,63.5,-2.694608e+06,-4.296551e+06,3.854871e+06,...,0.000000,3.854871e+06,0.000000,0.000000,,,,0.787438,0.0,1
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423582,-122.094127,2020-05-14-US-MTV-1_Pixel4,63.5,-2.694608e+06,-4.296551e+06,3.854871e+06,...,0.000000,3.854871e+06,0.000000,0.000000,,,,0.705617,0.0,1
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423582,-122.094127,2020-05-14-US-MTV-1_Pixel4,63.5,-2.694608e+06,-4.296551e+06,3.854871e+06,...,0.000000,3.854871e+06,0.000000,0.000000,,,,0.645836,0.0,1
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423583,-122.094127,2020-05-14-US-MTV-1_Pixel4,63.5,-2.694608e+06,-4.296552e+06,3.854870e+06,...,-0.191047,3.854871e+06,-0.380402,0.490277,-2.694608e+06,-4.296551e+06,3.854871e+06,0.638460,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131337,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760315000,37.334472,-121.899604,2021-04-29-US-SJC-2_SamsungS20Ultra,63.5,-2.683178e+06,-4.310772e+06,3.847012e+06,...,0.000000,3.847012e+06,0.000000,0.000000,,,,38.975880,0.0,1
131338,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760316000,37.334472,-121.899604,2021-04-29-US-SJC-2_SamsungS20Ultra,63.5,-2.683178e+06,-4.310772e+06,3.847012e+06,...,0.000000,3.847012e+06,0.000000,0.000000,,,,39.006773,0.0,1
131339,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760317000,37.334472,-121.899604,2021-04-29-US-SJC-2_SamsungS20Ultra,63.5,-2.683178e+06,-4.310772e+06,3.847012e+06,...,0.000000,3.847012e+06,0.000000,0.000000,,,,39.003710,0.0,1
131340,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760318000,37.334472,-121.899604,2021-04-29-US-SJC-2_SamsungS20Ultra,63.5,-2.683178e+06,-4.310772e+06,3.847012e+06,...,0.000000,3.847012e+06,0.000000,0.000000,,,,38.949827,0.0,1


# 相対座標

In [16]:
# Ellipsoid identifiers
ELLIPSOID_GRS80 = 1 # GRS80
ELLIPSOID_WGS84 = 2 # WGS84

# Semi-major axis and flattening for each ellipsoid
GEODETIC_DATUM = {
    ELLIPSOID_GRS80: [
        6378137.0,         # [GRS80] semi-major axis
        1 / 298.257222101, # [GRS80] flattening
    ],
    ELLIPSOID_WGS84: [
        6378137.0,         # [WGS84] semi-major axis
        1 / 298.257223563, # [WGS84] flattening
    ],
}

# Upper bound on the number of iterations
ITERATION_LIMIT = 1000

def vincenty_direct(lat, lon, azimuth, distance, ellipsoid=None):
    '''
    Vincenty's formulae (direct problem).
    From a start coordinate (latitude / longitude), an azimuth and a distance,
    compute the end coordinate and an azimuth at the end point.
    :param lat: latitude in degrees
    :param lon: longitude in degrees
    :param azimuth: azimuth in degrees
    :param distance: distance, in the same length unit as the semi-major axis
    :param ellipsoid: ELLIPSOID_GRS80 or ELLIPSOID_WGS84; any other value
        (including the default None) selects WGS84
    :return: dict with keys 'lat', 'lon', 'azimuth' (all in degrees), or None
        when an input is NaN or the iteration does not converge within
        ITERATION_LIMIT steps
    '''

    # A NaN input can never satisfy the convergence test below, so the loop
    # would run ITERATION_LIMIT times only to give up; bail out immediately.
    if any(isnan(value) for value in (lat, lon, azimuth, distance)):
        return None

    # Fetch the semi-major axis (a) and flattening (ƒ) and derive the
    # semi-minor axis (b).  WGS84 is used when the ellipsoid is not specified.
    a, ƒ = GEODETIC_DATUM.get(ellipsoid, GEODETIC_DATUM.get(ELLIPSOID_WGS84))
    b = (1 - ƒ) * a

    # Convert everything except the distance to radians
    φ1 = radians(lat)
    λ1 = radians(lon)
    α1 = radians(azimuth)
    s = distance

    sinα1 = sin(α1)
    cosα1 = cos(α1)

    # Reduced latitude (latitude on the auxiliary sphere)
    U1 = atan((1 - ƒ) * tan(φ1))

    sinU1 = sin(U1)
    cosU1 = cos(U1)
    tanU1 = tan(U1)

    σ1 = atan2(tanU1, cosα1)
    sinα = cosU1 * sinα1
    cos2α = 1 - sinα ** 2
    u2 = cos2α * (a ** 2 - b ** 2) / (b ** 2)
    A = 1 + u2 / 16384 * (4096 + u2 * (-768 + u2 * (320 - 175 * u2)))
    B = u2 / 1024 * (256 + u2 * (-128 + u2 * (74 - 47 * u2)))

    # Initialise σ with s / (b * A)
    σ = s / (b * A)

    # Iterate until σ converges.  Convergence is not guaranteed for every
    # input, hence the iteration cap.
    for i in range(ITERATION_LIMIT):
        cos2σm = cos(2 * σ1 + σ)
        sinσ = sin(σ)
        cosσ = cos(σ)
        Δσ = B * sinσ * (cos2σm + B / 4 * (cosσ * (-1 + 2 * cos2σm ** 2) - B / 6 * cos2σm * (-3 + 4 * sinσ ** 2) * (-3 + 4 * cos2σm ** 2)))
        σ_prev = σ
        σ = s / (b * A) + Δσ

        # Stop once the change is at most 1e-12
        if abs(σ - σ_prev) <= 1e-12:
            break
    else:
        # No convergence within the iteration limit
        return None

    # σ has converged to the required precision; compute the end point
    x = sinU1 * sinσ - cosU1 * cosσ * cosα1
    φ2 = atan2(sinU1 * cosσ + cosU1 * sinσ * cosα1, (1 - ƒ) * sqrt(sinα ** 2 + x ** 2))
    λ = atan2(sinσ * sinα1, cosU1 * cosσ - sinU1 * sinσ * cosα1)
    C = ƒ / 16 * cos2α * (4 + ƒ * (4 - 3 * cos2α))
    L = λ - (1 - C) * ƒ * sinα * (σ + C * sinσ * (cos2σm + C * cosσ * (-1 + 2 * cos2σm ** 2)))
    λ2 = L + λ1

    # NOTE(review): atan2(sinα, -x) shifted by π — this looks like the reverse
    # (back) azimuth at the end point rather than the forward one; confirm
    # before relying on the 'azimuth' value.
    α2 = atan2(sinα, -x) + pi

    return {
        'lat': degrees(φ2),     # latitude
        'lon': degrees(λ2),     # longitude
        'azimuth': degrees(α2), # azimuth
    }

In [17]:
def calc_coordinate_forward(df, a):
    """Blend absolute positions with dead-reckoned ones, forward then backward.

    Forward pass: for each pair of consecutive rows of the same phone, the
    earlier row's position is projected along its 'calc_deg' heading by its
    'dist_pred' distance (``vincenty_direct``), and the later row's position
    becomes ``a * original + (1 - a) * projected``.  Backward pass: the same
    is done from the last row towards the first, projecting each row along the
    reversed heading onto its predecessor.  Updates are sequential, so each
    step sees the positions already adjusted by earlier steps.

    Rows are paired by their order in ``df.index`` (no assumption that labels
    are consecutive integers), and the forward pass now includes the final
    pair of rows, which was previously skipped by an off-by-one in the stop
    condition.

    :param df: DataFrame with 'phone', 'latDeg', 'lngDeg', 'calc_deg' and
        'dist_pred' columns; it is modified in place.
    :param a: weight of the existing position; ``1 - a`` is the weight of the
        projected position.
    :return: the same ``df`` object.
    """
    b = 1 - a  # weight of the projected position (loop invariant)

    def _blend_step(src, dst, azimuth):
        # Project row `src` along `azimuth` and mix the result into row `dst`.
        step = df.at[src, 'dist_pred']
        # Missing heading/distance: vincenty_direct would only return None
        # after exhausting its iteration limit, so skip the call entirely.
        if pd.isna(azimuth) or pd.isna(step):
            return
        res = vincenty_direct(df.at[src, 'latDeg'], df.at[src, 'lngDeg'], azimuth, step)
        if res is None:
            return
        df.at[dst, 'latDeg'] = df.at[dst, 'latDeg'] * a + res['lat'] * b
        df.at[dst, 'lngDeg'] = df.at[dst, 'lngDeg'] * a + res['lon'] * b

    labels = list(df.index)

    # forward in time: row -> next row
    for cur, nxt in zip(labels[:-1], labels[1:]):
        if df.at[cur, 'phone'] != df.at[nxt, 'phone']:
            continue
        _blend_step(cur, nxt, df.at[cur, 'calc_deg'])

    # backward in time: row -> previous row, heading reversed
    for cur, prev in zip(labels[:0:-1], labels[-2::-1]):
        if df.at[cur, 'phone'] != df.at[prev, 'phone']:
            continue

        deg = df.at[cur, 'calc_deg']
        # flip the heading by 180 degrees
        if deg < 180:
            deg += 180
        else:
            deg -= 180

        _blend_step(cur, prev, deg)

    return df

# 比率を変えて結果を確認

In [18]:
# Keep an untouched copy of the merged training frame; every blend ratio below starts from it.
train_org = train.copy()
# Evaluation of the unmodified input, printed as the 'raw' reference score.
res = train_result(train)

In [19]:
# Sweep the blend weight `a`; only the listed collections receive the blended positions,
# every other row keeps its original coordinates.
target_collections = ['2021-04-22-US-SJC-1', '2021-04-28-US-SJC-1', '2021-04-29-US-SJC-2']
coord_cols = ['latDeg', 'lngDeg']

print('raw', res.score)
for a in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9):
    blended = calc_coordinate_forward(train_org.copy(), a)
    target_df = blended[blended['collectionName'].isin(target_collections)].copy()

    train = train_org.copy()
    train.loc[target_df.index, coord_cols] = target_df[coord_cols]
    res = train_result(train)
    print(a, res.score)

raw 3.2930584887106034
0.1 3.370167872593592
0.2 3.2998210460967368
0.3 3.2930364260036975
0.4 3.284848255115682
0.5 3.279585892731464
0.6 3.2798634503599806
0.7 3.2856035538551396
0.8 3.289308057272305
0.9 3.292661907678184


# testでも実施

In [20]:
# Test-side inputs: exp045 submission, baseline rows, per-row model outputs and the submission template.
prep_dir = '../output/prep'
test_pred = pd.read_csv('../output/exp045/exp045_submission.csv')
test = pd.read_csv(f'{INPUT}/baseline_locations_test.csv')
degree = pd.read_csv(f'{prep_dir}/degree_pred_v002/test_degree_pred.csv')
dist = pd.read_csv(f'{prep_dir}/distance_pred_v002/test_distance_pred.csv')
sp0 = pd.read_csv(f'{prep_dir}/speed0_pred_v001/test_sp0_pred.csv')
# distance model output column renamed to match the train-side naming
dist = dist.rename({'pred': 'dist_pred'}, axis='columns')
sub = pd.read_csv(f'{INPUT}/sample_submission.csv')

In [21]:
# Replace the baseline coordinates with the exp045 submission coordinates.
# NOTE(review): the assignment aligns on the row index, so this presumably relies on both
# CSVs listing the same rows in the same order — confirm.
test[['latDeg', 'lngDeg']] = test_pred[['latDeg', 'lngDeg']]

In [22]:
# Left-join each per-row feature onto the test positions by (phone, timestamp).
merge_keys = ['phone', 'millisSinceGpsEpoch']
for feature_df, feature_col in ((degree, 'calc_deg'), (dist, 'dist_pred'), (sp0, 'isSpeed0')):
    test = test.merge(feature_df[merge_keys + [feature_col]], on=merge_keys, how='left')

In [23]:
# Rows of the TEST frame flagged as speed 0 get a predicted travel distance of zero.
# The mask has to be built from `test` itself: a mask taken from `train` is aligned by
# row label and would apply the training flags to unrelated test rows.
test.loc[test['isSpeed0']==1, 'dist_pred'] = 0

In [24]:
# NOTE(review): at this point `target_df` is still the train-side frame left over from the
# blend-ratio sweep; the test-side frame is only built in the following cell.
target_df

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,phone,heightAboveWgs84EllipsoidM,x,y,z,...,ydiff,zp,zdiff,dist,xnew,ynew,znew,calc_deg,dist_pred,isSpeed0
103937,2021-04-22-US-SJC-1,Pixel4,1303160575446,37.334583,-121.899416,2021-04-22-US-SJC-1_Pixel4,63.5,-2.683160e+06,-4.310775e+06,3.847021e+06,...,,,,,,,,35.296870,0.0,1
103938,2021-04-22-US-SJC-1,Pixel4,1303160576446,37.334582,-121.899417,2021-04-22-US-SJC-1_Pixel4,63.5,-2.683160e+06,-4.310775e+06,3.847021e+06,...,0.000000,3.847021e+06,0.000000,0.000000,,,,35.241133,0.0,1
103939,2021-04-22-US-SJC-1,Pixel4,1303160577446,37.334582,-121.899420,2021-04-22-US-SJC-1_Pixel4,63.5,-2.683160e+06,-4.310775e+06,3.847021e+06,...,-0.043778,3.847021e+06,0.018598,0.108196,-2.683160e+06,-4.310775e+06,3.847021e+06,36.144384,0.0,1
103940,2021-04-22-US-SJC-1,Pixel4,1303160578446,37.334583,-121.899416,2021-04-22-US-SJC-1_Pixel4,63.5,-2.683160e+06,-4.310775e+06,3.847021e+06,...,0.000000,3.847021e+06,0.000000,0.000000,,,,36.580848,0.0,1
103941,2021-04-22-US-SJC-1,Pixel4,1303160579446,37.334583,-121.899415,2021-04-22-US-SJC-1_Pixel4,63.5,-2.683160e+06,-4.310775e+06,3.847021e+06,...,0.000000,3.847021e+06,0.000000,0.000000,,,,36.755717,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131337,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760315000,37.334472,-121.899604,2021-04-29-US-SJC-2_SamsungS20Ultra,63.5,-2.683178e+06,-4.310772e+06,3.847012e+06,...,0.000000,3.847012e+06,0.000000,0.000000,,,,38.975880,0.0,1
131338,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760316000,37.334472,-121.899604,2021-04-29-US-SJC-2_SamsungS20Ultra,63.5,-2.683178e+06,-4.310772e+06,3.847012e+06,...,0.000000,3.847012e+06,0.000000,0.000000,,,,39.006773,0.0,1
131339,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760317000,37.334472,-121.899604,2021-04-29-US-SJC-2_SamsungS20Ultra,63.5,-2.683178e+06,-4.310772e+06,3.847012e+06,...,0.000000,3.847012e+06,0.000000,0.000000,,,,39.003710,0.0,1
131340,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760318000,37.334472,-121.899604,2021-04-29-US-SJC-2_SamsungS20Ultra,63.5,-2.683178e+06,-4.310772e+06,3.847012e+06,...,0.000000,3.847012e+06,0.000000,0.000000,,,,38.949827,0.0,1


In [25]:
# Blend the test positions with a = 0.5 and write the result back only for the listed collections.
target_collections = ['2021-04-22-US-SJC-2', '2021-04-29-US-SJC-3']
blended = calc_coordinate_forward(test.copy(), 0.5)
target_df = blended[blended['collectionName'].isin(target_collections)].copy()

coord_cols = ['latDeg', 'lngDeg']
test.loc[target_df.index, coord_cols] = target_df[coord_cols]

In [26]:
# Copy the final coordinates into the submission template and save it under this notebook's output folder.
for coord in ('latDeg', 'lngDeg'):
    sub[coord] = test[coord]
sub.to_csv(f'{OUTPUT}/{nb_name}_submission.csv', index=False)