# exp046
速度・角度での相対座標で欠損埋めを、gtデータで処理確認

In [183]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)
from math import * 

In [184]:
ro_th = 50 # 相対移動距離をもとにした異常値除去の閾値
rog_th = 10 # ground_truthをもとにした異常値除去の閾値

# ground_truthをもとにした異常値除去を行うcollection
rog_target = ['2021-04-22-US-SJC-1', '2021-04-29-US-SJC-2', '2021-04-28-US-SJC-1', '2021-04-22-US-SJC-2', '2021-04-29-US-SJC-3',
              '2021-04-28-US-MTV-1', '2021-04-29-US-MTV-1', '2021-03-16-US-RWC-2', '2021-04-28-US-MTV-2', '2021-04-29-US-MTV-2',
              '2021-04-26-US-SVL-2', '2021-03-10-US-SVL-1', '2021-04-26-US-SVL-1',
              '2021-04-21-US-MTV-1', '2021-04-28-US-MTV-1', '2021-04-29-US-MTV-1']

In [185]:
import ipynb_path

def get_nb_name():
    nb_path = ipynb_path.get()
    nb_name = nb_path.rsplit('/',1)[1].replace('.ipynb','')
    return nb_name

In [186]:
# directory setting
nb_name = get_nb_name()
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = '../output/' + nb_name
os.makedirs(OUTPUT, exist_ok=True)

# utils

In [187]:
def get_train_score(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score

In [188]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

In [189]:
def visualize_trafic(df, center, zoom=9):
    fig = px.scatter_mapbox(df,
                            
                            # Here, plotly gets, (x,y) coordinates
                            lat="latDeg",
                            lon="lngDeg",
                            
                            #Here, plotly detects color of series
                            color="phoneName",
                            labels="phoneName",
                            
                            zoom=zoom,
                            center=center,
                            height=600,
                            width=800)
    fig.update_layout(mapbox_style='stamen-terrain')
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_layout(title_text="GPS trafic")
    fig.show()
    
def visualize_collection(df, collection):
    target_df = df[df['collectionName']==collection].copy()
    lat_center = target_df['latDeg'].mean()
    lng_center = target_df['lngDeg'].mean()
    center = {"lat":lat_center, "lon":lng_center}
    
    visualize_trafic(target_df, center)

In [190]:
# ground_truth
def get_ground_truth():
    p = pathlib.Path(INPUT)
    gt_files = list(p.glob('train/*/*/ground_truth.csv'))

    gts = []
    for gt_file in gt_files:
        gts.append(pd.read_csv(gt_file))
    ground_truth = pd.concat(gts)

    return ground_truth

In [191]:
def percentile50(x):
    return np.percentile(x, 50)
def percentile95(x):
    return np.percentile(x, 95)

In [192]:
class train_result:
    def __init__(self, df):
        self.df = df
        self.gt = get_ground_truth()
        self.bl = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
        
        self.gt = self.gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
        self.df = self.df.merge(self.gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
        self.df['phone'] = self.df['collectionName'] + '_' + self.df['phoneName']
        self.df['err'] =  calc_haversine(self.df['latDeg_gt'], self.df['lngDeg_gt'], self.df['latDeg'], self.df['lngDeg'])
        
        self.phone_res = self.calc_err('phone')
        self.clc_res = self.calc_err('collectionName')
        self.phonename_res = self.calc_err('phoneName')
        
    def calc_err(self, by):
        res = self.df.groupby(by)['err'].agg([percentile50, percentile95])
        res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
        return res
    
    @property
    def score(self):
        return self.phone_res['p50_p90_mean'].mean()
    @property
    def raw_data(self):
        return self.df
    @property
    def err(self):
        return self.phone_res
    @property
    def collection_err(self):
        return self.clc_res
    @property
    def phonename_err(self):
        return self.phonename_res
    
    def viz_map(self, collection, show_gt=True, show_bl=True):
        tmp = self.df[self.df['collectionName']==collection][['collectionName', 'phoneName', 'latDeg', 'lngDeg']]
        tmp2 = self.df[self.df['collectionName']==collection][['collectionName', 'phoneName', 'latDeg_gt', 'lngDeg_gt']]
        tmp2 = tmp2.rename(columns={'latDeg_gt':'latDeg', 'lngDeg_gt':'lngDeg'})
        tmp2['phoneName'] = tmp2['phoneName'] + '_GT'
        tmp3 = self.bl[self.bl['collectionName']==collection][['collectionName', 'phoneName', 'latDeg', 'lngDeg']]
        tmp3['phoneName'] = tmp3['phoneName'] + '_BL'
        
        if show_gt:
            tmp = tmp.append(tmp2)
        if show_bl:
            tmp = tmp.append(tmp3)
        visualize_collection(tmp, collection)

In [193]:
def get_data():
    base_train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
    base_test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')
    sample_sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv')
    ground_truth = get_ground_truth()
    return base_train, base_test, sample_sub, ground_truth

In [194]:
def calc_degree_by_gt(df):
    phones = df['phone'].unique()
    df['deg_pred'] = np.nan
    
    for idx in range(len(df)-1):
        if df.at[idx, 'phone'] != df.at[idx+1, 'phone']:
            continue
            
        lat = df.at[idx, 'latDeg_gt']
        lng = df.at[idx, 'lngDeg_gt']
        lat_next = df.at[idx+1, 'latDeg_gt']
        lng_next = df.at[idx+1, 'lngDeg_gt']
        
        res = vincenty_inverse(lat, lng, lat_next, lng_next)
        if res:
            df.at[idx, 'deg_pred'] = res
    
    return df

In [195]:
# Vincenty's formulae
# refarence https://qiita.com/r-fuji/items/99ca549b963cedc106ab

def vincenty_inverse(lat1, lon1, lat2, lon2):

    # Not advanced
    if isclose(lat1, lat2) and isclose(lon1, lon2):
        return False
    
    # WGS84
    a = 6378137.0
    ƒ = 1 / 298.257223563
    b = (1 - ƒ) * a

    lat_1 = atan((1 - ƒ) * tan(radians(lat1)))
    lat_2 = atan((1 - ƒ) * tan(radians(lat2)))
    
    lon_diff = radians(lon2) - radians(lon1)
    λ = lon_diff

    for i in range(1000):
        sinλ = sin(λ)
        cosλ = cos(λ)
        sinσ = sqrt((cos(lat_2) * sinλ) ** 2 + (cos(lat_1) * sin(lat_2) - sin(lat_1) * cos(lat_2) * cosλ) ** 2)
        cosσ = sin(lat_1) * sin(lat_2) + cos(lat_1) * cos(lat_2) * cosλ
        σ = atan2(sinσ, cosσ)
        sinα = cos(lat_1) * cos(lat_2) * sinλ / sinσ
        cos2α = 1 - sinα ** 2
        cos2σm = cosσ - 2 * sin(lat_1) * sin(lat_2) / cos2α
        C = ƒ / 16 * cos2α * (4 + ƒ * (4 - 3 * cos2α))
        λʹ = λ
        λ = lon_diff + (1 - C) * ƒ * sinα * (σ + C * sinσ * (cos2σm + C * cosσ * (-1 + 2 * cos2σm ** 2)))
        
        if abs(λ - λʹ) <= 1e-12:
            break
    else:
        return None

    α = atan2(cos(lat_2) * sinλ, cos(lat_1) * sin(lat_2) - sin(lat_1) * cos(lat_2) * cosλ)

    if α < 0:
        α = α + pi * 2

    return degrees(α)

# 相対移動距離をもとにした外れ値除去

In [196]:
train, test, sub, gt = get_data()
gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
train = train.merge(gt[['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'speedMps', 'latDeg_gt', 'lngDeg_gt']], on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='left')
train = calc_degree_by_gt(train)
_, _, _, gt = get_data()

In [197]:
def add_distance_diff(df):
    df['latDeg_prev'] = df['latDeg'].shift(1)
    df['latDeg_next'] = df['latDeg'].shift(-1)
    df['lngDeg_prev'] = df['lngDeg'].shift(1)
    df['lngDeg_next'] = df['lngDeg'].shift(-1)
    df['phone_prev'] = df['phone'].shift(1)
    df['phone_next'] = df['phone'].shift(-1)
    
    df['dist_prev'] = calc_haversine(df['latDeg'], df['lngDeg'], df['latDeg_prev'], df['lngDeg_prev'])
    df['dist_next'] = calc_haversine(df['latDeg'], df['lngDeg'], df['latDeg_next'], df['lngDeg_next'])
    
    df.loc[df['phone']!=df['phone_prev'], ['latDeg_prev', 'lngDeg_prev', 'dist_prev']] = np.nan
    df.loc[df['phone']!=df['phone_next'], ['latDeg_next', 'lngDeg_next', 'dist_next']] = np.nan
    
    return df

In [198]:
# reject outlier
train = add_distance_diff(train)
train.loc[((train['dist_prev'] > ro_th) & (train['dist_next'] > ro_th)), ['latDeg', 'lngDeg']] = np.nan
train.to_csv(OUTPUT + '/train_ro.csv', index=False)

# ground_truthを基準にした外れ値除去

In [199]:
def get_osmnx_data():
    p = pathlib.Path(INPUT)
    files = list(p.glob('prep/osmnx/*.csv'))

    dfs = []
    for file in files:
        dfs.append(pd.read_csv(file))
    osmnx = pd.concat(dfs)

    return osmnx

In [200]:
def remove_based_on_gt(target_df, target_gt, th):
    osmnx_data = get_osmnx_data()
    target_gt = target_gt.append(osmnx_data)
    
    for idx in target_df.index:
        lat = target_df.at[idx, 'latDeg']
        lng = target_df.at[idx, 'lngDeg']
        collection = target_df.at[idx, 'collectionName']
        
        if collection in ['2021-04-22-US-SJC-1', '2021-04-29-US-SJC-2', '2021-04-28-US-SJC-1', '2021-04-22-US-SJC-2', '2021-04-29-US-SJC-3']:
            th = 3
        
        if np.isnan(lat):
            continue
        else:
            target_gt['latDeg_pred'] = lat
            target_gt['lngDeg_pred'] = lng
            target_gt['dist'] = calc_haversine(target_gt['latDeg'], target_gt['lngDeg'], target_gt['latDeg_pred'], target_gt['lngDeg_pred'])
            closest_dist = target_gt['dist'].min()
            if closest_dist > th:
                target_df.at[idx, 'latDeg'] = np.nan
                target_df.at[idx, 'lngDeg'] = np.nan
    return target_df

In [201]:
# 楕円体
ELLIPSOID_GRS80 = 1 # GRS80
ELLIPSOID_WGS84 = 2 # WGS84

# 楕円体別の長軸半径と扁平率
GEODETIC_DATUM = {
    ELLIPSOID_GRS80: [
        6378137.0,         # [GRS80]長軸半径
        1 / 298.257222101, # [GRS80]扁平率
    ],
    ELLIPSOID_WGS84: [
        6378137.0,         # [WGS84]長軸半径
        1 / 298.257223563, # [WGS84]扁平率
    ],
}

# 反復計算の上限回数
ITERATION_LIMIT = 1000

'''
Vincenty法(順解法)
始点の座標(緯度経度)と方位角と距離から、終点の座標と方位角を求める
:param lat: 緯度
:param lon: 経度
:param azimuth: 方位角
:param distance: 距離
:param ellipsoid: 楕円体
:return: 終点の座標、方位角
'''
def vincenty_direct(lat, lon, azimuth, distance, ellipsoid=None):

    # 計算時に必要な長軸半径(a)と扁平率(ƒ)を定数から取得し、短軸半径(b)を算出する
    # 楕円体が未指定の場合はGRS80の値を用いる
    a, ƒ = GEODETIC_DATUM.get(ellipsoid, GEODETIC_DATUM.get(ELLIPSOID_WGS84))
    b = (1 - ƒ) * a

    # ラジアンに変換する(距離以外)
    φ1 = radians(lat)
    λ1 = radians(lon)
    α1 = radians(azimuth)
    s = distance

    sinα1 = sin(α1)
    cosα1 = cos(α1)

    # 更成緯度(補助球上の緯度)
    U1 = atan((1 - ƒ) * tan(φ1))

    sinU1 = sin(U1)
    cosU1 = cos(U1)
    tanU1 = tan(U1)

    σ1 = atan2(tanU1, cosα1)
    sinα = cosU1 * sinα1
    cos2α = 1 - sinα ** 2
    u2 = cos2α * (a ** 2 - b ** 2) / (b ** 2)
    A = 1 + u2 / 16384 * (4096 + u2 * (-768 + u2 * (320 - 175 * u2)))
    B = u2 / 1024 * (256 + u2 * (-128 + u2 * (74 - 47 * u2)))

    # σをs/(b*A)で初期化
    σ = s / (b * A)

    # 以下の計算をσが収束するまで反復する
    # 地点によっては収束しないことがあり得るため、反復回数に上限を設ける
    for i in range(ITERATION_LIMIT):
        cos2σm = cos(2 * σ1 + σ)
        sinσ = sin(σ)
        cosσ = cos(σ)
        Δσ = B * sinσ * (cos2σm + B / 4 * (cosσ * (-1 + 2 * cos2σm ** 2) - B / 6 * cos2σm * (-3 + 4 * sinσ ** 2) * (-3 + 4 * cos2σm ** 2)))
        σʹ = σ
        σ = s / (b * A) + Δσ

        # 偏差が.000000000001以下ならbreak
        if abs(σ - σʹ) <= 1e-12:
            break
    else:
        # 計算が収束しなかった場合はNoneを返す
        return None

    # σが所望の精度まで収束したら以下の計算を行う
    x = sinU1 * sinσ - cosU1 * cosσ * cosα1
    φ2 = atan2(sinU1 * cosσ + cosU1 * sinσ * cosα1, (1 - ƒ) * sqrt(sinα ** 2 + x ** 2))
    λ = atan2(sinσ * sinα1, cosU1 * cosσ - sinU1 * sinσ * cosα1)
    C = ƒ / 16 * cos2α * (4 + ƒ * (4 - 3 * cos2α))
    L = λ - (1 - C) * ƒ * sinα * (σ + C * sinσ * (cos2σm + C * cosσ * (-1 + 2 * cos2σm ** 2)))
    λ2 = L + λ1

    α2 = atan2(sinα, -x) + pi

    return {
        'lat': degrees(φ2),     # 緯度
        'lon': degrees(λ2),     # 経度
        'azimuth': degrees(α2), # 方位角
    }

In [202]:
def interpolate_by_deg_dist_pred(df):    
    # 前後座標の再設定
    for c in ['latDeg', 'lngDeg', 'deg_pred', 'dist_pred']:
        df[c+'_prev'] = df[c].shift(1)
        df[c+'_next'] = df[c].shift(-1)
        df.loc[df['phone']!=df['phone'].shift(1), c] = np.nan
        df.loc[df['phone']!=df['phone'].shift(-1), c] = np.nan
    
    # 欠損値グループ生成
    null_idx = df[df['latDeg'].isnull()].index
    prev_not_null_idx = df[~df['latDeg_prev'].isnull()].index
    head_idx = df.groupby('phone').head(1).index
    df.loc[list(set(null_idx) & set(prev_not_null_idx)), 'g'] = 1
    df.loc[list(set(null_idx) & set(head_idx)), 'g'] = 1
    df['g'] = df['g'].cumsum()
    df['g'] = df['g'].fillna(method='ffill')
    df.loc[~df['latDeg'].isnull(),'g'] = np.nan
    

    for g in df['g'].dropna().unique():
        target_idx = list(df[df['g']==g].index)
        lat_forward_results = np.full(len(target_idx), np.nan)
        lng_forward_results = np.full(len(target_idx), np.nan)
        lat_reversed_results = np.full(len(target_idx), np.nan)
        lng_reversed_results = np.full(len(target_idx), np.nan)

        # 順方向の補間
        for i, idx in enumerate(target_idx):
            if i==0:
                lat = df.at[idx, 'latDeg_prev']
                lng = df.at[idx, 'lngDeg_prev']
                lat_interp_prev = np.nan
                lng_interp_prev = np.nan
            else:
                lat = lat_interp_prev
                lng = lng_interp_prev
            deg = df.at[idx, 'deg_pred_prev']
            dist = df.at[idx, 'dist_pred_prev']

            if np.isnan(lat) or np.isnan(lng) or np.isnan(deg) or np.isnan(dist):
                break
            else:
                res = vincenty_direct(lat, lng, deg, dist)
                lat_forward_results[i] = res['lat']
                lat_interp_prev = res['lat']
                lng_forward_results[i] = res['lon']
                lng_interp_prev = res['lon']

        # 逆方向の補間
        target_idx.reverse()
        for i, idx in enumerate(target_idx):
            if i==0:
                lat = df.at[idx, 'latDeg_next']
                lng = df.at[idx, 'lngDeg_next']
                lat_interp_next = np.nan
                lng_interp_next = np.nan
            else:
                lat = lat_interp_next
                lng = lng_interp_next
            deg = df.at[idx, 'deg_pred_next']
            dist = df.at[idx, 'dist_pred_next']
            
            # 角度を反転
            if deg < 180:
                deg += 180
            else:
                deg -= 180

            if np.isnan(lat) or np.isnan(lng) or np.isnan(deg) or np.isnan(dist):
                break
            else:
                res = vincenty_direct(lat, lng, deg, dist)
                lat_reversed_results[i] = res['lat']
                lat_interp_next = res['lat']
                lng_reversed_results[i] = res['lon']
                lng_interp_next = res['lon']

        # 順方向と逆方向の結果を平均して、補間値を算出
        # lat_results = np.nanmean(np.array([lat_forward_results, lat_reversed_results[::-1]]), axis=0)
        # lng_results = np.nanmean(np.array([lng_forward_results, lng_reversed_results[::-1]]), axis=0)

        # 結果を元のdataframeに格納
        # target_idx.reverse()
        df.loc[target_idx, 'lat_forward'] = lat_forward_results
        df.loc[target_idx, 'lng_forward'] = lng_forward_results
        df.loc[target_idx, 'lat_reversed'] = lat_reversed_results[::-1]
        df.loc[target_idx, 'lng_reversed'] = lng_reversed_results[::-1]
        
    return df

In [203]:
rog_df = train[train['collectionName'].isin(rog_target)].copy()
rog_gt = gt[gt['collectionName'].isin(rog_target)].copy()
rog_df = remove_based_on_gt(rog_df, rog_gt, rog_th)

In [204]:
gt

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,timeSinceFirstFixSeconds,hDop,vDop,speedMps,courseDegree
0,2020-05-21-US-MTV-2,Pixel4,1274131364434,37.628927,-122.426295,65.36,64.43,2.60,0.0,0.0,154.3
1,2020-05-21-US-MTV-2,Pixel4,1274131365434,37.628927,-122.426295,65.36,65.43,2.60,0.0,0.0,154.3
2,2020-05-21-US-MTV-2,Pixel4,1274131366434,37.628927,-122.426295,65.36,66.43,2.60,0.0,0.0,154.3
3,2020-05-21-US-MTV-2,Pixel4,1274131367434,37.628927,-122.426295,65.36,67.43,2.60,0.0,0.0,154.3
4,2020-05-21-US-MTV-2,Pixel4,1274131368434,37.628927,-122.426295,65.36,68.43,2.60,0.0,0.0,154.3
...,...,...,...,...,...,...,...,...,...,...,...
2001,2020-05-29-US-MTV-2,Pixel4XL,1274832059447,37.415923,-122.080712,37.54,2120.45,0.95,0.0,0.0,303.1
2002,2020-05-29-US-MTV-2,Pixel4XL,1274832060447,37.415923,-122.080712,37.54,2121.45,1.40,0.0,0.0,303.1
2003,2020-05-29-US-MTV-2,Pixel4XL,1274832061447,37.415923,-122.080712,37.54,2122.45,1.60,0.0,0.0,303.1
2004,2020-05-29-US-MTV-2,Pixel4XL,1274832062447,37.415923,-122.080712,37.54,2123.45,1.75,0.0,0.0,303.1


In [205]:
dist = pd.read_csv('../output/prep/distance_pred_v001/train_distance_pred.csv')
dist = dist.rename(columns={'pred':'dist_pred'})
deg = pd.read_csv('../output/prep/degree_pred_v001/train_degree_pred.csv')
deg = deg.rename(columns={'calc_deg':'deg_pred'})

idx = rog_df.index
#rog_df = rog_df.merge(dist, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='left')
#rog_df = rog_df.merge(deg, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='left')
rog_df = rog_df.rename(columns={'speedMps':'dist_pred'})

rog_df['idx'] = idx
rog_df = rog_df.set_index('idx')

In [206]:
rog_df

Unnamed: 0_level_0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,dist_pred,latDeg_gt,lngDeg_gt,deg_pred,latDeg_prev,latDeg_next,lngDeg_prev,lngDeg_next,phone_prev,phone_next,dist_prev,dist_next
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
93986,2021-03-10-US-SVL-1,Pixel4XL,1299453168638,37.371256,-122.048081,24.93,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.047980,,,37.371295,,-122.048026,2021-01-05-US-SVL-2_Pixel4XL,2021-03-10-US-SVL-1_Pixel4XL,,6.536023
93987,2021-03-10-US-SVL-1,Pixel4XL,1299453169651,37.371295,-122.048026,6.56,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.047980,,37.371256,37.371211,-122.048081,-122.048033,2021-03-10-US-SVL-1_Pixel4XL,2021-03-10-US-SVL-1_Pixel4XL,6.536023,9.354958
93988,2021-03-10-US-SVL-1,Pixel4XL,1299453170646,37.371211,-122.048033,19.14,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.047980,,37.371295,37.371200,-122.048026,-122.048134,2021-03-10-US-SVL-1_Pixel4XL,2021-03-10-US-SVL-1_Pixel4XL,9.354958,9.042584
93989,2021-03-10-US-SVL-1,Pixel4XL,1299453171664,37.371200,-122.048134,24.47,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.047980,,37.371211,37.371194,-122.048033,-122.048102,2021-03-10-US-SVL-1_Pixel4XL,2021-03-10-US-SVL-1_Pixel4XL,9.042584,2.853071
93990,2021-03-10-US-SVL-1,Pixel4XL,1299453172650,37.371194,-122.048102,21.83,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.047980,,37.371200,37.371203,-122.048134,-122.048075,2021-03-10-US-SVL-1_Pixel4XL,2021-03-10-US-SVL-1_Pixel4XL,2.853071,2.626328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131337,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760315000,37.334460,-121.899600,-8.09,2021-04-29-US-SJC-2_SamsungS20Ultra,0.0,37.334475,-121.899613,,37.334457,37.334472,-121.899610,-121.899583,2021-04-29-US-SJC-2_SamsungS20Ultra,2021-04-29-US-SJC-2_SamsungS20Ultra,0.941490,2.011345
131338,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760316000,37.334472,-121.899583,-7.59,2021-04-29-US-SJC-2_SamsungS20Ultra,0.0,37.334475,-121.899613,,37.334460,37.334491,-121.899600,-121.899597,2021-04-29-US-SJC-2_SamsungS20Ultra,2021-04-29-US-SJC-2_SamsungS20Ultra,2.011345,2.447052
131339,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760317000,37.334491,-121.899597,-8.35,2021-04-29-US-SJC-2_SamsungS20Ultra,0.0,37.334475,-121.899613,176.524924,37.334472,37.334495,-121.899583,-121.899583,2021-04-29-US-SJC-2_SamsungS20Ultra,2021-04-29-US-SJC-2_SamsungS20Ultra,2.447052,1.303535
131340,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760318000,37.334495,-121.899583,-8.73,2021-04-29-US-SJC-2_SamsungS20Ultra,0.0,37.334475,-121.899613,,37.334491,37.334485,-121.899597,-121.899570,2021-04-29-US-SJC-2_SamsungS20Ultra,2021-04-29-US-SJC-2_SamsungS20Ultra,1.303535,1.582827


In [207]:
rog_df = interpolate_by_deg_dist_pred(rog_df)
rog_df.to_csv(OUTPUT + '/train_rog_df.csv', index=False)

train.loc[rog_df.index, ['latDeg', 'lngDeg']] = rog_df[['latDeg', 'lngDeg']]
train.to_csv(OUTPUT + '/train_ro_rog.csv', index=False)

In [208]:
rog_df.head(100)

Unnamed: 0_level_0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,dist_pred,latDeg_gt,lngDeg_gt,...,dist_next,deg_pred_prev,deg_pred_next,dist_pred_prev,dist_pred_next,g,lat_forward,lng_forward,lat_reversed,lng_reversed
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93986,2021-03-10-US-SVL-1,Pixel4XL,1299453168638,,,24.93,2021-03-10-US-SVL-1_Pixel4XL,,37.371314,-122.04798,...,6.536023,,,,0.0,1.0,,,,
93987,2021-03-10-US-SVL-1,Pixel4XL,1299453169651,37.371295,-122.048026,6.56,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.04798,...,9.354958,,,0.0,0.0,,,,,
93988,2021-03-10-US-SVL-1,Pixel4XL,1299453170646,37.371211,-122.048033,19.14,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.04798,...,9.042584,,,0.0,0.0,,,,,
93989,2021-03-10-US-SVL-1,Pixel4XL,1299453171664,37.3712,-122.048134,24.47,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.04798,...,2.853071,,,0.0,0.0,,,,,
93990,2021-03-10-US-SVL-1,Pixel4XL,1299453172650,37.371194,-122.048102,21.83,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.04798,...,2.626328,,,0.0,0.0,,,,,
93991,2021-03-10-US-SVL-1,Pixel4XL,1299453173646,37.371203,-122.048075,29.32,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.04798,...,2.678534,,,0.0,0.0,,,,,
93992,2021-03-10-US-SVL-1,Pixel4XL,1299453174651,37.371226,-122.04808,25.59,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.04798,...,4.448469,,,0.0,0.0,,,,,
93993,2021-03-10-US-SVL-1,Pixel4XL,1299453175649,37.371223,-122.048029,17.82,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.04798,...,5.870889,,,0.0,0.0,,,,,
93994,2021-03-10-US-SVL-1,Pixel4XL,1299453176662,37.371255,-122.048082,26.09,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.04798,...,3.075636,,,0.0,0.0,,,,,
93995,2021-03-10-US-SVL-1,Pixel4XL,1299453177649,37.371228,-122.048085,22.74,2021-03-10-US-SVL-1_Pixel4XL,0.0,37.371314,-122.04798,...,2.250007,,,0.0,0.0,,,,,


In [209]:
rog_df[['latDeg', 'lngDeg', 'g']].to_csv('check.csv')

# kalmanフィルタ

In [None]:
T = 1.0
state_transition = np.array([[1, 0, T, 0, 0.5 * T ** 2, 0], [0, 1, 0, T, 0, 0.5 * T ** 2], [0, 0, 1, 0, T, 0],
                             [0, 0, 0, 1, 0, T], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]])
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.ones((6, 6)) * 1e-9
observation_model = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]])
observation_noise = np.diag([5e-5, 5e-5]) + np.ones((2, 2)) * 1e-9

kf = simdkalman.KalmanFilter(
        state_transition = state_transition,
        process_noise = process_noise,
        observation_model = observation_model,
        observation_noise = observation_noise)

def apply_kf_smoothing(df, kf_=kf):
    unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in unique_paths:
        cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
        data = df[cond][['latDeg', 'lngDeg']].to_numpy()
        data = data.reshape(1, len(data), 2)
        smoothed = kf_.smooth(data)
        df.loc[cond, 'latDeg'] = smoothed.states.mean[0, :, 0]
        df.loc[cond, 'lngDeg'] = smoothed.states.mean[0, :, 1]
    return df

In [None]:
train = apply_kf_smoothing(train)
train.to_csv(OUTPUT + '/train_ro_rog_kf.csv', index=False)

# speed0の処理

In [None]:
def sp0_process(df, sp0_result):
    df = df.merge(sp0_result, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    df['group'] = df.groupby('phone').cumcount()
    df.loc[(df['group']>0)&(df['isSpeed0']==1),'group'] = np.nan
    df['group'] = df['group'].fillna(method='ffill')
    df[['latDeg', 'lngDeg']] = df.groupby(['phone', 'group'])[['latDeg','lngDeg']].transform('mean')
    return df

In [None]:
train_sp0_pred = pd.read_csv('../output/prep/speed0_pred_v001/train_sp0_pred.csv')
train = sp0_process(train, train_sp0_pred)
train.to_csv(OUTPUT + '/train_ro_rog_kf_sp0.csv', index=False)

# phones_mean

In [None]:
def make_lerp_data(df):
    '''
    Generate interpolated lat,lng values for different phone times in the same collection.
    '''
    org_columns = df.columns
    
    # Generate a combination of time x collection x phone and combine it with the original data (generate records to be interpolated)
    time_list = df[['collectionName', 'millisSinceGpsEpoch']].drop_duplicates()
    phone_list =df[['collectionName', 'phoneName']].drop_duplicates()
    tmp = time_list.merge(phone_list, on='collectionName', how='outer')
    
    lerp_df = tmp.merge(df, on=['collectionName', 'millisSinceGpsEpoch', 'phoneName'], how='left')
    lerp_df['phone'] = lerp_df['collectionName'] + '_' + lerp_df['phoneName']
    lerp_df = lerp_df.sort_values(['phone', 'millisSinceGpsEpoch'])
    
    # linear interpolation
    lerp_df['latDeg_prev'] = lerp_df['latDeg'].shift(1)
    lerp_df['latDeg_next'] = lerp_df['latDeg'].shift(-1)
    lerp_df['lngDeg_prev'] = lerp_df['lngDeg'].shift(1)
    lerp_df['lngDeg_next'] = lerp_df['lngDeg'].shift(-1)
    lerp_df['phone_prev'] = lerp_df['phone'].shift(1)
    lerp_df['phone_next'] = lerp_df['phone'].shift(-1)
    lerp_df['time_prev'] = lerp_df['millisSinceGpsEpoch'].shift(1)
    lerp_df['time_next'] = lerp_df['millisSinceGpsEpoch'].shift(-1)
    # Leave only records to be interpolated
    lerp_df = lerp_df[(lerp_df['latDeg'].isnull())&(lerp_df['phone']==lerp_df['phone_prev'])&(lerp_df['phone']==lerp_df['phone_next'])].copy()
    # calc lerp
    lerp_df['latDeg'] = lerp_df['latDeg_prev'] + ((lerp_df['latDeg_next'] - lerp_df['latDeg_prev']) * ((lerp_df['millisSinceGpsEpoch'] - lerp_df['time_prev']) / (lerp_df['time_next'] - lerp_df['time_prev']))) 
    lerp_df['lngDeg'] = lerp_df['lngDeg_prev'] + ((lerp_df['lngDeg_next'] - lerp_df['lngDeg_prev']) * ((lerp_df['millisSinceGpsEpoch'] - lerp_df['time_prev']) / (lerp_df['time_next'] - lerp_df['time_prev']))) 
    
    # Leave only the data that has a complete set of previous and next data.
    lerp_df = lerp_df[~lerp_df['latDeg'].isnull()]
    
    return lerp_df[org_columns]

def calc_mean_pred(df, lerp_df):
    '''
    Make a prediction based on the average of the predictions of phones in the same collection.
    '''
    add_lerp = pd.concat([df, lerp_df])
    mean_pred_result = add_lerp.groupby(['collectionName', 'millisSinceGpsEpoch'])[['latDeg', 'lngDeg']].mean().reset_index()
    mean_pred_df = df[['collectionName', 'phoneName', 'millisSinceGpsEpoch']].copy()
    mean_pred_df = mean_pred_df.merge(mean_pred_result[['collectionName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']], on=['collectionName', 'millisSinceGpsEpoch'], how='left')
    return mean_pred_df

In [None]:
train_lerp = make_lerp_data(train)
train = calc_mean_pred(train, train_lerp)
train.to_csv(OUTPUT + '/train_ro_rog_kf_sp0_pm.csv', index=False)

# position_shift

In [None]:
train['phone'] = train['collectionName'] + '_' + train['phoneName']

In [None]:
def WGS84_to_ECEF(lat, lon, alt):
    # convert to radians
    rad_lat = lat * (np.pi / 180.0)
    rad_lon = lon * (np.pi / 180.0)
    a    = 6378137.0
    # f is the flattening factor
    finv = 298.257223563
    f = 1 / finv   
    # e is the eccentricity
    e2 = 1 - (1 - f) * (1 - f)    
    # N is the radius of curvature in the prime vertical
    N = a / np.sqrt(1 - e2 * np.sin(rad_lat) * np.sin(rad_lat))
    x = (N + alt) * np.cos(rad_lat) * np.cos(rad_lon)
    y = (N + alt) * np.cos(rad_lat) * np.sin(rad_lon)
    z = (N * (1 - e2) + alt)        * np.sin(rad_lat)
    return x, y, z

transformer = pyproj.Transformer.from_crs(
    {"proj":'geocent', "ellps":'WGS84', "datum":'WGS84'},
    {"proj":'latlong', "ellps":'WGS84', "datum":'WGS84'},)

def ECEF_to_WGS84(x,y,z):
    lon, lat, alt = transformer.transform(x,y,z,radians=False)
    return lon, lat, alt
msge = 'millisSinceGpsEpoch'

In [None]:
def position_shift(df,a):

    d = df.copy()
    d['heightAboveWgs84EllipsoidM'] = 63.5
    d['x'], d['y'], d['z'] = zip(*d.apply(lambda x: WGS84_to_ECEF(x.latDeg, x.lngDeg, x.heightAboveWgs84EllipsoidM), axis=1))

    #a = -0.2
    d.sort_values(['phone', msge], inplace=True)
    for fi in ['x','y','z']:
        d[[fi+'p']] = d[fi].shift().where(d['phone'].eq(d['phone'].shift()))
        d[[fi+'diff']] = d[fi]-d[fi+'p']
    #d[['yp']] = d['y'].shift().where(d['phone'].eq(d['phone'].shift()))
    d[['dist']] = np.sqrt(d['xdiff']**2 + d['ydiff']**2+ d['zdiff']**2)
    for fi in ['x','y','z']:
        d[[fi+'new']] = d[fi+'p'] + d[fi+'diff']*(1-a/d['dist'])
    lng, lat, alt = ECEF_to_WGS84(d['xnew'].values,d['ynew'].values,d['znew'].values)
    
    lng[np.isnan(lng)] = d.loc[np.isnan(lng),'lngDeg']
    lat[np.isnan(lat)] = d.loc[np.isnan(lat),'latDeg']
    d['latDeg'] = lat
    d['lngDeg'] = lng
    
    return d 

def objective(trial):
    a = trial.suggest_uniform('a', -1, 1)
    score = get_train_score(position_shift(train, a),gt)
    return score

In [None]:
study = optuna.create_study()
study.optimize(objective, n_trials=30)
opt_a = study.best_params['a']
print(opt_a)

In [None]:
train = position_shift(train, opt_a)
train.to_csv(OUTPUT + '/train_ro_rog_kf_sp0_pm_ps.csv', index=False)

# trainの結果確認

In [None]:
for f in ['train_ro_rog_kf', 'train_ro_rog_kf_sp0', 'train_ro_rog_kf_sp0_pm', 'train_ro_rog_kf_sp0_pm_ps']:
    print(f, get_train_score(pd.read_csv(OUTPUT + '/' + f + '.csv'), gt))

In [None]:
result = train_result(train)

In [None]:
result.err

# make_sub

In [None]:
test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')

In [None]:
test = add_distance_diff(test)
test.loc[((test['dist_prev'] > ro_th) & (test['dist_next'] > ro_th)), ['latDeg', 'lngDeg']] = np.nan

In [None]:
rog_df = test[test['collectionName'].isin(rog_target)].copy()
rog_gt = gt[gt['collectionName'].isin(rog_target)].copy()
rog_df = remove_based_on_gt(rog_df, rog_gt, rog_th)
rog_df = interpolate_by_deg_dist_pred(rog_df)
rog_df.to_csv(OUTPUT + '/test_rog_df.csv', index=False)
test.loc[rog_df.index, ['latDeg', 'lngDeg']] = rog_df[['latDeg', 'lngDeg']]

In [None]:
test = apply_kf_smoothing(test)

In [None]:
test_sp0_pred = pd.read_csv('../output/prep/speed0_pred_v001/test_sp0_pred.csv')
test = sp0_process(test, test_sp0_pred)

In [None]:
test_lerp = make_lerp_data(test)
test = calc_mean_pred(test, test_lerp)

In [None]:
test['phone'] = test['collectionName'] + '_' + test['phoneName']

In [None]:
test = position_shift(test, opt_a)

In [None]:
assert len(test) == len(sub)

In [None]:
sub['latDeg'] = test['latDeg']
sub['lngDeg'] = test['lngDeg']
sub.to_csv(OUTPUT + f'/{nb_name}_submission.csv', index=False)