In [None]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
from scipy.ndimage import gaussian_filter1d
from scipy.interpolate import interp1d
import scipy

# utils

In [None]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two points given in degrees.

    The inputs are passed through NumPy ufuncs, so scalars, arrays and
    pandas Series are all evaluated element-wise.

    Returns the distance in the same unit as the sphere radius used below
    (6,367,000, i.e. metres for an Earth-sized sphere).
    """
    earth_radius = 6_367_000
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    d_phi = phi2 - phi1
    d_lam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(d_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lam / 2) ** 2
    return 2 * earth_radius * np.arcsin(hav ** 0.5)

In [None]:
def visualize_trafic(df, center, zoom=9):
    """Render the GPS fixes in ``df`` on an interactive Plotly map.

    Parameters
    ----------
    df : DataFrame with ``latDeg``, ``lngDeg`` and ``phoneName`` columns;
        points are coloured by ``phoneName``.
    center : dict with ``"lat"`` / ``"lon"`` keys giving the map centre.
    zoom : initial map zoom level (default 9).

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    scatter_options = dict(
        lat="latDeg",
        lon="lngDeg",
        color="phoneName",
        # NOTE(review): Plotly Express describes `labels` as a dict of
        # column-name overrides; confirm this plain string has any effect.
        labels="phoneName",
        zoom=zoom,
        center=center,
        height=600,
        width=800,
    )
    fig = px.scatter_mapbox(df, **scatter_options)
    # One layout update instead of three consecutive ones; same final layout.
    fig.update_layout(
        mapbox_style='stamen-terrain',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()
    
def visualize_collection(df, collection):
    """Plot every point of one collection, centring the map on its mean position.

    Parameters
    ----------
    df : DataFrame with ``collectionName``, ``latDeg`` and ``lngDeg`` columns
        (plus whatever ``visualize_trafic`` needs).
    collection : value of ``collectionName`` to display.
    """
    in_collection = df['collectionName'] == collection
    target_df = df[in_collection].copy()
    center = {
        "lat": target_df['latDeg'].mean(),
        "lon": target_df['lngDeg'].mean(),
    }
    visualize_trafic(target_df, center)

apply gauss

In [None]:
# Smooth lat/lng with Gaussian filters.
def _adaptive_gauss_1d(values, sigma_1, sigma_2, sigma_crit):
    """Blend a lightly and a heavily smoothed copy of one coordinate series.

    The blend weight is |G(step)| / (1e-9 + G(|step|)), where ``step`` is the
    difference between consecutive samples and G is a Gaussian filter with
    ``sigma_crit``. The weight is near 1 where consecutive steps share a sign
    (light smoothing is used) and near 0 where they cancel out (heavy
    smoothing is used).
    """
    light = gaussian_filter1d(values, sigma_1)
    heavy = gaussian_filter1d(values, sigma_2)

    step = values[1:] - values[:-1]
    crit = np.abs(gaussian_filter1d(step, sigma_crit)
                  / (1e-9 + gaussian_filter1d(np.abs(step), sigma_crit)))
    # `step` is one element shorter than `values`; pad so the last sample
    # uses the heavy filter only.
    crit = np.append(crit, [0])

    return light * crit + heavy * (1.0 - crit)


def apply_gauss_smoothing(df, params):
    """Smooth ``latDeg`` / ``lngDeg`` separately for every (collection, phone) track.

    Parameters
    ----------
    df : DataFrame with ``collectionName``, ``phoneName``, ``latDeg`` and
        ``lngDeg`` columns. Rows of one track are filtered in their current
        order, so they are assumed to be in time order (TODO confirm at call
        sites).
    params : dict with keys ``'sz_1'``, ``'sz_2'`` and ``'sz_crit'``. The
        square root of each value is used as the Gaussian sigma (in samples)
        of the light, heavy and criterion filters respectively.

    Returns
    -------
    The same ``df`` object; ``latDeg`` and ``lngDeg`` are overwritten in place.
    """
    sigma_1 = np.sqrt(params['sz_1'])
    sigma_2 = np.sqrt(params['sz_2'])
    sigma_crit = np.sqrt(params['sz_crit'])

    # Unique (collection, phone) combinations as a 2-D array.
    unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()

    for collection, phone in unique_paths:
        # Boolean mask selecting the rows of this track.
        cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
        # data[:, 0] is the latitude over time, data[:, 1] the longitude.
        data = df.loc[cond, ['latDeg', 'lngDeg']].to_numpy()

        # A single sample has no neighbours: smoothing is a no-op and the
        # step series would be empty, so leave such tracks untouched.
        if len(data) < 2:
            continue

        df.loc[cond, 'latDeg'] = _adaptive_gauss_1d(data[:, 0], sigma_1, sigma_2, sigma_crit)
        df.loc[cond, 'lngDeg'] = _adaptive_gauss_1d(data[:, 1], sigma_1, sigma_2, sigma_crit)
    return df

In [None]:
# Phones in the same collection cannot really be in different places;
# any disagreement between them is noise, so average their tracks.
def _last_index_before(times, threshold):
    """Index of the last element of ``times`` that is < ``threshold`` (0 if none)."""
    hits = np.flatnonzero(times < threshold)
    return int(hits[-1]) if hits.size else 0


def mean_with_other_phones(df):
    """Average each phone's track with the other phones of the same collection.

    For every phone, the other phones' positions are linearly interpolated
    at this phone's timestamps and averaged with its own position over an
    index window derived from the other phone's first and last timestamps.
    The window logic is kept exactly as in the original implementation.

    Parameters
    ----------
    df : DataFrame with ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg`` columns. Rows of
        each phone are assumed to be sorted by time, because the
        interpolator is built with ``assume_sorted=True``.

    Returns
    -------
    The same ``df`` object; ``latDeg`` and ``lngDeg`` are overwritten in place.
    """
    value_cols = ['millisSinceGpsEpoch', 'latDeg', 'lngDeg']

    for collection in df['collectionName'].drop_duplicates().to_numpy():
        in_collection = (df['collectionName'] == collection).to_numpy()
        phones = df.loc[in_collection, 'phoneName'].drop_duplicates().to_numpy()

        # Row masks and [time, lat, lng] arrays for every phone in this collection.
        masks = {phone: in_collection & (df['phoneName'] == phone).to_numpy()
                 for phone in phones}
        phone_data = {phone: df.loc[mask, value_cols].to_numpy()
                      for phone, mask in masks.items()}

        corrections = {}
        for current, cur in phone_data.items():
            # Column 0 counts the contributing tracks (starts at 1 for the
            # phone itself); columns 1-2 accumulate the lat/lng sums.
            # `float` replaces `np.float`, which was removed in NumPy 1.24.
            correction = np.ones(cur.shape, dtype=float)
            correction[:, 1:] = cur[:, 1:]
            cur_times = cur[:, 0]

            for other, oth in phone_data.items():
                if other == current:
                    continue

                start_idx = _last_index_before(cur_times, oth[0, 0])
                stop_idx = _last_index_before(cur_times, oth[-1, 0])
                if stop_idx - start_idx <= 0:
                    continue

                # Built only when actually needed.
                loc = interp1d(oth[:, 0],
                               oth[:, 1:],
                               axis=0,
                               kind='linear',
                               copy=False,
                               bounds_error=None,
                               fill_value='extrapolate',
                               assume_sorted=True)
                correction[start_idx:stop_idx, 0] += 1
                correction[start_idx:stop_idx, 1:] += loc(cur_times[start_idx:stop_idx])

            # Turn the sums into means.
            correction[:, 1:] /= correction[:, [0]]
            corrections[current] = correction

        # Write back only after every phone was computed from the original values.
        for phone, mask in masks.items():
            df.loc[mask, ['latDeg', 'lngDeg']] = corrections[phone][:, 1:]

    return df

reject outlier

In [None]:
# Add lag/lead features and the distance moved between consecutive rows.
def add_distance_diff(df):
    """Attach previous/next positions and the distances to them.

    New columns (added to ``df`` in place): ``latDeg_prev``/``_next``,
    ``lngDeg_prev``/``_next``, ``phone_prev``/``_next``, ``dist_prev`` and
    ``dist_next``. Neighbours are taken from the adjacent rows, so ``df``
    must already be ordered by phone and time. Wherever the adjacent row
    belongs to another phone, the position and distance columns for that
    side are set to NaN.

    Returns the same ``df`` object.
    """
    shifted_columns = (
        ('latDeg', 'latDeg_prev', 'latDeg_next'),
        ('lngDeg', 'lngDeg_prev', 'lngDeg_next'),
        ('phone', 'phone_prev', 'phone_next'),
    )
    for source, prev_name, next_name in shifted_columns:
        df[prev_name] = df[source].shift(1)
        df[next_name] = df[source].shift(-1)

    here = (df['latDeg'], df['lngDeg'])
    df['dist_prev'] = calc_haversine(*here, df['latDeg_prev'], df['lngDeg_prev'])
    df['dist_next'] = calc_haversine(*here, df['latDeg_next'], df['lngDeg_next'])

    # Rows at a phone boundary have no valid neighbour on that side.
    starts_phone = df['phone'] != df['phone_prev']
    ends_phone = df['phone'] != df['phone_next']
    df.loc[starts_phone, ['latDeg_prev', 'lngDeg_prev', 'dist_prev']] = np.nan
    df.loc[ends_phone, ['latDeg_next', 'lngDeg_next', 'dist_next']] = np.nan

    return df

phone mean

In [None]:
def make_lerp_data(df):
    '''
    Build linearly interpolated lat/lng rows for timestamps that other phones
    of the same collection have but a given phone does not.

    A row is produced only when the rows directly before and after it (after
    sorting by phone and time) belong to the same phone and carry real
    positions. The result has the same columns as ``df``.
    '''
    org_columns = df.columns
    merge_keys = ['collectionName', 'millisSinceGpsEpoch', 'phoneName']

    # Every time x phone combination inside each collection.
    time_list = df[['collectionName', 'millisSinceGpsEpoch']].drop_duplicates()
    phone_list = df[['collectionName', 'phoneName']].drop_duplicates()
    grid = time_list.merge(phone_list, on='collectionName', how='outer')

    # Attach the known positions; combinations without data stay NaN.
    lerp_df = grid.merge(df, on=merge_keys, how='left')
    lerp_df['phone'] = lerp_df['collectionName'] + '_' + lerp_df['phoneName']
    lerp_df = lerp_df.sort_values(['phone', 'millisSinceGpsEpoch'])

    # Neighbouring values used as interpolation end points.
    neighbour_columns = (
        ('latDeg', 'latDeg_prev', 'latDeg_next'),
        ('lngDeg', 'lngDeg_prev', 'lngDeg_next'),
        ('phone', 'phone_prev', 'phone_next'),
        ('millisSinceGpsEpoch', 'time_prev', 'time_next'),
    )
    for source, prev_name, next_name in neighbour_columns:
        lerp_df[prev_name] = lerp_df[source].shift(1)
        lerp_df[next_name] = lerp_df[source].shift(-1)

    # Keep only rows that need a value and sit strictly inside one phone's track.
    needs_value = lerp_df['latDeg'].isnull()
    same_phone_before = lerp_df['phone'] == lerp_df['phone_prev']
    same_phone_after = lerp_df['phone'] == lerp_df['phone_next']
    lerp_df = lerp_df[needs_value & same_phone_before & same_phone_after].copy()

    # Relative position of the row between its two neighbours.
    fraction = ((lerp_df['millisSinceGpsEpoch'] - lerp_df['time_prev'])
                / (lerp_df['time_next'] - lerp_df['time_prev']))
    for target, prev_name, next_name in neighbour_columns[:2]:
        lerp_df[target] = lerp_df[prev_name] + ((lerp_df[next_name] - lerp_df[prev_name]) * fraction)

    # Rows whose neighbours were themselves missing remain NaN; drop them.
    lerp_df = lerp_df[~lerp_df['latDeg'].isnull()]

    return lerp_df[org_columns]

def calc_mean_pred(df, lerp_df):
    '''
    Replace each row's position by the mean over all phones of the same
    collection at the same timestamp.

    ``lerp_df`` supplies additional (interpolated) rows that take part in
    the mean. The result has one row per row of ``df`` with the columns
    collectionName, phoneName, millisSinceGpsEpoch, latDeg, lngDeg.
    '''
    time_keys = ['collectionName', 'millisSinceGpsEpoch']
    coords = ['latDeg', 'lngDeg']

    stacked = pd.concat([df, lerp_df])
    mean_pred_result = stacked.groupby(time_keys)[coords].mean().reset_index()

    mean_pred_df = df[['collectionName', 'phoneName', 'millisSinceGpsEpoch']].copy()
    return mean_pred_df.merge(mean_pred_result[time_keys + coords], on=time_keys, how='left')

In [None]:
# Evaluation
def percentile50(x):
    """Return the 50th percentile (median) of ``x``."""
    return np.percentile(x, q=50)
def percentile95(x):
    """Return the 95th percentile of ``x``."""
    return np.percentile(x, q=95)
def get_train_score(df, gt):
    """Score predictions against ground truth.

    For every phone (``collectionName + '_' + phoneName``) the 50th and 95th
    percentile of the haversine error are averaged; the score is the mean of
    that value over all phones.

    Parameters
    ----------
    df : predictions with ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg`` columns.
    gt : ground truth with the same key columns and ``latDeg`` / ``lngDeg``.
        Only rows present in both frames are scored (inner merge).
    """
    keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    truth = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    merged = df.merge(truth, on=keys, how='inner')

    # Distance between the true and the predicted position of every row.
    merged['err'] = calc_haversine(merged['latDeg_gt'], merged['lngDeg_gt'],
                                   merged['latDeg'], merged['lngDeg'])

    merged['phone'] = merged['collectionName'] + '_' + merged['phoneName']
    res = merged.groupby('phone')['err'].agg([percentile50, percentile95])
    # The column name says "p90" but the value is built from the 95th percentile.
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
    return res['p50_p90_mean'].mean()

# data prep

In [None]:
# directory setting
INPUT = '../input/google-smartphone-decimeter-challenge'

In [None]:
# Load the baseline location estimates for train and test, and the sample
# submission (its columns are reused later as `sub_columns`).
base_train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
base_test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')
sample_sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv')

In [None]:
# ground_truth: collect every train/<dir>/<dir>/ground_truth.csv below INPUT
# and stack them into a single frame.
p = pathlib.Path(INPUT)
gt_files = list(p.glob('train/*/*/ground_truth.csv'))
print('ground_truth.csv count : ', len(gt_files))

gts = [pd.read_csv(gt_file) for gt_file in tqdm(gt_files)]
ground_truth = pd.concat(gts)

display(ground_truth.head())

In [None]:
# Gaussian-smooth the train baseline. apply_gauss_smoothing writes into the
# frame it receives and returns it, so `smoothed_baseline` is `base_train`.
smoothed_baseline = apply_gauss_smoothing(base_train, {'sz_1' : 0.85, 'sz_2' : 5.65, 'sz_crit' : 1.5})

In [None]:
# Average every phone's track with the other phones of its collection
# (modifies and returns the same DataFrame object).
base_train = mean_with_other_phones(smoothed_baseline)

# reject outlier

In [None]:
# reject outlier
# add_distance_diff returns the frame it was given, so `train_ro` and
# `base_train` are the same object and the NaNs below land in both names.
train_ro = add_distance_diff(base_train)
# Distance threshold; presumably metres, given the radius used in
# calc_haversine - TODO confirm.
th = 50
# Blank a position only when it is farther than `th` from BOTH its previous
# and its next point. The row itself is kept, only lat/lng become NaN.
train_ro.loc[((train_ro['dist_prev'] > th) & (train_ro['dist_next'] > th)), ['latDeg', 'lngDeg']] = np.nan

# kalman filter
https://www.kaggle.com/emaerthin/demonstration-of-the-kalman-filter

In [None]:
!pip install simdkalman

In [None]:
import simdkalman

In [None]:
# Kalman filter configuration.
# Time step between consecutive samples (presumably one second - TODO confirm).
T = 1.0
# 6-dimensional state. Per the rows below: components 0/1 advance by
# T * (components 2/3) + 0.5 * T**2 * (components 4/5), components 2/3 advance
# by T * (components 4/5), and components 4/5 stay constant. This is a
# constant-acceleration model in two dimensions: position, velocity, acceleration.
state_transition = np.array([[1, 0, T, 0, 0.5 * T ** 2, 0], [0, 1, 0, T, 0, 0.5 * T ** 2], [0, 0, 1, 0, T, 0],
                             [0, 0, 0, 1, 0, T], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]])
# Diagonal process noise plus a small constant added to every entry.
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.ones((6, 6)) * 1e-9
# Only the first two state components (the positions) are observed.
observation_model = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]])
# Diagonal observation noise plus a small constant added to every entry.
observation_noise = np.diag([5e-5, 5e-5]) + np.ones((2, 2)) * 1e-9

kf = simdkalman.KalmanFilter(
        state_transition = state_transition,
        process_noise = process_noise,
        observation_model = observation_model,
        observation_noise = observation_noise)

def apply_kf_smoothing(df, kf_=kf):
    """Kalman-smooth ``latDeg`` / ``lngDeg`` of every (collection, phone) track.

    Parameters
    ----------
    df : DataFrame with ``collectionName``, ``phoneName``, ``latDeg`` and
        ``lngDeg`` columns; rows of a track are smoothed in their current order.
    kf_ : filter object providing ``smooth``; defaults to the module-level ``kf``.

    Returns
    -------
    The same ``df`` object; ``latDeg`` and ``lngDeg`` are overwritten in place.
    """
    track_keys = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in track_keys:
        on_track = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
        observations = df[on_track][['latDeg', 'lngDeg']].to_numpy()
        # Present the track as a batch of one series: (1, n_samples, 2).
        result = kf_.smooth(observations.reshape(1, len(observations), 2))
        state_means = result.states.mean
        # State components 0 and 1 are written back as latitude and longitude.
        df.loc[on_track, 'latDeg'] = state_means[0, :, 0]
        df.loc[on_track, 'lngDeg'] = state_means[0, :, 1]
    return df

In [None]:
cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']
# apply_kf_smoothing assigns into the frame it receives. Pass an explicit
# copy of the column subset so the assignment does not target a slice of
# base_train (which triggers pandas' SettingWithCopyWarning).
train_ro_kf = apply_kf_smoothing(base_train[cols].copy())

# phones mean prediction

In [None]:
train_ro_kf

In [None]:
train_lerp = make_lerp_data(train_ro_kf)
print(train_lerp)

In [None]:
train_mean_pred = calc_mean_pred(train_ro_kf, train_lerp)

In [None]:
train_mean_pred

In [None]:
"""tmp1 = train_ro_kf.copy()
tmp2 = train_mean_pred.copy()
tmp2['phoneName'] = tmp2['phoneName'] + '_MEAN'
tmp3 = ground_truth.copy()
tmp3['phoneName'] = tmp3['phoneName'] + '_GT'
tmp = pd.concat([tmp1, tmp2, tmp3])
visualize_collection(tmp, '2020-05-14-US-MTV-1')"""

In [None]:
# Build the "<collectionName>_<phoneName>" key. The right-hand operand must
# be the phoneName column, not the Python list ["phoneName"].
train_mean_pred["phone"] = train_mean_pred["collectionName"]+"_"+train_mean_pred["phoneName"]

# Position_shift

In [None]:
base_train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')

In [None]:
import pyproj
from pyproj import Proj, transform

def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two points given in degrees.

    NOTE(review): this repeats the definition of the same name from the
    utils section near the top of the notebook and shadows it once this
    cell has run.

    Works element-wise on scalars, arrays and pandas Series. The result is
    in the unit of the sphere radius used below (6,367,000).
    """
    earth_radius = 6_367_000
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    d_phi = phi2 - phi1
    d_lam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(d_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lam / 2) ** 2
    return 2 * earth_radius * np.arcsin(hav ** 0.5)

def compute_dist(fname, fname2 = 'gt.csv'):
    """Score the positions in CSV ``fname`` against the ground truth in ``fname2``.

    Both files are merged on ``phone`` and ``millisSinceGpsEpoch``; after the
    merge the ``_x`` columns come from ``fname`` and the ``_y`` columns from
    ``fname2``.

    Returns
    -------
    (score, per_phone)
        ``score`` is (mean of per-phone 50% quantiles + mean of per-phone
        95% quantiles) / 2 of the haversine error. It is the result of
        ``DataFrame.mean()`` and therefore a pandas object, not a plain float.
        ``per_phone`` is a DataFrame with the columns phone, q50, q95.
    """
    oof = pd.read_csv(fname)
    gt = pd.read_csv(fname2)
    df = oof.merge(gt, on = ['phone','millisSinceGpsEpoch'])

    # Haversine distance between prediction and ground truth for each row.
    dst_oof = calc_haversine(df.latDeg_x,df.lngDeg_x, df.latDeg_y, df.lngDeg_y)
    scores = pd.DataFrame({'phone': df.phone,'dst': dst_oof})
    scores_grp = scores.groupby('phone')

    # Per-phone quantiles, computed once and reused for table and score.
    q50 = scores_grp.quantile(.50)
    q95 = scores_grp.quantile(.95)

    d50 = q50.reset_index()
    d50.columns = ['phone','q50']
    d95 = q95.reset_index()
    d95.columns = ['phone','q95']

    return (q50.mean() + q95.mean())/2, d50.merge(d95)

# ECEF expresses a position as X, Y and Z coordinates whose origin is
# defined as the Earth's centre of mass.
def WGS84_to_ECEF(lat, lon, alt):
    """Convert WGS84 geodetic coordinates to ECEF x, y, z.

    Parameters
    ----------
    lat, lon : latitude / longitude in degrees.
    alt : height above the ellipsoid, in the unit of the semi-major axis
        used below (6378137.0).

    Returns
    -------
    (x, y, z) in that same unit.
    """
    deg_to_rad = np.pi / 180.0
    rad_lat = lat * deg_to_rad
    rad_lon = lon * deg_to_rad

    a = 6378137.0               # semi-major axis
    f = 1 / 298.257223563       # flattening
    e2 = 1 - (1 - f) * (1 - f)  # eccentricity squared

    sin_lat = np.sin(rad_lat)
    cos_lat = np.cos(rad_lat)

    # N is the radius of curvature in the prime vertical.
    N = a / np.sqrt(1 - e2 * sin_lat * sin_lat)

    horizontal = (N + alt) * cos_lat
    x = horizontal * np.cos(rad_lon)
    y = horizontal * np.sin(rad_lon)
    z = (N * (1 - e2) + alt) * sin_lat
    return x, y, z

# Coordinate transformer from geocentric (ECEF) to geographic lat/long
# coordinates, both on the WGS84 ellipsoid/datum. Built once at module
# level and used by ECEF_to_WGS84.
transformer = pyproj.Transformer.from_crs(
    {"proj":'geocent', "ellps":'WGS84', "datum":'WGS84'},
    {"proj":'latlong', "ellps":'WGS84', "datum":'WGS84'},)

# From ECEF to WGS84
def ECEF_to_WGS84(x,y,z):
    """Convert ECEF x, y, z to geographic coordinates via the module-level transformer.

    Returns a 3-tuple that the original code names (lon, lat, alt), i.e.
    longitude first; angles are requested in degrees (``radians=False``).
    """
    lon_lat_alt = transformer.transform(x, y, z, radians=False)
    lon, lat, alt = lon_lat_alt
    return lon, lat, alt

# Export the baselines restricted to the sample-submission columns so that
# compute_dist / position_shift can read them back from CSV.
sub_columns = sample_sub.columns
base_train[sub_columns].to_csv('btrain.csv',index = False)
#train_mean_pred[sub_columns].to_csv('train_mean_pred.csv',index = False)
base_test[sub_columns].to_csv('btest.csv',index = False)

# Shorthand for the timestamp column name (used inside position_shift).
msge = 'millisSinceGpsEpoch'
# `gt` is an alias of `ground_truth`, not a copy: the 'phone' column added
# below also appears on `ground_truth`.
gt = ground_truth
gt['phone'] = gt['collectionName'] + '_' + gt['phoneName']
gt[sub_columns].to_csv('gt.csv', index = False)
# Summary statistics of the height above the WGS84 ellipsoid
gt['heightAboveWgs84EllipsoidM'].describe()

In [None]:
score, scores = compute_dist('btrain.csv','gt.csv')
print(score)
scores

In [None]:
#ハイパーパラメーターの最適化の為に作られたベイズ最適化package
import optuna

#
def position_shift(fname,a):
    """Shift every position by ``a`` along its direction of travel and save the result.

    Reads the CSV ``fname`` (needs ``phone``, ``latDeg``, ``lngDeg`` and the
    column named by the global ``msge``), converts to ECEF, and computes
    ``new = prev + diff * (1 - a / dist)``, which equals
    ``current - a * diff / dist``: the point is moved by ``a`` (in ECEF
    units) towards the previous point of the same phone for positive ``a``,
    and away from it for negative ``a``. Rows whose shifted position is NaN
    (e.g. the first row of each phone, which has no previous point) keep
    their original latitude/longitude.

    Relies on the globals ``msge``, ``sub_columns``, ``WGS84_to_ECEF`` and
    ``ECEF_to_WGS84``.

    Returns the name of the file written: ``'shifted_' + fname``. The prefix
    is put in front of the whole string, so ``fname`` is expected to be a
    bare file name without a directory part.
    """
    
    d = pd.read_csv(fname)
    # Use one constant ellipsoid height for every row (per the original
    # comment, the median height).
    d['heightAboveWgs84EllipsoidM'] = 63.52
    # Row-wise conversion to ECEF; zip(*...) splits the (x, y, z) tuples into three columns.
    d['x'], d['y'], d['z'] = zip(*d.apply(lambda x: WGS84_to_ECEF(x.latDeg, x.lngDeg, x.heightAboveWgs84EllipsoidM), axis=1))

    #a = -0.2
    d.sort_values(['phone', msge], inplace=True)
    for fi in ['x','y','z']:
        # Previous coordinate of the same phone (NaN on the first row of a phone).
        d[[fi+'p']] = d[fi].shift().where(d['phone'].eq(d['phone'].shift()))
        d[[fi+'diff']] = d[fi]-d[fi+'p']
    #d[['yp']] = d['y'].shift().where(d['phone'].eq(d['phone'].shift()))
    # Straight-line distance moved since the previous point.
    d[['dist']] = np.sqrt(d['xdiff']**2 + d['ydiff']**2+ d['zdiff']**2)
    for fi in ['x','y','z']:
        d[[fi+'new']] = d[fi+'p'] + d[fi+'diff']*(1-a/d['dist'])
    lng, lat, alt = ECEF_to_WGS84(d['xnew'].values,d['ynew'].values,d['znew'].values)
    
    # Fall back to the unshifted coordinates wherever the shift produced NaN.
    lng[np.isnan(lng)] = d.loc[np.isnan(lng),'lngDeg']
    lat[np.isnan(lat)] = d.loc[np.isnan(lat),'latDeg']
    d['latDeg'] = lat
    d['lngDeg'] = lng
    
    d.sort_values(['phone',msge],inplace = True)
    ffname = 'shifted_' + fname
    # Only the submission columns are written, in (phone, time) order.
    d[sub_columns].to_csv(ffname, index = False)
    return ffname 
def objective(trial):
    """Optuna objective: score of the train baseline after shifting it by ``a``.

    Samples ``a`` from [-1, 1], writes the shifted copy of 'btrain.csv' via
    position_shift and scores it against 'gt.csv' with compute_dist. Lower is
    better, because the study is created with the default (minimise) direction.
    """
    # suggest_float replaces the deprecated suggest_uniform (same range).
    a = trial.suggest_float('a', -1, 1)
    score, _per_phone = compute_dist(position_shift('btrain.csv', a),'gt.csv')
    return score

# Create an optuna study (default direction: minimise)
study = optuna.create_study()
# Pass the objective to optimize and run 50 trials
study.optimize(objective, n_trials=50)

In [None]:
study.best_params

# position_shiftのcsv化
csvに学習したパラメータを入れる

In [None]:
def phoneTo(df):
    """Swap the combined ``phone`` column for ``collectionName`` / ``phoneName``.

    The two columns are copied from the global ``base_train`` (aligned on
    the index), so ``df`` must share its index with ``base_train``. ``df``
    itself receives the two new columns; the returned frame is a copy
    without ``phone``.
    """
    df["collectionName"] = base_train["collectionName"]
    df["phoneName"] = base_train["phoneName"]
    return df.drop(columns=["phone"])

# evaluate train score

In [None]:
# Train score after each post-processing stage.
# NOTE(review): `stmp` is only assigned in a cell further down in this
# notebook, so on a fresh kernel run top-to-bottom the third line raises
# NameError. That later cell has to be run before this one.
print('kf + reject_outlier : ', get_train_score(train_ro_kf, ground_truth))
print('+ phones_mean_pred : ', get_train_score(train_mean_pred, ground_truth))
print('+ phones_mean_pred + position_shift : ',get_train_score(stmp , ground_truth))

In [None]:
base_train.to_csv('base_train_Shift.csv',index = False)

In [None]:
position_shift('base_train_Shift.csv', a = study.best_params['a'])

In [None]:
shifted_base = pd.read_csv("shifted_base_train_Shift.csv")

In [None]:
# Probably for validation
# Full train pipeline: gauss smoothing -> phone mean -> position shift ->
# outlier rejection -> Kalman smoothing -> phone-mean prediction -> position shift.
smoothed_baseline = apply_gauss_smoothing(base_train, {'sz_1' : 0.85, 'sz_2' : 5.65, 'sz_crit' : 1.5})
base_train = mean_with_other_phones(smoothed_baseline)
base_train[sub_columns].to_csv('smooth.csv',index = False)
# position_shift writes 'shifted_smooth.csv'; its return value is ignored here.
position_shift('smooth.csv', a = study.best_params['a'])
# NOTE(review): this reads the UNSHIFTED 'smooth.csv', and `smoothbase_train`
# is not used anywhere else in the visible notebook.
smoothbase_train = pd.read_csv("./smooth.csv")
# NOTE(review): `shifted_base` comes from 'shifted_base_train_Shift.csv' (an
# earlier cell), not from the shift performed just above. The assignment
# aligns on the index, while position_shift writes rows sorted by phone and
# time - confirm the row order matches base_train.
base_train["latDeg"],base_train["lngDeg"] = shifted_base['latDeg'],shifted_base['lngDeg']
base_train = add_distance_diff(base_train)
print(len(base_train))
th = 50
# Blank positions farther than `th` from both neighbours; rows are kept, so
# the two printed lengths are equal.
base_train.loc[((base_train['dist_prev'] > th) & (base_train['dist_next'] > th)), ['latDeg', 'lngDeg']] = np.nan
print(len(base_train))
train_kf = apply_kf_smoothing(base_train)
train_lerp = make_lerp_data(train_kf)
train_mean_pred = calc_mean_pred(train_kf, train_lerp)
print(len(train_mean_pred))

#sample_sub['latDeg'] = train_mean_pred['latDeg']
#sample_sub['lngDeg'] = train_mean_pred['lngDeg']
#sample_sub
# position_shift needs a 'phone' column in the CSV it reads.
train_mean_pred["phone"] = train_mean_pred["collectionName"]+"_"+train_mean_pred["phoneName"]
train_mean_pred.to_csv('train_mean_pred.csv', index=False)
position_shift('train_mean_pred.csv', a = study.best_params['a'])
stmp = pd.read_csv('./shifted_train_mean_pred.csv')
# NOTE(review): index-aligned copy from base_train; the shifted CSV was
# written in (phone, time) order - confirm it matches base_train's row order.
stmp["collectionName"],stmp["phoneName"] = base_train["collectionName"],base_train["phoneName"]

In [None]:
# Probably for validation
# NOTE(review): this cell is a verbatim repeat of the cell directly above it.
# Running both applies the smoothing and phone averaging to `base_train` a
# second time, because those functions modify the frame in place.
smoothed_baseline = apply_gauss_smoothing(base_train, {'sz_1' : 0.85, 'sz_2' : 5.65, 'sz_crit' : 1.5})
base_train = mean_with_other_phones(smoothed_baseline)
base_train[sub_columns].to_csv('smooth.csv',index = False)
# position_shift writes 'shifted_smooth.csv'; its return value is ignored here.
position_shift('smooth.csv', a = study.best_params['a'])
# NOTE(review): reads the unshifted file; `smoothbase_train` is not used
# anywhere else in the visible notebook.
smoothbase_train = pd.read_csv("./smooth.csv")
# Positions are replaced by the ones loaded earlier into `shifted_base`.
base_train["latDeg"],base_train["lngDeg"] = shifted_base['latDeg'],shifted_base['lngDeg']
base_train = add_distance_diff(base_train)
print(len(base_train))
th = 50
# Blank positions farther than `th` from both neighbours (rows are kept).
base_train.loc[((base_train['dist_prev'] > th) & (base_train['dist_next'] > th)), ['latDeg', 'lngDeg']] = np.nan
print(len(base_train))
train_kf = apply_kf_smoothing(base_train)
train_lerp = make_lerp_data(train_kf)
train_mean_pred = calc_mean_pred(train_kf, train_lerp)
print(len(train_mean_pred))

#sample_sub['latDeg'] = train_mean_pred['latDeg']
#sample_sub['lngDeg'] = train_mean_pred['lngDeg']
#sample_sub
# position_shift needs a 'phone' column in the CSV it reads.
train_mean_pred["phone"] = train_mean_pred["collectionName"]+"_"+train_mean_pred["phoneName"]
train_mean_pred.to_csv('train_mean_pred.csv', index=False)
position_shift('train_mean_pred.csv', a = study.best_params['a'])
stmp = pd.read_csv('./shifted_train_mean_pred.csv')
# Index-aligned copy of the identifying columns from base_train.
stmp["collectionName"],stmp["phoneName"] = base_train["collectionName"],base_train["phoneName"]

# make submission

In [None]:
# make submission
# Same pipeline as for train, applied to the test baseline:
# gauss smoothing -> phone mean -> outlier rejection -> Kalman smoothing ->
# phone-mean prediction.
smoothed_baseline = apply_gauss_smoothing(base_test, {'sz_1' : 0.85, 'sz_2' : 5.65, 'sz_crit' : 1.5})
base_test = mean_with_other_phones(smoothed_baseline)
base_test = add_distance_diff(base_test)
th = 50
# Blank positions that are farther than `th` from both neighbours.
base_test.loc[((base_test['dist_prev'] > th) & (base_test['dist_next'] > th)), ['latDeg', 'lngDeg']] = np.nan

test_kf = apply_kf_smoothing(base_test)
# The Kalman-smoothed frame is `test_kf`; the previously used name
# `base_kf` is never defined and raised NameError.
test_lerp = make_lerp_data(test_kf)
test_mean_pred = calc_mean_pred(test_kf, test_lerp)

# Only the final prediction is stored; the earlier intermediate writes into
# sample_sub were overwritten here anyway.
sample_sub['latDeg'] = test_mean_pred['latDeg']
sample_sub['lngDeg'] = test_mean_pred['lngDeg']
sample_sub.to_csv('test_mean_pred_without_ol,kf.csv', index=False)

In [None]:
sample_sub

In [None]:
# Apply the tuned shift to two CSV files; outputs get the 'shifted_' prefix.
# NOTE(review): no cell visible in this notebook writes 'mean_with_other.csv'
# or 'test_mean_pred.csv' (the submission cell saves
# 'test_mean_pred_without_ol,kf.csv'). Confirm these inputs exist and contain
# the 'phone' column position_shift relies on.
position_shift('mean_with_other.csv', a = study.best_params['a'])
position_shift('test_mean_pred.csv', a = study.best_params['a'])