# exp136_rel_pred評価2

In [1]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)
import scipy.interpolate
import scipy.sparse

In [2]:
import ipynb_path

def get_nb_name():
    """Return the running notebook's file name without the ``.ipynb`` suffix.

    The notebook path comes from ``ipynb_path.get()``.  Only the last path
    component is kept, and only a trailing ``.ipynb`` is stripped, so a bare
    file name (no directory part) or a name that merely contains ``.ipynb``
    in the middle is handled correctly.
    """
    nb_path = str(ipynb_path.get())
    # basename also works when the path has no separator at all, where
    # rsplit('/', 1)[1] would raise IndexError.
    nb_name = os.path.basename(nb_path)
    suffix = '.ipynb'
    if nb_name.endswith(suffix):
        nb_name = nb_name[:-len(suffix)]
    return nb_name

In [3]:
# Directory layout: data is read from INPUT; this notebook's artifacts go to
# OUTPUT, a folder named after the notebook itself.
INPUT = '../input/google-smartphone-decimeter-challenge'
nb_name = get_nb_name()
OUTPUT = f'../output/{nb_name}'
os.makedirs(OUTPUT, exist_ok=True)  # no error when the folder already exists

# utils

In [4]:
def get_train_score(df, gt):
    """Score predicted positions against ground truth.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictions with columns ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg``.
    gt : pandas.DataFrame
        Ground truth with the same key columns plus ``latDeg``/``lngDeg``.

    Returns
    -------
    float
        For every phone (``collectionName`` + ``'_'`` + ``phoneName``) the
        50th and 95th percentiles of the haversine error are averaged; the
        mean of that value over all phones is returned.  Rows without a
        ground-truth match are dropped by the inner join.
    """
    keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    truth = gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
    merged = df.merge(truth, on=keys, how='inner')

    # Distance between the predicted and the true position of each row.
    merged['err'] = calc_haversine(
        merged['latDeg_gt'], merged['lngDeg_gt'],
        merged['latDeg'], merged['lngDeg'],
    )

    merged['phone'] = merged['collectionName'] + '_' + merged['phoneName']
    per_phone = merged.groupby('phone')['err'].agg([percentile50, percentile95])
    # Mean of the 50th and 95th percentile error of each phone.
    p50_p95_mean = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return p50_p95_mean.mean()

In [5]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two sets of points.

    All four arguments are scalars or array-likes in decimal degrees.  The
    sphere radius used is 6,367,000, so the result is expressed in the unit
    of that radius (presumably metres).
    """
    earth_radius = 6_367_000

    # Work in radians throughout.
    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * earth_radius * np.arcsin(hav ** 0.5)

In [6]:
def visualize_trafic(df, center, zoom=9):
    """Show the points of ``df`` on an interactive map, coloured by phone.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``latDeg``, ``lngDeg`` and ``phoneName`` columns.
    center : dict
        Map centre, forwarded unchanged to plotly's ``center`` argument.
    zoom : int, optional
        Initial zoom level, forwarded to plotly (default 9).
    """
    # ``labels`` is deliberately not passed: plotly express expects a dict
    # mapping column names to display names there, not a column name.
    fig = px.scatter_mapbox(
        df,
        # point coordinates
        lat="latDeg",
        lon="lngDeg",
        # one colour per phone
        color="phoneName",
        zoom=zoom,
        center=center,
        height=600,
        width=800,
    )
    # NOTE(review): the 'stamen-terrain' tiles may no longer be served
    # without a Stadia Maps account - confirm the map still renders.
    fig.update_layout(
        mapbox_style='stamen-terrain',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()
    
def visualize_collection(df, collection):
    """Plot the rows of ``df`` whose ``collectionName`` equals ``collection``.

    The map is centred on the mean ``latDeg`` / ``lngDeg`` of those rows.
    """
    rows = df.loc[df['collectionName'] == collection].copy()
    center = {
        "lat": rows['latDeg'].mean(),
        "lon": rows['lngDeg'].mean(),
    }
    visualize_trafic(rows, center)

In [7]:
# ground_truth
def get_ground_truth():
    """Load every ``ground_truth.csv`` below ``INPUT/train`` into one frame.

    Files matching ``train/*/*/ground_truth.csv`` are read and concatenated
    in the order the glob yields them; each file keeps its own row index.
    """
    input_root = pathlib.Path(INPUT)
    frames = [
        pd.read_csv(csv_path)
        for csv_path in input_root.glob('train/*/*/ground_truth.csv')
    ]
    return pd.concat(frames)

In [8]:
def percentile50(x):
    """Return the 50th percentile (median) of ``x``."""
    median = np.percentile(x, q=50)
    return median
def percentile95(x):
    """Return the 95th percentile of ``x``."""
    upper = np.percentile(x, q=95)
    return upper

In [9]:
class train_result:
    """Error statistics and map views for predictions on the train set.

    On construction the predictions are inner-joined with the ground truth,
    the haversine error of every row is stored in ``err``, and percentile
    summaries are computed per phone, per collection and per phone model.

    Each summary frame has the columns ``percentile50``, ``percentile95``
    and ``p50_p90_mean``.  Despite its name, ``p50_p90_mean`` is the mean of
    the 50th and the 95th percentile; the name is kept so existing users of
    these frames keep working.
    """

    def __init__(self, df):
        """Join ``df`` with the ground truth and pre-compute all summaries.

        ``df`` needs the columns ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg``.
        """
        self.df = df
        self.gt = get_ground_truth()
        self.bl = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')

        self.gt = self.gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
        self.df = self.df.merge(
            self.gt,
            on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'],
            how='inner',
        )
        self.df['phone'] = self.df['collectionName'] + '_' + self.df['phoneName']
        self.df['err'] = calc_haversine(
            self.df['latDeg_gt'], self.df['lngDeg_gt'],
            self.df['latDeg'], self.df['lngDeg'],
        )

        self.phone_res = self.calc_err('phone')
        self.clc_res = self.calc_err('collectionName')
        self.phonename_res = self.calc_err('phoneName')

    def calc_err(self, by):
        """Return p50 / p95 of ``err`` and their mean, grouped by ``by``."""
        res = self.df.groupby(by)['err'].agg([percentile50, percentile95])
        res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
        return res

    @property
    def score(self):
        """Mean of the per-phone ``p50_p90_mean`` values."""
        return self.phone_res['p50_p90_mean'].mean()

    @property
    def raw_data(self):
        """Predictions joined with ground truth, including ``err``."""
        return self.df

    @property
    def err(self):
        """Summary grouped by phone (collection + phone model)."""
        return self.phone_res

    @property
    def collection_err(self):
        """Summary grouped by ``collectionName``."""
        return self.clc_res

    @property
    def phonename_err(self):
        """Summary grouped by ``phoneName``."""
        return self.phonename_res

    def _map_frame(self, collection, show_gt=True, show_bl=True):
        """Build the frame that ``viz_map`` plots for one collection.

        The frame always holds the predicted track.  Ground-truth rows are
        added with ``_GT`` appended to ``phoneName`` when ``show_gt`` is
        true, and baseline rows with ``_BL`` appended when ``show_bl`` is
        true.
        """
        cols = ['collectionName', 'phoneName', 'latDeg', 'lngDeg']
        in_collection = self.df['collectionName'] == collection
        frames = [self.df.loc[in_collection, cols]]

        if show_gt:
            truth = self.df.loc[
                in_collection,
                ['collectionName', 'phoneName', 'latDeg_gt', 'lngDeg_gt'],
            ].rename(columns={'latDeg_gt': 'latDeg', 'lngDeg_gt': 'lngDeg'})
            frames.append(truth.assign(phoneName=truth['phoneName'] + '_GT'))

        if show_bl:
            baseline = self.bl.loc[self.bl['collectionName'] == collection, cols]
            # assign() returns a new frame, so self.bl is never modified.
            frames.append(baseline.assign(phoneName=baseline['phoneName'] + '_BL'))

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        return pd.concat(frames)

    def viz_map(self, collection, show_gt=True, show_bl=True):
        """Plot predictions (and optionally ground truth / baseline) on a map."""
        visualize_collection(self._map_frame(collection, show_gt, show_bl), collection)

In [29]:
def get_data():
    """Load the baseline tables, the sample submission and the ground truth.

    Returns the tuple ``(base_train, base_test, sample_sub, ground_truth)``;
    the three CSV files are read from the ``INPUT`` directory.
    """
    def _read(file_name):
        return pd.read_csv(f'{INPUT}/{file_name}')

    return (
        _read('baseline_locations_train.csv'),
        _read('baseline_locations_test.csv'),
        _read('sample_submission.csv'),
        get_ground_truth(),
    )

In [53]:
# Targets: per phone, the change in latitude / longitude from each
# ground-truth row to the next row of the same phone.
gt = get_ground_truth()
gt['phone'] = gt['collectionName'] + '_' + gt['phoneName']
for axis in ('lat', 'lng'):
    next_pos = gt.groupby('phone')[f'{axis}Deg'].shift(-1)
    gt[f'{axis}_diff'] = next_pos - gt[f'{axis}Deg']
# Rows without a following row in their phone group have no diff; drop them.
gt = gt.dropna(subset=['lat_diff'])

In [57]:
# Predicted diffs of the rel_pred_v002 run, renamed so they do not collide
# with the ground-truth diff columns, then joined onto the ground truth.
dp = (
    pd.read_csv('../output/prep/rel_pred_v002/train_result.csv')
    .rename(columns={'lat_diff': 'lat_diff_pred', 'lng_diff': 'lng_diff_pred'})
)
df = pd.merge(gt, dp, how='inner', on=['phone', 'millisSinceGpsEpoch'])

In [58]:
# Start the running prediction sums from the current prediction columns.
for pred_col in ('lat_diff_pred', 'lng_diff_pred'):
    df[f'{pred_col}_sum'] = df[pred_col]

In [59]:
# Imported here so the cell also works when the notebook is run top to bottom.
from sklearn.metrics import mean_squared_error

# RMSE of the running mean of the predictions as more runs are added.
merge_keys = ['phone', 'millisSinceGpsEpoch']
pred_cols = ['lat_diff_pred', 'lng_diff_pred']

#print('lat_total_pred0 : ', np.sqrt(mean_squared_error(df['lat_diff'], df['lat_diff_pred'])))
print('lng_total_pred0 : ', np.sqrt(mean_squared_error(df['lng_diff'], df['lng_diff_pred'])))

for i, s in enumerate(['2', '6', '8', '12', '13', '14'], start=1):
    n_models = i + 1  # the run already in df plus the i runs added so far
    dp_tmp = pd.read_csv(f'../output/prep/rel_pred_v002_s{s}/train_result.csv')
    dp_tmp = dp_tmp.rename(columns={'lat_diff':'lat_diff_pred', 'lng_diff':'lng_diff_pred'})
    # Look the new predictions up by key for every row of df instead of
    # relying on two independent merges producing rows in the same order.
    # A left join keeps df's row order; validate= rejects duplicate keys in
    # dp_tmp, which would otherwise multiply rows.
    tmp = df[merge_keys].merge(
        dp_tmp[merge_keys + pred_cols],
        on=merge_keys,
        how='left',
        validate='many_to_one',
    )
    df['lat_diff_pred_sum'] += tmp['lat_diff_pred'].to_numpy()
    df['lng_diff_pred_sum'] += tmp['lng_diff_pred'].to_numpy()
    df['lat_diff_pred'] = df['lat_diff_pred_sum'] / n_models
    df['lng_diff_pred'] = df['lng_diff_pred_sum'] / n_models
    #print(f'lat_total_pred{i} : ', np.sqrt(mean_squared_error(df['lat_diff'], df['lat_diff_pred'])))
    print(f'lng_total_pred{i} : ', np.sqrt(mean_squared_error(df['lng_diff'], df['lng_diff_pred'])))

lng_total_pred0 :  8.251011628083893e-06
lng_total_pred1 :  8.166795748799905e-06
lng_total_pred2 :  8.138437018588757e-06
lng_total_pred3 :  8.14030347905087e-06
lng_total_pred4 :  8.147439226783996e-06
lng_total_pred5 :  8.13996631693178e-06
lng_total_pred6 :  8.146599465479595e-06


In [None]:
# Load the prediction files of three runs, in the order dp42, dp2, dp13.
dp42, dp2, dp13 = map(pd.read_csv, (
    '../output/prep/rel_pred_v002/train_result.csv',
    '../output/prep/rel_pred_v002_s2/train_result.csv',
    '../output/prep/rel_pred_v002_s13/train_result.csv',
))

In [18]:
# Give each run's diff columns a numbered name (pred1, pred2, pred3) so the
# three frames can be merged side by side.
dp42, dp2, dp13 = (
    frame.rename(columns={'lat_diff': f'lat_diff_pred{k}', 'lng_diff': f'lng_diff_pred{k}'})
    for k, frame in enumerate((dp42, dp2, dp13), start=1)
)

In [19]:
# Inner-join the three prediction frames onto the ground truth, one by one.
df = gt
for preds in (dp42, dp2, dp13):
    df = df.merge(preds, on=['phone', 'millisSinceGpsEpoch'], how='inner')

In [20]:
from sklearn.metrics import mean_squared_error

In [21]:
# Display the frame holding the ground truth merged with the three prediction sets.
df

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,timeSinceFirstFixSeconds,hDop,vDop,speedMps,courseDegree,phone,lat_diff,lng_diff,lat_diff_pred1,lng_diff_pred1,lat_diff_pred2,lng_diff_pred2,lat_diff_pred3,lng_diff_pred3
0,2020-05-21-US-MTV-2,Pixel4,1274131364434,37.628927,-122.426295,65.36,64.43,2.60,0.0,0.0,154.3,2020-05-21-US-MTV-2_Pixel4,0.000000e+00,1.600000e-09,-4.025045e-07,-4.139925e-07,-6.397906e-07,-1.786120e-06,-1.556622e-07,-9.852394e-07
1,2020-05-21-US-MTV-2,Pixel4,1274131365434,37.628927,-122.426295,65.36,65.43,2.60,0.0,0.0,154.3,2020-05-21-US-MTV-2_Pixel4,0.000000e+00,0.000000e+00,-2.355571e-07,-5.440607e-08,-8.040144e-07,-5.121769e-07,3.803672e-08,-4.954968e-07
2,2020-05-21-US-MTV-2,Pixel4,1274131366434,37.628927,-122.426295,65.36,66.43,2.60,0.0,0.0,154.3,2020-05-21-US-MTV-2_Pixel4,0.000000e+00,0.000000e+00,-4.977266e-08,-1.669316e-07,-1.179583e-07,-3.101186e-07,-1.507230e-07,-2.744809e-07
3,2020-05-21-US-MTV-2,Pixel4,1274131367434,37.628927,-122.426295,65.36,67.43,2.60,0.0,0.0,154.3,2020-05-21-US-MTV-2_Pixel4,0.000000e+00,0.000000e+00,-2.611246e-09,-2.519251e-07,-2.881519e-08,-1.406822e-07,-1.046037e-07,6.541204e-08
4,2020-05-21-US-MTV-2,Pixel4,1274131368434,37.628927,-122.426295,65.36,68.43,2.60,0.0,0.0,154.3,2020-05-21-US-MTV-2_Pixel4,0.000000e+00,0.000000e+00,4.288868e-08,-2.893320e-08,6.661692e-08,1.025676e-07,-4.294818e-09,8.310225e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117103,2020-05-29-US-MTV-2,Pixel4XL,1274832058447,37.415923,-122.080712,37.54,2119.45,0.70,0.0,0.0,303.1,2020-05-29-US-MTV-2_Pixel4XL,9.000019e-10,0.000000e+00,-6.520392e-06,2.877224e-06,2.357255e-06,-4.747137e-06,-2.801755e-07,-9.273281e-08
117104,2020-05-29-US-MTV-2,Pixel4XL,1274832059447,37.415923,-122.080712,37.54,2120.45,0.95,0.0,0.0,303.1,2020-05-29-US-MTV-2_Pixel4XL,-1.700002e-09,-8.000001e-10,-6.515008e-06,2.665533e-07,2.297381e-07,-1.102084e-06,-1.334583e-06,-9.273281e-08
117105,2020-05-29-US-MTV-2,Pixel4XL,1274832060447,37.415923,-122.080712,37.54,2121.45,1.40,0.0,0.0,303.1,2020-05-29-US-MTV-2_Pixel4XL,0.000000e+00,0.000000e+00,-4.530350e-06,2.665533e-07,-2.931326e-06,7.021653e-07,1.509637e-06,-9.273281e-08
117106,2020-05-29-US-MTV-2,Pixel4XL,1274832061447,37.415923,-122.080712,37.54,2122.45,1.60,0.0,0.0,303.1,2020-05-29-US-MTV-2_Pixel4XL,2.500002e-09,-5.800004e-09,-3.881492e-06,-3.193258e-06,-5.282747e-06,8.273383e-06,1.271660e-07,7.811511e-06


In [28]:
# RMSE against the true diff when averaging the first k prediction columns
# (k = 1, 2, 3), printed for latitude first and then longitude.
for axis in ('lat', 'lng'):
    running_sum = 0
    for k in (1, 2, 3):
        running_sum = running_sum + df[f'{axis}_diff_pred{k}']
        rmse = np.sqrt(mean_squared_error(df[f'{axis}_diff'], running_sum / k))
        print(f'{axis}_total_pred{k} : ', rmse)
    print() if axis == 'lat' else None

lat_total_pred1 :  1.0615589406119969e-05
lat_total_pred2 :  1.0576017103771797e-05
lat_total_pred3 :  1.0564566566057711e-05
lng_total_pred1 :  8.251011628083893e-06
lng_total_pred2 :  8.166795748799905e-06
lng_total_pred3 :  8.145859260303413e-06
