# doppler_processing_v004

In [1]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score, mean_squared_error
pd.set_option('display.max_rows', 100)

In [2]:
import ipynb_path

def get_nb_name():
    """Return the running notebook's file name without the ``.ipynb`` extension.

    The path is obtained from ``ipynb_path.get()``. Only the last path
    component is used, so a path without any ``/`` (a bare file name, or a
    path using another separator supported by the platform) no longer raises
    ``IndexError``.
    """
    nb_path = ipynb_path.get()
    # PurePath.name isolates the final component; the extension text is then
    # removed exactly as before.
    nb_name = pathlib.PurePath(nb_path).name.replace('.ipynb', '')
    return nb_name

In [3]:
# directory setting
# Inputs come from the competition folder; outputs are written to a
# per-notebook folder that is created on demand.
nb_name = get_nb_name()
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = f'../output/prep/{nb_name}'
os.makedirs(OUTPUT, exist_ok=True)

# utils

In [4]:
def get_train_score(df, gt):
    """Score predicted positions against ground truth.

    ``df`` and ``gt`` are inner-joined on collection, phone and timestamp.
    The haversine error is computed per row, and for every
    collection/phone pair the 50th and 95th error percentiles are averaged.
    The return value is the mean of that average over all pairs.
    """
    join_keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    truth = gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
    merged = df.merge(truth, on=join_keys, how='inner')

    # Per-row distance error between ground truth and prediction.
    merged['err'] = calc_haversine(merged['latDeg_gt'], merged['lngDeg_gt'],
                                   merged['latDeg'], merged['lngDeg'])

    # Aggregate per phone (collection + phone model).
    merged['phone'] = merged['collectionName'] + '_' + merged['phoneName']
    per_phone = merged.groupby('phone')['err'].agg([percentile50, percentile95])
    # Despite its name, this column averages the 50th and 95th percentiles.
    per_phone['p50_p90_mean'] = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone['p50_p90_mean'].mean()

In [5]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points given in decimal degrees.

    Uses the haversine formula with a sphere radius of 6,367,000 (presumably
    metres, so the result is in the same unit). Inputs may be scalars or
    array-likes; the result has the matching shape.
    """
    RADIUS = 6_367_000
    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine of the central angle between the two points.
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    return 2 * RADIUS * np.arcsin(hav**0.5)

In [6]:
def visualize_trafic(df, center, zoom=9):
    """Show the positions in ``df`` on an interactive map, coloured by phone.

    ``df`` must provide ``latDeg``, ``lngDeg`` and ``phoneName`` columns.
    ``center`` is the map centre passed through to plotly and ``zoom`` is the
    initial zoom level.
    """
    scatter_options = dict(
        # Coordinates of each point.
        lat="latDeg",
        lon="lngDeg",
        # One colour series per phone.
        color="phoneName",
        labels="phoneName",
        zoom=zoom,
        center=center,
        height=600,
        width=800,
    )
    fig = px.scatter_mapbox(df, **scatter_options)
    fig.update_layout(
        mapbox_style='stamen-terrain',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()
    
def visualize_collection(df, collection):
    """Plot every row of ``df`` that belongs to ``collection``.

    The map is centred on the mean latitude/longitude of the selected rows.
    """
    selected = df.loc[df['collectionName'] == collection].copy()
    center = {
        "lat": selected['latDeg'].mean(),
        "lon": selected['lngDeg'].mean(),
    }
    visualize_trafic(selected, center)

In [7]:
# ground_truth
def get_ground_truth():
    """Load every ``ground_truth.csv`` under ``INPUT/train/*/*/`` into one frame.

    The individual files are concatenated as-is, so the original row indices
    are kept.
    """
    gt_paths = pathlib.Path(INPUT).glob('train/*/*/ground_truth.csv')
    return pd.concat([pd.read_csv(gt_path) for gt_path in gt_paths])

In [8]:
def percentile50(x):
    """Return the 50th percentile (median) of ``x``."""
    return np.percentile(x, q=50)
def percentile95(x):
    """Return the 95th percentile of ``x``."""
    return np.percentile(x, q=95)

In [9]:
class train_result:
    """Evaluate predicted training positions against the ground truth.

    On construction the predictions in ``df`` are inner-joined with the ground
    truth on collection, phone and timestamp, a per-row haversine error
    (``err``) is added, and percentile summaries are computed per phone
    (collection + phone model), per collection and per phone model.

    Attributes:
        df: predictions merged with ground truth, plus ``phone`` and ``err``.
        gt: ground truth with coordinates renamed to ``latDeg_gt``/``lngDeg_gt``.
        bl: baseline training locations read from ``INPUT``.
        phone_res, clc_res, phonename_res: error summaries (see ``calc_err``).
    """

    def __init__(self, df):
        self.df = df
        self.gt = get_ground_truth()
        self.bl = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')

        self.gt = self.gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
        self.df = self.df.merge(self.gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
        self.df['phone'] = self.df['collectionName'] + '_' + self.df['phoneName']
        self.df['err'] = calc_haversine(self.df['latDeg_gt'], self.df['lngDeg_gt'], self.df['latDeg'], self.df['lngDeg'])

        self.phone_res = self.calc_err('phone')
        self.clc_res = self.calc_err('collectionName')
        self.phonename_res = self.calc_err('phoneName')

    def calc_err(self, by):
        """Summarise ``err`` grouped by column ``by``.

        Returns a frame with ``percentile50``, ``percentile95`` and
        ``p50_p90_mean``; despite its name, the last column is the mean of the
        50th and 95th percentiles.
        """
        res = self.df.groupby(by)['err'].agg([percentile50, percentile95])
        res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
        return res

    @property
    def score(self):
        """Mean of ``p50_p90_mean`` over all phones."""
        return self.phone_res['p50_p90_mean'].mean()

    @property
    def raw_data(self):
        """Merged prediction/ground-truth frame including ``err``."""
        return self.df

    @property
    def err(self):
        """Error summary per phone (collection + phone model)."""
        return self.phone_res

    @property
    def collection_err(self):
        """Error summary per collection."""
        return self.clc_res

    @property
    def phonename_err(self):
        """Error summary per phone model."""
        return self.phonename_res

    def viz_map(self, collection, show_gt=True, show_bl=True):
        """Plot predictions for ``collection``, optionally with ground truth
        (phone names suffixed ``_GT``) and baseline (suffixed ``_BL``)."""
        position_cols = ['collectionName', 'phoneName', 'latDeg', 'lngDeg']
        in_collection = self.df['collectionName'] == collection

        layers = [self.df.loc[in_collection, position_cols]]

        if show_gt:
            truth = self.df.loc[in_collection, ['collectionName', 'phoneName', 'latDeg_gt', 'lngDeg_gt']]
            truth = truth.rename(columns={'latDeg_gt':'latDeg', 'lngDeg_gt':'lngDeg'})
            truth['phoneName'] = truth['phoneName'] + '_GT'
            layers.append(truth)

        if show_bl:
            # Copy so that relabelling never writes into (a view of) self.bl.
            baseline = self.bl.loc[self.bl['collectionName'] == collection, position_cols].copy()
            baseline['phoneName'] = baseline['phoneName'] + '_BL'
            layers.append(baseline)

        # DataFrame.append no longer exists in pandas 2.x; concat is the
        # supported equivalent and keeps the same row order and indices.
        visualize_collection(pd.concat(layers), collection)

In [10]:
def get_data():
    """Load the competition tables from ``INPUT``.

    Returns a 4-tuple: baseline train locations, baseline test locations,
    the sample submission and the concatenated ground truth.
    """
    def _read(file_name):
        return pd.read_csv(INPUT + '/' + file_name)

    base_train = _read('baseline_locations_train.csv')
    base_test = _read('baseline_locations_test.csv')
    sample_sub = _read('sample_submission.csv')
    return base_train, base_test, sample_sub, get_ground_truth()

In [11]:
def viz(df_, outdir):
    """Save one diagnostic figure per phone under ``OUTPUT/<outdir>``.

    Each figure has three stacked panels sharing the time axis:
    ground-truth speed vs. the velocity magnitude ``d``, ground-truth course,
    and the three velocity components. Ground truth is read from the
    module-level ``gt`` frame; ``df_`` itself is not modified.
    """
    target_dir = OUTPUT + '/' + outdir
    os.makedirs(target_dir, exist_ok=True)
    data = df_.copy()
    data['d'] = np.sqrt(data['xVehVel']**2 + data['yVehVel']**2 + data['zVehVel']**2)

    for phone in data['phone'].unique():
        truth = gt[gt['phone']==phone].copy()
        track = data[data['phone']==phone].copy()
        t_truth = truth['millisSinceGpsEpoch']
        t_track = track['millisSinceGpsEpoch']

        fig, (ax_speed, ax_course, ax_vel) = plt.subplots(figsize=(40, 15), nrows=3, sharex=True)

        # Panel 1: ground-truth speed against the velocity magnitude.
        ax_speed.plot(t_truth, truth['speedMps'], label='speed')
        ax_speed.plot(t_track, track['d'], label='d')

        # Panel 2: ground-truth course.
        ax_course.plot(t_truth, truth['courseDegree'], label='deg')

        # Panel 3: individual velocity components.
        for column, label in (('xVehVel', 'xVel'), ('yVehVel', 'yVel'), ('zVehVel', 'zVel')):
            ax_vel.plot(t_track, track[column], label=label)

        for ax in (ax_speed, ax_course, ax_vel):
            ax.legend(loc='upper right')
            ax.grid(color='g', linestyle=':', linewidth=0.3)

        fig.suptitle(phone, fontsize=16)
        fig.savefig(f'{target_dir}/{phone}.png')
        plt.close()

In [12]:
def viz_rolling(df_, outdir):
    """Save one diagnostic figure per phone, including the smoothed speed.

    Same layout as ``viz`` (speed, course, velocity components) with the
    additional ``roll_d`` curve in the first panel. ``df_`` must already
    contain a ``roll_d`` column. Ground truth is read from the module-level
    ``gt`` frame and figures are written under ``OUTPUT/<outdir>``.
    """
    outdir = OUTPUT + '/' + outdir
    os.makedirs(outdir, exist_ok=True)
    df = df_.copy()
    df['d'] = np.sqrt(df['xVehVel']**2 + df['yVehVel']**2 + df['zVehVel']**2)

    for phone in df['phone'].unique():
        gt_tmp = gt[gt['phone']==phone].copy()
        tmp = df[df['phone']==phone].copy()

        fig, axes = plt.subplots(figsize=(40, 15), nrows=3, sharex=True)
        axes[0].plot(gt_tmp['millisSinceGpsEpoch'], gt_tmp['speedMps'], label='speed')
        axes[0].plot(tmp['millisSinceGpsEpoch'], tmp['d'], label='d')
        # Give the smoothed curve its own legend entry; it used to be labelled
        # 'd' as well, which made the two curves indistinguishable.
        axes[0].plot(tmp['millisSinceGpsEpoch'], tmp['roll_d'], label='roll_d')
        axes[0].legend(loc='upper right')
        axes[0].grid(color='g', linestyle=':', linewidth=0.3)

        axes[1].plot(gt_tmp['millisSinceGpsEpoch'], gt_tmp['courseDegree'], label='deg')
        axes[1].legend(loc='upper right')
        axes[1].grid(color='g', linestyle=':', linewidth=0.3)

        axes[2].plot(tmp['millisSinceGpsEpoch'], tmp['xVehVel'], label='xVel')
        axes[2].plot(tmp['millisSinceGpsEpoch'], tmp['yVehVel'], label='yVel')
        axes[2].plot(tmp['millisSinceGpsEpoch'], tmp['zVehVel'], label='zVel')
        axes[2].legend(loc='upper right')
        axes[2].grid(color='g', linestyle=':', linewidth=0.3)
        fig.suptitle(phone, fontsize=16)
        fig.savefig(f'{outdir}/{phone}.png')
        # Close this figure explicitly so figures do not pile up in memory.
        plt.close(fig)

In [13]:
def check_status(df_, ignore_mi8=True):
    """Print the RMSE of the velocity magnitude against ground-truth speed.

    The magnitude ``d`` is recomputed from the three velocity components.
    Rows where it is missing are counted, then dropped before comparing with
    ``speedMps`` from the module-level ``gt`` frame. With ``ignore_mi8`` set,
    rows whose ``phoneName`` is ``'Mi8'`` are excluded first.
    """
    frame = df_[df_['phoneName'] != 'Mi8'].copy() if ignore_mi8 else df_.copy()

    frame['d'] = np.sqrt(frame['xVehVel']**2 + frame['yVehVel']**2 + frame['zVehVel']**2)
    na_cnt = frame['d'].isnull().sum()

    scored = frame.merge(gt, on=['phone', 'millisSinceGpsEpoch'], how='left')
    scored = scored.dropna(subset=['d'])
    score = np.sqrt(mean_squared_error(scored['speedMps'], scored['d']))

    print(f'RMSE : {score}  | nullの数 : {na_cnt}')

# データ読み込み

In [14]:
# Load the baseline positions, the sample submission and the ground truth.
train, test, sub, gt = get_data()
# Add the combined collection + phone key that the helper functions join on.
gt['phone'] = gt['collectionName'] + '_' + gt['phoneName']

In [15]:
# Doppler-derived velocities produced by the doppler_v004 step, ordered by
# phone and time.
dop_train = pd.read_csv('../output/prep/doppler_v004/result_train.csv').sort_values(['phone', 'millisSinceGpsEpoch'])
dop_test = pd.read_csv('../output/prep/doppler_v004/result_test.csv').sort_values(['phone', 'millisSinceGpsEpoch'])

In [16]:
# Save per-phone plots of the raw doppler velocities before any processing.
viz(dop_train, 'train/00_raw')
viz(dop_test, 'test/00_raw')

# train, testの時間に合わせる

In [17]:
def adjust_seconds(df, dop_df):
    """Resample the doppler velocities onto the timestamps of ``df``.

    ``df`` and ``dop_df`` are outer-joined on ``phone`` and
    ``millisSinceGpsEpoch``. Within each phone the velocity columns are
    interpolated against the timestamp, but only between valid samples
    (``limit_area='inside'``), so no value is extrapolated past either end.
    Rows that exist only in ``dop_df`` (no ``collectionName``) are then
    dropped.

    Returns a new frame with a fresh 0..n-1 index, ordered by phone and time,
    with ``millisSinceGpsEpoch`` as the first column.
    """
    vel_cols = ['xVehVel', 'yVehVel', 'zVehVel']
    merged = df.merge(dop_df[['phone', 'millisSinceGpsEpoch'] + vel_cols],
                      on=['phone', 'millisSinceGpsEpoch'], how='outer')
    merged = merged.sort_values(['phone', 'millisSinceGpsEpoch'])

    # Collect the per-phone pieces and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every call.
    pieces = []
    for phone in merged['phone'].unique():
        tmp = merged[merged['phone']==phone].copy()
        tmp = tmp.set_index('millisSinceGpsEpoch')
        tmp[vel_cols] = tmp[vel_cols].interpolate(method='index', limit_area='inside')
        pieces.append(tmp.reset_index())

    if pieces:
        out = pd.concat(pieces)
    else:
        # No rows at all: keep the same column layout as the normal path.
        out = merged.set_index('millisSinceGpsEpoch').reset_index()

    out = out.dropna(subset=['collectionName'])
    out = out.reset_index(drop=True)
    return out

In [18]:
# Attach the doppler velocities, interpolated to the baseline timestamps.
train = adjust_seconds(train, dop_train)
test = adjust_seconds(test, dop_test)

In [19]:
# Plot the time-aligned velocities and report RMSE / missing count on train.
viz(train, 'train/01_adjust_seconds')
viz(test, 'test/01_adjust_seconds')
check_status(train)

RMSE : 0.41813347049858096  | nullの数 : 229


# speed変化量が大きすぎるデータを除外

In [20]:
# Threshold on the change of the speed magnitude `d` between neighbouring
# samples; presumably in m/s, since `d` is compared with speedMps elsewhere.
th=5

In [21]:
# Speed magnitude from the three velocity components, for both data sets.
for frame in (train, test):
    frame['d'] = np.sqrt(frame['xVehVel']**2 + frame['yVehVel']**2 + frame['zVehVel']**2)

In [22]:
# Absolute change of `d` relative to the previous and the next sample of the
# same phone.
for frame in (train, test):
    by_phone = frame.groupby('phone')['d']
    frame['d_diff_prev'] = (frame['d'] - by_phone.shift(1)).abs()
    frame['d_diff_next'] = (frame['d'] - by_phone.shift(-1)).abs()

In [23]:
# Blank out isolated spikes: samples whose speed differs from both neighbours
# by more than `th`.
for frame in (train, test):
    is_spike = (frame['d_diff_prev'] > th) & (frame['d_diff_next'] > th)
    frame.loc[is_spike, ['xVehVel', 'yVehVel', 'zVehVel', 'd']] = np.nan

In [24]:
# Plot and re-check after removing the spike samples.
viz(train, 'train/02_change_diff_reject')
viz(test, 'test/02_change_diff_reject')
check_status(train)

RMSE : 0.32169149585772394  | nullの数 : 290


# speedが大きすぎるデータを除外

In [25]:
# Blank out samples whose speed magnitude exceeds 50.
for frame in (train, test):
    too_fast = frame['d'] > 50
    frame.loc[too_fast, ['xVehVel', 'yVehVel', 'zVehVel', 'd']] = np.nan

In [26]:
# Plot and re-check after removing the implausibly fast samples.
viz(train, 'train/03_hi_speed_reject')
viz(test, 'test/03_hi_speed_reject')
check_status(train)

RMSE : 0.32169149585772394  | nullの数 : 290


# 移動平均確認

In [27]:
def rolling_test(df_, window, min_periods, center):
    """Evaluate a per-phone rolling mean of the speed magnitude ``d``.

    Adds ``roll_d`` (rolling mean of ``d`` within each phone), prints its RMSE
    against ``speedMps`` from the module-level ``gt`` frame together with the
    number of missing ``roll_d`` values, and saves per-phone plots under
    ``train/04_rolling``. ``df_`` is not modified.

    Args:
        df_: frame with ``phone``, ``millisSinceGpsEpoch``, ``d`` and the
            velocity columns.
        window, min_periods, center: passed to ``Series.rolling``.
    """
    df = df_.copy()
    # transform returns values aligned to the original row index, so the
    # result is correct even when rows are not ordered by phone (a positional
    # assignment of the grouped result would silently mix phones).
    df['roll_d'] = df.groupby('phone')['d'].transform(
        lambda s: s.rolling(window, min_periods=min_periods, center=center).mean())
    df = df.merge(gt, on=['phone', 'millisSinceGpsEpoch'], how='left')
    na_cnt = df['roll_d'].isnull().sum()
    df = df.dropna(subset=['roll_d'])
    score = np.sqrt(mean_squared_error(df['speedMps'], df['roll_d']))
    print(f'RMSE : {score}  | nullの数 : {na_cnt}')

    viz_rolling(df, 'train/04_rolling')

In [28]:
# Try a centred 5-sample window that needs at least 3 valid samples.
rolling_test(train, window=5, min_periods=3, center=True)

RMSE : 0.45051989494688127  | nullの数 : 429


# 移動平均実施

In [29]:
# The rolling result is assigned positionally below, so both frames must be
# ordered by phone and time first (groupby emits groups in sorted key order).
train = train.sort_values(['phone', 'millisSinceGpsEpoch'])
test = test.sort_values(['phone', 'millisSinceGpsEpoch'])

# Select the velocity columns with a list: tuple-style selection on a groupby
# is deprecated (it triggered a FutureWarning) and fails on recent pandas.
train[['xVehVel_roll', 'yVehVel_roll', 'zVehVel_roll']] = train.groupby('phone')[['xVehVel', 'yVehVel', 'zVehVel']].rolling(5, min_periods=3, center=True).mean().values
test[['xVehVel_roll', 'yVehVel_roll', 'zVehVel_roll']] = test.groupby('phone')[['xVehVel', 'yVehVel', 'zVehVel']].rolling(5, min_periods=3, center=True).mean().values

  after removing the cwd from sys.path.
  """


In [30]:
# Speed magnitude of the smoothed velocity components, for both data sets.
for frame in (train, test):
    frame['roll_d'] = np.sqrt(frame['xVehVel_roll']**2 + frame['yVehVel_roll']**2 + frame['zVehVel_roll']**2)

# 相対座標算出

In [31]:
def WGS84_to_ECEF(lat, lon, alt):
    """Convert geodetic coordinates to Earth-centred, Earth-fixed x, y, z.

    ``lat`` and ``lon`` are in degrees and ``alt`` is the height above the
    ellipsoid, in the same length unit as the semi-major axis used here
    (6378137.0). Scalars and numpy arrays are both accepted.
    """
    deg_to_rad = np.pi / 180.0
    rad_lat = lat * deg_to_rad
    rad_lon = lon * deg_to_rad

    # Ellipsoid constants: semi-major axis, flattening, squared eccentricity.
    semi_major = 6378137.0
    inverse_flattening = 298.257223563
    flattening = 1 / inverse_flattening
    ecc_sq = 1 - (1 - flattening) * (1 - flattening)

    sin_lat = np.sin(rad_lat)
    cos_lat = np.cos(rad_lat)

    # Radius of curvature in the prime vertical.
    prime_vertical = semi_major / np.sqrt(1 - ecc_sq * sin_lat * sin_lat)

    horizontal = (prime_vertical + alt) * cos_lat
    x = horizontal * np.cos(rad_lon)
    y = horizontal * np.sin(rad_lon)
    z = (prime_vertical * (1 - ecc_sq) + alt) * sin_lat
    return x, y, z

# Transformer from geocentric (ECEF) coordinates to longitude/latitude on the
# WGS84 ellipsoid. Built once at module level and reused by ECEF_to_WGS84.
transformer = pyproj.Transformer.from_crs(
    {"proj":'geocent', "ellps":'WGS84', "datum":'WGS84'},
    {"proj":'latlong', "ellps":'WGS84', "datum":'WGS84'},)

def ECEF_to_WGS84(x,y,z):
    """Convert ECEF x, y, z to geodetic coordinates via the shared transformer.

    Returns a ``(lon, lat, alt)`` tuple, with angles in degrees
    (``radians=False``). Note the longitude-first order.
    """
    longitude, latitude, altitude = transformer.transform(x, y, z, radians=False)
    return longitude, latitude, altitude

In [32]:
# Height used for the ECEF conversion in calc_rel is fixed to 0.
train['h'] = 0
test['h'] = 0

In [33]:
def calc_rel(df):
    """Add the lat/lng offsets implied by the velocity vectors.

    Each position (``latDeg``, ``lngDeg``, ``h``) is converted to ECEF, the
    velocity vector is added, and the displaced point is converted back to
    longitude/latitude. The difference to the original position is stored as
    ``lat_rel``/``lng_rel`` for the raw velocities and as
    ``lat_rel_roll``/``lng_rel_roll`` for the smoothed (``*_roll``) ones.

    The intermediate columns ``x``, ``y``, ``z``, ``*_add`` and ``*_add_roll``
    are kept as well. ``df`` is modified in place and also returned.
    """
    # Whole columns are converted at once instead of row by row.
    df['x'], df['y'], df['z'] = WGS84_to_ECEF(
        df['latDeg'].to_numpy(), df['lngDeg'].to_numpy(), df['h'].to_numpy())

    # '' -> raw velocities, '_roll' -> rolling-mean velocities. The rolling
    # branch converts its own displaced point; previously it re-used the raw
    # x_add/y_add/z_add, so the *_rel_roll columns duplicated the raw ones.
    for suffix in ('', '_roll'):
        for axis in ('x', 'y', 'z'):
            df[f'{axis}_add{suffix}'] = df[axis] + df[f'{axis}VehVel{suffix}']

        lng_add, lat_add, h_add = ECEF_to_WGS84(
            df[f'x_add{suffix}'].to_numpy(),
            df[f'y_add{suffix}'].to_numpy(),
            df[f'z_add{suffix}'].to_numpy())
        df[f'lng_add{suffix}'] = lng_add
        df[f'lat_add{suffix}'] = lat_add
        df[f'h_add{suffix}'] = h_add

        df[f'lat_rel{suffix}'] = df[f'lat_add{suffix}'] - df['latDeg']
        df[f'lng_rel{suffix}'] = df[f'lng_add{suffix}'] - df['lngDeg']

    return df

In [34]:
# Add the relative lat/lng offsets derived from the velocities.
train = calc_rel(train)
test = calc_rel(test)

In [35]:
# Inspect the columns available after all processing steps.
train.columns

Index(['millisSinceGpsEpoch', 'collectionName', 'phoneName', 'latDeg',
       'lngDeg', 'heightAboveWgs84EllipsoidM', 'phone', 'xVehVel', 'yVehVel',
       'zVehVel', 'd', 'd_diff_prev', 'd_diff_next', 'xVehVel_roll',
       'yVehVel_roll', 'zVehVel_roll', 'roll_d', 'h', 'x', 'y', 'z', 'x_add',
       'y_add', 'z_add', 'lng_add', 'lat_add', 'h_add', 'lat_rel', 'lng_rel',
       'x_add_roll', 'y_add_roll', 'z_add_roll', 'lng_add_roll',
       'lat_add_roll', 'h_add_roll', 'lat_rel_roll', 'lng_rel_roll'],
      dtype='object')

In [36]:
# Columns written to the result files: keys, raw and smoothed velocities,
# their magnitudes, and the relative lat/lng offsets.
cols = ['millisSinceGpsEpoch', 'phone', 'xVehVel', 'yVehVel', 'zVehVel', 'd', 
        'xVehVel_roll', 'yVehVel_roll', 'zVehVel_roll', 
        'lat_rel', 'lng_rel', 'lat_rel_roll', 'lng_rel_roll', 'roll_d']

In [37]:
# Write the selected columns to the notebook's output folder, without the index.
train[cols].to_csv(OUTPUT+'/train_result.csv', index=False)
test[cols].to_csv(OUTPUT+'/test_result.csv', index=False)