# exp135_異常値除去検討

In [1]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)
import scipy.interpolate
import scipy.sparse

In [2]:
# Load the area-labeling result and split its collection names by the
# integer label stored in column 'g' (labels 1 through 5).
area_labeling = pd.read_csv('../output/prep/area_labeling/result.csv')

g1, g2, g3, g4, g5 = (
    list(area_labeling.loc[area_labeling['g'] == label, 'collectionName'])
    for label in range(1, 6)
)

In [3]:
# Collections processed by this notebook: every name in g1 followed by every name in g2.
target = [*g1, *g2]

In [4]:
# Threshold for outlier rejection based on relative movement distance.
ro_th = 50
# Threshold for outlier rejection based on ground_truth.
rog_th = 10

# Collections that get the ground_truth-based outlier rejection.
# NOTE(review): '2021-04-28-US-MTV-1' and '2021-04-29-US-MTV-1' are listed
# twice; kept exactly as written -- confirm whether other names were intended.
rog_target = [
    '2021-04-22-US-SJC-1',
    '2021-04-29-US-SJC-2',
    '2021-04-28-US-SJC-1',
    '2021-04-22-US-SJC-2',
    '2021-04-29-US-SJC-3',
    '2021-04-28-US-MTV-1',
    '2021-04-29-US-MTV-1',
    '2021-03-16-US-RWC-2',
    '2021-04-28-US-MTV-2',
    '2021-04-29-US-MTV-2',
    '2021-04-26-US-SVL-2',
    '2021-03-10-US-SVL-1',
    '2021-04-26-US-SVL-1',
    '2021-04-21-US-MTV-1',
    '2021-04-28-US-MTV-1',
    '2021-04-29-US-MTV-1',
]

In [5]:
import ipynb_path

def get_nb_name():
    """Return the current notebook's file name without the '.ipynb' part.

    The path comes from ``ipynb_path.get()``; everything up to and including
    the last '/' is discarded, then every '.ipynb' occurrence is removed.
    """
    file_name = ipynb_path.get().rsplit('/', 1)[1]
    return file_name.replace('.ipynb', '')

In [6]:
# directory setting
# Outputs are written to a folder named after this notebook.
nb_name = get_nb_name()
# Root folder that the loader functions in this notebook read from.
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = '../output/' + nb_name
# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(OUTPUT, exist_ok=True)

# utils

In [7]:
def get_train_score(df, gt):
    """Score predictions against ground truth.

    ``df`` and ``gt`` are inner-joined on collectionName / phoneName /
    millisSinceGpsEpoch. The haversine error of every matched row is
    summarised per phone as the mean of its 50th and 95th percentiles,
    and the average of those per-phone values is returned.
    """
    join_keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    truth = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    merged = df.merge(truth, on=join_keys, how='inner')

    # distance between ground truth and prediction for every matched row
    merged['err'] = calc_haversine(merged['latDeg_gt'], merged['lngDeg_gt'],
                                   merged['latDeg'], merged['lngDeg'])

    # per-phone summary; the column is called 'p50_p90_mean' although it is
    # built from the 50th and 95th percentiles
    merged['phone'] = merged['collectionName'] + '_' + merged['phoneName']
    per_phone = merged.groupby('phone')['err'].agg([percentile50, percentile95])
    per_phone['p50_p90_mean'] = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone['p50_p90_mean'].mean()

In [8]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like (or scalars) and specified in
    decimal degrees.

    The result has the same shape as the broadcast inputs and is expressed
    in the unit of ``RADIUS`` (6,367,000 -- presumably metres). NaN inputs
    give NaN outputs.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would turn arcsin into NaN. Clamp it; NaN still propagates.
    a = np.minimum(a, 1.0)
    dist = 2 * RADIUS * np.arcsin(np.sqrt(a))
    return dist

In [9]:
def visualize_trafic(df, center, zoom=9):
    """Show the points of ``df`` on an interactive plotly map.

    Points are placed by the 'latDeg' / 'lngDeg' columns and coloured by
    'phoneName'. ``center`` and ``zoom`` are forwarded to plotly unchanged.
    """
    scatter_options = dict(
        # coordinates of every point
        lat="latDeg",
        lon="lngDeg",
        # one colour per series
        color="phoneName",
        # NOTE(review): plotly documents `labels` as a dict; the string is
        # passed through exactly as before -- confirm the intent.
        labels="phoneName",
        zoom=zoom,
        center=center,
        height=600,
        width=800,
    )
    fig = px.scatter_mapbox(df, **scatter_options)
    fig.update_layout(
        mapbox_style='stamen-terrain',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()
    
def visualize_collection(df, collection):
    """Map the rows of one collection, centred on their mean coordinates."""
    rows = df.loc[df['collectionName'] == collection].copy()
    map_center = {
        "lat": rows['latDeg'].mean(),
        "lon": rows['lngDeg'].mean(),
    }
    visualize_trafic(rows, map_center)

In [10]:
# ground_truth
def get_ground_truth():
    """Read every 'train/*/*/ground_truth.csv' under INPUT into one DataFrame.

    The files are concatenated in the order the glob yields them; the row
    index of each file is kept as-is.
    """
    gt_paths = list(pathlib.Path(INPUT).glob('train/*/*/ground_truth.csv'))
    return pd.concat([pd.read_csv(gt_path) for gt_path in gt_paths])

In [11]:
def percentile50(x):
    """Return the 50th percentile of ``x`` as computed by ``np.percentile``.

    The function name doubles as the column label when this is passed to
    ``groupby(...).agg([...])``.
    """
    median = np.percentile(x, q=50)
    return median
def percentile95(x):
    """Return the 95th percentile of ``x`` as computed by ``np.percentile``.

    The function name doubles as the column label when this is passed to
    ``groupby(...).agg([...])``.
    """
    upper = np.percentile(x, q=95)
    return upper

In [12]:
class train_result:
    """Predictions joined with ground truth, plus error summaries and maps.

    On construction the ground truth (``get_ground_truth()``) and the
    baseline locations CSV under ``INPUT`` are loaded, ``df`` is inner-joined
    with the ground truth on collectionName / phoneName /
    millisSinceGpsEpoch, and the haversine error of every matched row is
    stored in column 'err'. Error summaries are pre-computed per phone, per
    collection and per phone model.
    """

    def __init__(self, df):
        """Join ``df`` (needs collectionName, phoneName, millisSinceGpsEpoch,
        latDeg, lngDeg) with the ground truth and pre-compute the summaries."""
        self.df = df
        self.gt = get_ground_truth()
        self.bl = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')

        self.gt = self.gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
        self.df = self.df.merge(self.gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
        self.df['phone'] = self.df['collectionName'] + '_' + self.df['phoneName']
        self.df['err'] =  calc_haversine(self.df['latDeg_gt'], self.df['lngDeg_gt'], self.df['latDeg'], self.df['lngDeg'])

        self.phone_res = self.calc_err('phone')
        self.clc_res = self.calc_err('collectionName')
        self.phonename_res = self.calc_err('phoneName')

    def calc_err(self, by):
        """Summarise 'err' grouped by column ``by``.

        Returns a frame with 'percentile50', 'percentile95' and
        'p50_p90_mean'; the latter is the mean of the 50th and 95th
        percentiles despite its name.
        """
        res = self.df.groupby(by)['err'].agg([percentile50, percentile95])
        res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
        return res

    @property
    def score(self):
        """Mean of the per-phone 'p50_p90_mean' values."""
        return self.phone_res['p50_p90_mean'].mean()

    @property
    def raw_data(self):
        """The joined row-level frame, including 'phone' and 'err'."""
        return self.df

    @property
    def err(self):
        """Error summary per phone (collectionName + '_' + phoneName)."""
        return self.phone_res

    @property
    def collection_err(self):
        """Error summary per collectionName."""
        return self.clc_res

    @property
    def phonename_err(self):
        """Error summary per phoneName."""
        return self.phonename_res

    def viz_map(self, collection, show_gt=True, show_bl=True):
        """Map the predictions of ``collection``.

        With ``show_gt`` the ground-truth track is added as an extra series
        (phoneName suffixed '_GT'); with ``show_bl`` the baseline track is
        added (suffix '_BL').
        """
        cols = ['collectionName', 'phoneName', 'latDeg', 'lngDeg']
        in_collection = self.df['collectionName'] == collection

        tmp = self.df.loc[in_collection, cols]
        tmp2 = self.df.loc[in_collection, ['collectionName', 'phoneName', 'latDeg_gt', 'lngDeg_gt']]
        tmp2 = tmp2.rename(columns={'latDeg_gt':'latDeg', 'lngDeg_gt':'lngDeg'})
        tmp2['phoneName'] = tmp2['phoneName'] + '_GT'
        # copy() so the suffix is written to an independent frame, not to a
        # slice of self.bl
        tmp3 = self.bl.loc[self.bl['collectionName'] == collection, cols].copy()
        tmp3['phoneName'] = tmp3['phoneName'] + '_BL'

        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
        # frames the same way.
        frames = [tmp]
        if show_gt:
            frames.append(tmp2)
        if show_bl:
            frames.append(tmp3)
        visualize_collection(pd.concat(frames), collection)

In [13]:
def get_data():
    """Load the baseline train/test locations, the sample submission and
    the ground truth.

    Returns the tuple (base_train, base_test, sample_sub, ground_truth).
    """
    file_names = (
        'baseline_locations_train.csv',
        'baseline_locations_test.csv',
        'sample_submission.csv',
    )
    base_train, base_test, sample_sub = (
        pd.read_csv(INPUT + '/' + file_name) for file_name in file_names
    )
    return base_train, base_test, sample_sub, get_ground_truth()

In [14]:
def update_baseline(df, rb):
    """Overwrite latDeg / lngDeg in ``df`` with the values found in ``rb``.

    Rows are matched on (millisSinceGpsEpoch, phone) with a left join, so
    rows of ``df`` without a counterpart in ``rb`` (or whose ``rb`` latitude
    is null) keep their own coordinates. A new frame is returned; ``df``
    itself is left unchanged.
    """
    join_keys = ['millisSinceGpsEpoch', 'phone']
    replacement = rb.rename(columns={'latDeg':'latDeg_rb', 'lngDeg':'lngDeg_rb'})
    merged = df.merge(replacement[join_keys + ['latDeg_rb', 'lngDeg_rb']], on=join_keys, how='left')

    # the latitude column alone decides which rows are replaced
    has_replacement = merged['latDeg_rb'].notnull()
    for coord in ('latDeg', 'lngDeg'):
        merged.loc[has_replacement, coord] = merged.loc[has_replacement, f'{coord}_rb']

    return merged.drop(columns=['latDeg_rb', 'lngDeg_rb'])

In [15]:
def err_map(df_, output_dir):
    """Save one error scatter plot per phone to ``output_dir``.

    ``df_`` is copied, inner-joined with INPUT + '/prep/ground_truth_train.csv'
    and given an 'err' column (haversine distance to the ground truth).
    For every value of 'phone' a latDeg / lngDeg scatter is written to
    '<output_dir>/<phone>.png', with hue and marker size driven by 'err'.
    """
    join_keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    scored = df_.copy()
    truth = pd.read_csv(INPUT + '/prep/ground_truth_train.csv')
    truth = truth.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    truth_cols = join_keys + ['latDeg_gt', 'lngDeg_gt', 'speedMps', 'courseDegree']
    scored = scored.merge(truth[truth_cols], on=join_keys, how='inner')
    scored['err'] = calc_haversine(scored['latDeg'], scored['lngDeg'],
                                   scored['latDeg_gt'], scored['lngDeg_gt'])

    os.makedirs(output_dir, exist_ok=True)
    for phone in scored['phone'].unique():
        phone_rows = scored[scored['phone']==phone].copy()
        fig, axes = plt.subplots(figsize=(30, 30))
        sns.scatterplot(x='latDeg', y='lngDeg', hue='err', data=phone_rows,
                        size='err', s=100, alpha=0.8, lw=0, ax=axes)
        axes.invert_xaxis()
        fig.suptitle(phone, fontsize=16)
        fig.savefig(f'{output_dir}/{phone}.png')
        # close the current figure so figures do not pile up across phones
        plt.close()

In [16]:
def err_transition(df_, output_dir):
    """Save one four-panel time-series figure per phone to ``output_dir``.

    ``df_`` is copied, inner-joined with INPUT + '/prep/ground_truth_train.csv'
    and extended with 'err' (haversine distance to the ground truth) and
    'millis_diff' (gap to the previous row of the same phone). The panels
    plot speedMps, courseDegree, millis_diff and err against
    millisSinceGpsEpoch; the figure goes to '<output_dir>/<phone>.png'.
    """
    join_keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    scored = df_.copy()
    truth = pd.read_csv(INPUT + '/prep/ground_truth_train.csv')
    truth = truth.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    truth_cols = join_keys + ['latDeg_gt', 'lngDeg_gt', 'speedMps', 'courseDegree']
    scored = scored.merge(truth[truth_cols], on=join_keys, how='inner')
    scored['err'] = calc_haversine(scored['latDeg'], scored['lngDeg'],
                                   scored['latDeg_gt'], scored['lngDeg_gt'])
    previous_millis = scored.groupby('phone')['millisSinceGpsEpoch'].shift(1)
    scored['millis_diff'] = scored['millisSinceGpsEpoch'] - previous_millis

    # one panel per column, top to bottom; the column name is also the legend label
    panel_columns = ('speedMps', 'courseDegree', 'millis_diff', 'err')

    os.makedirs(output_dir, exist_ok=True)
    for phone in scored['phone'].unique():
        phone_rows = scored[scored['phone']==phone].copy()
        fig, axes = plt.subplots(figsize=(40, 20), nrows=4,sharex=True)
        for ax, column in zip(axes, panel_columns):
            ax.plot(phone_rows['millisSinceGpsEpoch'], phone_rows[column], label=column)
            ax.legend(loc='upper right')
            ax.grid(color='g', linestyle=':', linewidth=0.3)

        fig.suptitle(phone, fontsize=16)
        fig.savefig(f'{output_dir}/{phone}.png')
        # close the current figure so figures do not pile up across phones
        plt.close()

# 自前再構成baselineに更新

In [17]:
train, test, sub, gt = get_data()
gt['phone'] = gt['collectionName'] + '_' + gt['phoneName']

# Apply the rebuilt baselines one after another; a later file overrides an
# earlier one wherever both provide the same (millisSinceGpsEpoch, phone).
for rebuilt_csv in (
    '../output/prep/baseline_g1_v001/result.csv',
    '../output/prep/baseline_g2_v003/result.csv',
    '../output/prep/baseline_g3_v003/result.csv',
    '../output/prep/baseline_g4_v002/result.csv',
):
    train = update_baseline(train, pd.read_csv(rebuilt_csv))

# Keep only the target collections, ordered by phone and then by time.
train = train.loc[train['collectionName'].isin(target)].copy()
train = train.sort_values(['phone', 'millisSinceGpsEpoch']).reset_index(drop=True)

In [18]:
# Attach the 'isSpeed0' flag to every train row (left join on phone + time).
train_sp0_pred = pd.read_csv(
    '../output/prep/speed0_pred_v001/train_sp0_pred.csv',
    usecols=['phone', 'millisSinceGpsEpoch', 'isSpeed0'],
)
train = train.merge(train_sp0_pred, on=['phone', 'millisSinceGpsEpoch'], how='left')

# Attach the relative-movement columns lat_diff / lng_diff the same way.
dp_train = pd.read_csv('../output/prep/rel_pred_v002/train_result.csv')
rel_cols = ['millisSinceGpsEpoch', 'phone', 'lat_diff', 'lng_diff']
train = train.merge(dp_train[rel_cols], on=['millisSinceGpsEpoch', 'phone'], how='left')

# Rows flagged isSpeed0 == 1 get a zero relative movement.
is_speed0 = train['isSpeed0'].eq(1)
train.loc[is_speed0, ['lat_diff', 'lng_diff']] = 0

In [19]:
def add_distance_diff(df):
    """Add previous/next-row coordinates and distances to ``df`` in place.

    New columns: latDeg_prev/next, lngDeg_prev/next, phone_prev/next and
    dist_prev/next (haversine distance to the neighbouring row). The
    coordinate and distance values are set to NaN wherever the neighbouring
    row belongs to a different phone. The neighbours are simply the adjacent
    rows, so ``df`` is expected to be ordered already. Returns the same
    ``df`` object.
    """
    neighbours = (('prev', 1), ('next', -1))

    # neighbouring values, shifted across the whole frame
    for column in ('latDeg', 'lngDeg', 'phone'):
        for suffix, periods in neighbours:
            df[f'{column}_{suffix}'] = df[column].shift(periods)

    # distance from each row to its neighbour
    for suffix, _ in neighbours:
        df[f'dist_{suffix}'] = calc_haversine(
            df['latDeg'], df['lngDeg'], df[f'latDeg_{suffix}'], df[f'lngDeg_{suffix}'])

    # a neighbour from another phone is not a real neighbour: blank it out
    for suffix, _ in neighbours:
        other_phone = df['phone'] != df[f'phone_{suffix}']
        df.loc[other_phone, [f'latDeg_{suffix}', f'lngDeg_{suffix}', f'dist_{suffix}']] = np.nan

    return df

In [20]:
# reject outlier
# A point is blanked when it lies more than ro_th away from BOTH its
# previous and its next point.
train = add_distance_diff(train)
is_isolated = train['dist_prev'].gt(ro_th) & train['dist_next'].gt(ro_th)
train.loc[is_isolated, ['latDeg', 'lngDeg']] = np.nan

In [21]:
def interpolate_remove_point(df):
    """Fill missing latDeg / lngDeg values by interpolating over time.

    Each phone is handled separately: its rows are ordered by
    millisSinceGpsEpoch and NaN coordinates are linearly interpolated
    against that timestamp. ``limit_area='inside'`` means only gaps enclosed
    by valid points are filled; leading and trailing NaNs stay NaN.

    Returns a new frame holding the rows of every phone in their original
    relative order, indexed by the original index values. ``df`` is not
    modified. An input without rows yields an empty DataFrame.
    """
    pieces = []
    for phone in df['phone'].unique():
        tmp = df[df['phone']==phone].copy()
        tmp = tmp.sort_values('millisSinceGpsEpoch')
        # keep the original index in an 'index' column while the timestamp
        # serves as the interpolation axis
        tmp = tmp.reset_index().set_index('millisSinceGpsEpoch')
        tmp[['latDeg', 'lngDeg']] = tmp[['latDeg', 'lngDeg']].interpolate(method='index', limit_area='inside')
        # restore the original row order and index
        tmp = tmp.sort_values('index')
        tmp = tmp.reset_index().set_index('index')
        pieces.append(tmp)

    if not pieces:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
    # frame on every iteration; concatenate once instead.
    return pd.concat(pieces).rename_axis(None)

In [22]:
# Fill the coordinates that the outlier step set to NaN, phone by phone.
train = interpolate_remove_point(train)

In [23]:
# Save the per-phone error maps before the trajectory-based rejection.
err_map(train, f'{OUTPUT}/train/map/before')

In [24]:
def make_th_data(df, window=60, sigma=2):
    """Build per-timestamp acceptance bounds for latDeg / lngDeg.

    Every row with a non-NaN latDeg acts as an anchor. Starting from the
    anchor's latDeg / lngDeg, the lat_diff / lng_diff columns are
    accumulated over the next ``window`` rows forward in time, and again
    over the ``window`` rows backward in time (with the sign flipped). Each
    timestamp thereby collects several predicted positions. Per timestamp,
    the values strictly between the 5th and 95th percentile are kept, and
    the bounds are their mean +/- ``sigma`` times their standard deviation.

    Parameters
    ----------
    df : DataFrame with columns phone, millisSinceGpsEpoch, latDeg, lngDeg,
        lat_diff and lng_diff. The caller in this file passes one phone at
        a time.
    window : number of rows covered from each anchor, the anchor included.
    sigma : width of the band in standard deviations.

    Returns
    -------
    DataFrame with columns millisSinceGpsEpoch, lat_uth, lat_lth, lng_uth
    and lng_lth. Bounds are NaN where fewer than two predictions survive
    the percentile trim. The frame is empty when no row has a valid latDeg.
    """
    def quantile_mean(s):
        lth, uth = np.percentile(s, [5, 95])
        return s[(s>lth) & (s<uth)].mean()
    def quantile_std(s):
        lth, uth = np.percentile(s, [5, 95])
        return s[(s>lth) & (s<uth)].std()

    track_cols = ['phone', 'millisSinceGpsEpoch', 'lat_base', 'lng_base']
    th_cols = ['lat_uth', 'lat_lth', 'lng_uth', 'lng_lth']

    def propagate(ordered, forward):
        """Return one predicted track per anchor row of ``ordered``."""
        tracks = []
        for idx in ordered.index:
            if np.isnan(ordered.at[idx, 'latDeg']):
                continue
            # `ordered` has a fresh 0..n-1 index, so label and position coincide
            tmp = ordered.iloc[idx : idx + window].copy()
            if forward:
                # the step INTO a row is the diff stored on the row before it
                tmp['lat_base'] = tmp['lat_diff'].shift(1)
                tmp['lng_base'] = tmp['lng_diff'].shift(1)
            else:
                # rows run backward in time: undo the diff stored on each row
                tmp['lat_base'] = -tmp['lat_diff']
                tmp['lng_base'] = -tmp['lng_diff']
            tmp.loc[idx, 'lat_base'] = tmp.loc[idx, 'latDeg']
            tmp.loc[idx, 'lng_base'] = tmp.loc[idx, 'lngDeg']
            tmp['lat_base'] = tmp['lat_base'].cumsum()
            tmp['lng_base'] = tmp['lng_base'].cumsum()
            tracks.append(tmp[track_cols])
        return tracks

    ascending = df.sort_values('millisSinceGpsEpoch').reset_index(drop=True)
    descending = ascending.sort_values('millisSinceGpsEpoch', ascending=False).reset_index(drop=True)
    tracks = propagate(ascending, True) + propagate(descending, False)

    if not tracks:
        # no anchor at all: nothing to derive bounds from
        empty = pd.DataFrame({'millisSinceGpsEpoch': df['millisSinceGpsEpoch'].iloc[:0]})
        for col in th_cols:
            empty[col] = np.nan
        return empty

    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame for every anchor; collect the pieces and concatenate once.
    output_df = pd.concat(tracks, ignore_index=True)

    output_df = output_df.groupby(['phone','millisSinceGpsEpoch']).agg({'lat_base':[quantile_mean, quantile_std], 'lng_base':[quantile_mean, quantile_std]}).reset_index()
    output_df.columns = ['phone', 'millisSinceGpsEpoch', 'lat_base_mean', 'lat_base_std', 'lng_base_mean', 'lng_base_std']

    output_df['lat_uth'] = output_df['lat_base_mean'] + (output_df['lat_base_std'] * sigma)
    output_df['lat_lth'] = output_df['lat_base_mean'] - (output_df['lat_base_std'] * sigma)
    output_df['lng_uth'] = output_df['lng_base_mean'] + (output_df['lng_base_std'] * sigma)
    output_df['lng_lth'] = output_df['lng_base_mean'] - (output_df['lng_base_std'] * sigma)

    return output_df[['millisSinceGpsEpoch'] + th_cols]

In [25]:
def reject_outlier_by_base_trajectory(df):
    """Blank coordinates that fall outside the bounds from ``make_th_data``.

    Each phone is handled separately: the bounds are computed, left-joined
    on millisSinceGpsEpoch, and latDeg / lngDeg are set to NaN for every row
    where either coordinate lies outside its [lth, uth] interval. Rows whose
    bounds are NaN are never rejected, because comparisons with NaN are
    False. The phone and its number of rejected rows are printed.

    Returns the per-phone frames stacked together, including the four bound
    columns. Each piece keeps the 0..n-1 index produced by the merge, so
    index values repeat across phones. An input without rows yields an empty
    DataFrame.
    """
    cleaned = []
    for phone in df['phone'].unique():
        tmp = df[df['phone']==phone].copy()
        th_data = make_th_data(tmp)
        tmp = tmp.merge(th_data, on=['millisSinceGpsEpoch'], how='left')
        outside = (
            (tmp['latDeg'] > tmp['lat_uth'])
            | (tmp['latDeg'] < tmp['lat_lth'])
            | (tmp['lngDeg'] > tmp['lng_uth'])
            | (tmp['lngDeg'] < tmp['lng_lth'])
        )
        reject_idx = tmp[outside].index
        tmp.loc[reject_idx, ['latDeg', 'lngDeg']] = np.nan
        cleaned.append(tmp)
        print(phone, len(reject_idx))

    if not cleaned:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; stack the pieces once.
    return pd.concat(cleaned)

In [26]:
# Blank points outside the per-timestamp bounds; prints "<phone> <rejected count>" per phone.
train = reject_outlier_by_base_trajectory(train)

2020-05-14-US-MTV-1_Pixel4 120
2020-05-14-US-MTV-1_Pixel4XLModded 148
2020-05-14-US-MTV-2_Pixel4 80
2020-05-14-US-MTV-2_Pixel4XLModded 59
2020-05-21-US-MTV-1_Pixel4 153
2020-05-21-US-MTV-2_Pixel4 104
2020-05-21-US-MTV-2_Pixel4XL 133
2020-05-29-US-MTV-1_Pixel4 123
2020-05-29-US-MTV-1_Pixel4XL 97
2020-05-29-US-MTV-1_Pixel4XLModded 120
2020-05-29-US-MTV-2_Pixel4 136
2020-05-29-US-MTV-2_Pixel4XL 150
2020-06-04-US-MTV-1_Pixel4 113
2020-06-04-US-MTV-1_Pixel4XL 116
2020-06-04-US-MTV-1_Pixel4XLModded 127
2020-06-05-US-MTV-1_Pixel4 235
2020-06-05-US-MTV-1_Pixel4XL 202
2020-06-05-US-MTV-1_Pixel4XLModded 125
2020-06-05-US-MTV-2_Pixel4 117
2020-06-05-US-MTV-2_Pixel4XL 129
2020-06-11-US-MTV-1_Pixel4 138
2020-06-11-US-MTV-1_Pixel4XL 105
2020-07-08-US-MTV-1_Pixel4 197
2020-07-08-US-MTV-1_Pixel4XL 103
2020-07-08-US-MTV-1_Pixel4XLModded 117
2020-07-17-US-MTV-1_Mi8 15
2020-07-17-US-MTV-2_Mi8 0
2020-08-03-US-MTV-1_Mi8 43
2020-08-03-US-MTV-1_Pixel4 168
2020-08-06-US-MTV-2_Mi8 1
2020-08-06-US-MTV-2_Pixel4 

In [27]:
# Save the per-phone error maps after the trajectory-based rejection.
err_map(train, f'{OUTPUT}/train/map/after')