# rel_pred_v002
相対座標予測 + oofにnullがないように

In [1]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)

In [2]:
import ipynb_path

def get_nb_name():
    nb_path = ipynb_path.get()
    nb_name = nb_path.rsplit('/',1)[1].replace('.ipynb','')
    return nb_name

In [3]:
# directory setting
nb_name = get_nb_name()
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = '../output/prep/' + nb_name
os.makedirs(OUTPUT, exist_ok=True)

In [4]:
area_labeling = pd.read_csv('../output/prep/area_labeling/result.csv')

g1 = list(area_labeling[area_labeling['g']==1]['collectionName'])
g2 = list(area_labeling[area_labeling['g']==2]['collectionName'])
g3 = list(area_labeling[area_labeling['g']==3]['collectionName'])
g4 = list(area_labeling[area_labeling['g']==4]['collectionName'])
g5 = list(area_labeling[area_labeling['g']==5]['collectionName'])

# utils

In [5]:
def get_train_score(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score

In [6]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

In [7]:
def visualize_trafic(df, center, zoom=9):
    fig = px.scatter_mapbox(df,
                            
                            # Here, plotly gets, (x,y) coordinates
                            lat="latDeg",
                            lon="lngDeg",
                            
                            #Here, plotly detects color of series
                            color="phoneName",
                            labels="phoneName",
                            
                            zoom=zoom,
                            center=center,
                            height=600,
                            width=800)
    fig.update_layout(mapbox_style='stamen-terrain')
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_layout(title_text="GPS trafic")
    fig.show()
    
def visualize_collection(df, collection):
    target_df = df[df['collectionName']==collection].copy()
    lat_center = target_df['latDeg'].mean()
    lng_center = target_df['lngDeg'].mean()
    center = {"lat":lat_center, "lon":lng_center}
    
    visualize_trafic(target_df, center)

In [8]:
# ground_truth
def get_ground_truth():
    p = pathlib.Path(INPUT)
    gt_files = list(p.glob('train/*/*/ground_truth.csv'))

    gts = []
    for gt_file in gt_files:
        gts.append(pd.read_csv(gt_file))
    ground_truth = pd.concat(gts)

    return ground_truth

In [9]:
def percentile50(x):
    return np.percentile(x, 50)
def percentile95(x):
    return np.percentile(x, 95)

In [10]:
class train_result:
    def __init__(self, df):
        self.df = df
        self.gt = get_ground_truth()
        self.bl = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
        
        self.gt = self.gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
        self.df = self.df.merge(self.gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
        self.df['phone'] = self.df['collectionName'] + '_' + self.df['phoneName']
        self.df['err'] =  calc_haversine(self.df['latDeg_gt'], self.df['lngDeg_gt'], self.df['latDeg'], self.df['lngDeg'])
        
        self.phone_res = self.calc_err('phone')
        self.clc_res = self.calc_err('collectionName')
        self.phonename_res = self.calc_err('phoneName')
        
    def calc_err(self, by):
        res = self.df.groupby(by)['err'].agg([percentile50, percentile95])
        res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
        return res
    
    @property
    def score(self):
        return self.phone_res['p50_p90_mean'].mean()
    @property
    def raw_data(self):
        return self.df
    @property
    def err(self):
        return self.phone_res
    @property
    def collection_err(self):
        return self.clc_res
    @property
    def phonename_err(self):
        return self.phonename_res
    
    def viz_map(self, collection, show_gt=True, show_bl=True):
        tmp = self.df[self.df['collectionName']==collection][['collectionName', 'phoneName', 'latDeg', 'lngDeg']]
        tmp2 = self.df[self.df['collectionName']==collection][['collectionName', 'phoneName', 'latDeg_gt', 'lngDeg_gt']]
        tmp2 = tmp2.rename(columns={'latDeg_gt':'latDeg', 'lngDeg_gt':'lngDeg'})
        tmp2['phoneName'] = tmp2['phoneName'] + '_GT'
        tmp3 = self.bl[self.bl['collectionName']==collection][['collectionName', 'phoneName', 'latDeg', 'lngDeg']]
        tmp3['phoneName'] = tmp3['phoneName'] + '_BL'
        
        if show_gt:
            tmp = tmp.append(tmp2)
        if show_bl:
            tmp = tmp.append(tmp3)
        visualize_collection(tmp, collection)

In [11]:
def get_data():
    base_train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
    base_test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')
    sample_sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv')
    ground_truth = get_ground_truth()
    return base_train, base_test, sample_sub, ground_truth

In [12]:
def add_features(df):
    for c,i in itertools.product(['millisSinceGpsEpoch'], [1,2,3,4,5,-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        df[col] = df[c].shift(i)
        df[col+'_diff'] = df[c] - df[col]
        df.loc[df['phone']!=df['phone'].shift(i), [col, col+'_diff']] = np.nan
    return df

In [13]:
def add_sensor_features(df, accel, gyro, mag, ori):
    # phoneを追加
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    accel['phone'] = accel['collectionName'] + '_' + accel['phoneName']
    gyro['phone'] = gyro['collectionName'] + '_' + gyro['phoneName']
    mag['phone'] = mag['collectionName'] + '_' + mag['phoneName']
    ori['phone'] = ori['collectionName'] + '_' + ori['phoneName']
    
     # 一定の値しか入っていないphoneを除外しておく
    ori = ori[~ori['phone'].isin(['2021-04-29-US-MTV-1_SamsungS20Ultra', '2021-04-28-US-MTV-1_SamsungS20Ultra', '2021-04-28-US-SJC-1_SamsungS20Ultra', '2021-04-29-US-SJC-2_SamsungS20Ultra',
                                 '2021-04-28-US-MTV-2_SamsungS20Ultra', '2021-04-29-US-SJC-3_SamsungS20Ultra', '2021-04-29-US-MTV-2_SamsungS20Ultra'])]
    
    
    # phonenameをラベルエンコーディング
    phoneName_map = {'Pixel4':1, 'Pixel4XLModded':2, 'Pixel4XL':3, 'Mi8':4, 'Pixel4Modded':5, 'Pixel5':6, 'SamsungS20Ultra':7}
    df['phoneName_le'] = df['phoneName'].map(phoneName_map)
    
    # utc -> gps
    accel['millisSinceGpsEpoch'] = accel['utcTimeMillis'] - 315964800000 + 18000
    gyro['millisSinceGpsEpoch'] = gyro['utcTimeMillis'] - 315964800000 + 18000
    mag['millisSinceGpsEpoch'] = mag['utcTimeMillis'] - 315964800000 + 18000
    ori['millisSinceGpsEpoch'] = ori['utcTimeMillis'] - 315964800000 + 18000
    
    # resampling追加
    df['secondSinceGpsEpoch'] = df['millisSinceGpsEpoch'] // 1000
    accel['secondSinceGpsEpoch'] = accel['millisSinceGpsEpoch'] // 1000
    gyro['secondSinceGpsEpoch'] = gyro['millisSinceGpsEpoch'] // 1000
    mag['secondSinceGpsEpoch'] = mag['millisSinceGpsEpoch'] // 1000
    ori['secondSinceGpsEpoch'] = ori['millisSinceGpsEpoch'] // 1000
    
    # clipping
    accel[['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2']] = accel.groupby('phone')['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    gyro[['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec']] = gyro.groupby('phone')['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    mag[['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT']] = mag.groupby('phone')['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    
    accel = accel.groupby(['phone', 'secondSinceGpsEpoch'])['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2'].agg(['mean', 'std']).reset_index()
    accel.columns = ['phone', 'secondSinceGpsEpoch', 'UncalAccelXMps2_mean', 'UncalAccelXMps2_std', 'UncalAccelYMps2_mean', 'UncalAccelYMps2_std', 'UncalAccelZMps2_mean', 'UncalAccelZMps2_std']
    gyro = gyro.groupby(['phone', 'secondSinceGpsEpoch'])['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec'].agg(['mean', 'std']).reset_index()
    gyro.columns = ['phone', 'secondSinceGpsEpoch', 'UncalGyroXRadPerSec_mean', 'UncalGyroXRadPerSec_std', 'UncalGyroYRadPerSec_mean', 'UncalGyroYRadPerSec_std', 'UncalGyroZRadPerSec_mean', 'UncalGyroZRadPerSec_std' ]
    mag = mag.groupby(['phone', 'secondSinceGpsEpoch'])['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT'].agg(['mean', 'std']).reset_index()
    mag.columns = ['phone', 'secondSinceGpsEpoch', 'UncalMagXMicroT_mean', 'UncalMagXMicroT_std', 'UncalMagYMicroT_mean', 'UncalMagYMicroT_std', 'UncalMagZMicroT_mean', 'UncalMagZMicroT_std']
    ori = ori.groupby(['phone', 'secondSinceGpsEpoch'])['yawDeg', 'rollDeg', 'pitchDeg'].agg(['mean', 'std']).reset_index()
    ori.columns = ['phone', 'secondSinceGpsEpoch', 'yawDeg_mean', 'yawDeg_std', 'rollDeg_mean', 'rollDeg_std', 'pitchDeg_mean', 'pitchDeg_std']
    
    
    # shift特徴量
    for c, i in itertools.product(['UncalAccelXMps2_mean', 'UncalAccelXMps2_std', 'UncalAccelYMps2_mean', 'UncalAccelYMps2_std', 'UncalAccelZMps2_mean', 'UncalAccelZMps2_std'], [1,2,3,4,5-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        accel[col] = accel[c].shift(i)
        accel[col+'_diff'] = accel[c] - accel[col]
        accel.loc[accel['phone']!=accel['phone'].shift(i), [col, col+'_diff']] = np.nan
    for c, i in itertools.product(['UncalGyroXRadPerSec_mean', 'UncalGyroXRadPerSec_std', 'UncalGyroYRadPerSec_mean', 'UncalGyroYRadPerSec_std', 'UncalGyroZRadPerSec_mean', 'UncalGyroZRadPerSec_std'], [1,2,3,4,5-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        gyro[col] = gyro[c].shift(i)
        gyro[col+'_diff'] = gyro[c] - gyro[col]
        gyro.loc[gyro['phone']!=gyro['phone'].shift(i), [col, col+'_diff']] = np.nan
    for c, i in itertools.product(['UncalMagXMicroT_mean', 'UncalMagXMicroT_std', 'UncalMagYMicroT_mean', 'UncalMagYMicroT_std', 'UncalMagZMicroT_mean', 'UncalMagZMicroT_std'], [1,2,3,4,5-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        mag[col] = mag[c].shift(i)
        mag[col+'_diff'] = mag[c] - mag[col]
        mag.loc[mag['phone']!=mag['phone'].shift(i), [col, col+'_diff']] = np.nan
    for c, i in itertools.product(['yawDeg_mean', 'yawDeg_std', 'rollDeg_mean', 'rollDeg_std', 'pitchDeg_mean', 'pitchDeg_std'], [1,2,3,-1,-2,-3]):
        col = c+ '_s' + str(i)
        ori[col] = ori[c].shift(i)
        ori[col+'_diff'] = ori[c] - ori[col]
        ori.loc[ori['phone']!=ori['phone'].shift(i), [col, col+'_diff']] = np.nan        
    
    df = df.merge(accel, on=['phone', 'secondSinceGpsEpoch'], how='left')
    df = df.merge(gyro, on=['phone', 'secondSinceGpsEpoch'], how='left')
    df = df.merge(mag, on=['phone', 'secondSinceGpsEpoch'], how='left')
    df = df.merge(ori, on=['phone', 'secondSinceGpsEpoch'], how='left')
    
    df.drop(['secondSinceGpsEpoch'], axis=1, inplace=True)
    
    return df

In [14]:
train, test, sub, gt = get_data()
accel_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalAccel.csv')
gyro_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalGyro.csv')
mag_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalMag.csv')
ori_train = pd.read_csv(INPUT + '/prep/gnss/train/OrientationDeg.csv')
accel_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalAccel.csv')
gyro_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalGyro.csv')
mag_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalMag.csv')
ori_test = pd.read_csv(INPUT + '/prep/gnss/test/OrientationDeg.csv')

train = train.merge(gt[['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'speedMps']], on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
train = add_features(train)
test = add_features(test)
train = add_sensor_features(train, accel_train, gyro_train, mag_train, ori_train)
test = add_sensor_features(test, accel_test, gyro_test, mag_test, ori_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

# doppler追加

In [15]:
def add_dop_features(df):
    for c,i in itertools.product(['d', 'roll_d', 'lat_rel', 'lng_rel', 'lat_rel_roll', 'lng_rel_roll'], [1,2,3,4,5,-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        df[col] = df[c].shift(i)
        df[col+'_diff'] = df[c] - df[col]
        df.loc[df['phone']!=df['phone'].shift(i), [col, col+'_diff']] = np.nan
    return df

In [16]:
dp_train = pd.read_csv('../output/prep/doppler_processing_v004/train_result.csv')
dp_test = pd.read_csv('../output/prep/doppler_processing_v004/test_result.csv')

In [17]:
train = train.merge(dp_train, on=['phone', 'millisSinceGpsEpoch'], how='left')
test = test.merge(dp_test, on=['phone', 'millisSinceGpsEpoch'], how='left')

In [18]:
train = add_dop_features(train)
test = add_dop_features(test)

# degree追加

In [19]:
deg_train = pd.read_csv(f'../output/prep/degree_pred_v004/train_degree_pred.csv')
deg_test = pd.read_csv(f'../output/prep/degree_pred_v004/test_degree_pred.csv')

In [20]:
train = train.merge(deg_train[['phone', 'millisSinceGpsEpoch', 'calc_deg']], on=['phone', 'millisSinceGpsEpoch'], how='left')
test = test.merge(deg_test[['phone', 'millisSinceGpsEpoch', 'calc_deg']], on=['phone', 'millisSinceGpsEpoch'], how='left')

In [21]:
def add_deg_features(df):
    
    for c,i in itertools.product(['calc_deg'], [1,2,3,4,5,-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        df[col] = df[c].shift(i)
        df[col+'_diff'] = df[c] - df[col]
        df.loc[df['phone']!=df['phone'].shift(i), [col, col+'_diff']] = np.nan
    
    for c in ['calc_deg']:
        df[c+'_s1_diff_sum'] = df[c+'_s1_diff'].fillna(0) + df[c+'_s-1_diff'].fillna(0)
        df[c+'_s2_diff_sum'] = df[c+'_s1_diff_sum'] + df[c+'_s2_diff'].fillna(0) + df[c+'_s-2_diff'].fillna(0)
        df[c+'_s3_diff_sum'] = df[c+'_s2_diff_sum'] + df[c+'_s3_diff'].fillna(0) + df[c+'_s-3_diff'].fillna(0)
        df[c+'_s4_diff_sum'] = df[c+'_s3_diff_sum'] + df[c+'_s4_diff'].fillna(0) + df[c+'_s-4_diff'].fillna(0)
        df[c+'_s5_diff_sum'] = df[c+'_s4_diff_sum'] + df[c+'_s5_diff'].fillna(0) + df[c+'_s-5_diff'].fillna(0)
        
    return df

In [22]:
train = add_deg_features(train)
test = add_deg_features(test)

# ラベル生成

In [23]:
gt['phone'] = gt['collectionName'] + '_' + gt['phoneName']

In [24]:
gt['lat_diff'] = gt.groupby('phone')['latDeg'].shift(-1) - gt['latDeg']
gt['lng_diff'] = gt.groupby('phone')['lngDeg'].shift(-1) - gt['lngDeg']

In [25]:
train = train.merge(gt[['phone', 'millisSinceGpsEpoch', 'lat_diff', 'lng_diff']], on=['phone', 'millisSinceGpsEpoch'], how='left')

# Mi8除去

In [26]:
train = train[train['phoneName']!='Mi8'].copy()
test = test[test['phoneName']!='Mi8'].copy()

In [27]:
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [28]:
oof_org = train.copy()

In [29]:
train = train.dropna(subset=['lat_diff'])
train = train.reset_index(drop=True)

In [30]:
oof_org

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,speedMps,millisSinceGpsEpoch_s1,millisSinceGpsEpoch_s1_diff,...,calc_deg_s-4_diff,calc_deg_s-5,calc_deg_s-5_diff,calc_deg_s1_diff_sum,calc_deg_s2_diff_sum,calc_deg_s3_diff_sum,calc_deg_s4_diff_sum,calc_deg_s5_diff_sum,lat_diff,lng_diff
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4,0.0,,,...,0.343130,0.594736,0.386853,0.194152,0.470124,0.805878,1.149008,1.535861,2.999982e-10,-1.700002e-09
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4,0.0,1.273529e+12,1000.0,...,0.192702,0.592949,0.194488,-0.112330,0.029272,0.178250,0.370952,0.565440,1.040000e-08,3.400004e-09
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4,0.0,1.273529e+12,1000.0,...,0.112667,0.619305,0.086312,-0.022040,-0.230856,-0.119976,-0.007309,0.079003,1.140000e-08,1.600000e-09
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4,0.0,1.273529e+12,1000.0,...,0.026531,0.661707,-0.015872,-0.052405,-0.142908,-0.425775,-0.399245,-0.415116,9.899992e-09,-1.600000e-09
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423579,-122.094114,-34.49,2020-05-14-US-MTV-1_Pixel4,0.0,1.273529e+12,1000.0,...,-0.023248,0.622884,0.015576,0.036347,0.014700,-0.115123,-0.481501,-0.465925,1.350000e-08,-3.400004e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117168,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760315000,37.334460,-121.899600,-8.09,2021-04-29-US-SJC-2_SamsungS20Ultra,0.0,1.303760e+12,1000.0,...,0.012499,,,-0.079032,-0.245655,-0.492152,-0.886639,-1.440911,5.000004e-09,0.000000e+00
117169,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760316000,37.334472,-121.899583,-7.59,2021-04-29-US-SJC-2_SamsungS20Ultra,0.0,1.303760e+12,1000.0,...,,,,0.033955,0.073653,0.009144,-0.232514,-0.608608,1.700002e-09,-1.600000e-09
117170,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760317000,37.334491,-121.899597,-8.35,2021-04-29-US-SJC-2_SamsungS20Ultra,0.0,1.303760e+12,1000.0,...,,,,0.050821,0.118980,0.098671,-0.012293,-0.257013,-4.330000e-08,3.300002e-09
117171,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760318000,37.334495,-121.899583,-8.73,2021-04-29-US-SJC-2_SamsungS20Ultra,0.0,1.303760e+12,1000.0,...,,,,-0.067437,-0.124383,-0.150437,-0.224630,-0.389477,-3.170000e-08,0.000000e+00


# 学習

In [31]:
target1 = 'lat_diff'
target2 = 'lng_diff'
not_use_cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM',
                'phone', 'speedMps', target1, target2]

features = [c for c in train.columns if c not in not_use_cols]

opt_params = {'objective': 'regression', 
              'metric': 'rmse', 
              'learning_rate': 0.1, 
              'seed': 8, 
              'feature_pre_filter': False, 
              'lambda_l1': 0.0, 
              'lambda_l2': 0.0, 
              'num_leaves': 31, 
              'feature_fraction': 0.852, 
              'bagging_fraction': 1.0, 
              'bagging_freq': 0, 
              'min_child_samples': 10, 
              'num_iterations': 20000, 
              'early_stopping_round': 100}

In [32]:
collections = train['collectionName'].unique()
imp = pd.DataFrame()
n = len(collections)
oof = oof_org[['phone', 'millisSinceGpsEpoch']].copy()

for target in [target1, target2]:
    print(target)
    test_preds = np.zeros(len(test))
    oof1 = pd.DataFrame()
    
    for collection in collections:
        print('valid : ', collection)
        tr_idx = train[train['collectionName']!=collection].index
        vl_idx = train[train['collectionName']==collection].index
        oof_idx = oof_org[oof_org['collectionName']==collection].index
        tr_x, tr_y = train[features].iloc[tr_idx], train[target].iloc[tr_idx]
        vl_x, vl_y = train[features].iloc[vl_idx], train[target].iloc[vl_idx]
        oof_x, oof_y = oof_org[features].iloc[oof_idx], oof_org[target].iloc[oof_idx]
        tr_data = lgb.Dataset(tr_x, label=tr_y)
        vl_data = lgb.Dataset(vl_x, label=vl_y)

        model = lgb.train(opt_params, tr_data, valid_sets=[tr_data, vl_data],
                          num_boost_round=20000, early_stopping_rounds=100, verbose_eval=100)
        oof_pred = model.predict(oof_x, num_iteration=model.best_iteration)

        oof_tmp = oof_org.iloc[oof_idx].copy()
        oof_tmp[target] = oof_pred
        oof1 = oof1.append(oof_tmp)

        imp_tmp = pd.DataFrame()
        imp_tmp['feature'] = model.feature_name()
        imp_tmp['importance'] = model.feature_importance()
        imp_tmp['valid_collection'] = collection
        imp_tmp['target'] = target
        imp = imp.append(imp_tmp)

        pred = model.predict(test[features], num_iteration=model.best_iteration)
        test_preds += pred / n
    
    test[target] = test_preds
    oof = oof.merge(oof1[['phone', 'millisSinceGpsEpoch', target]], on=['phone', 'millisSinceGpsEpoch'], how='left')

lat_diff
valid :  2020-05-14-US-MTV-1




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 143155
[LightGBM] [Info] Number of data points in the train set: 113624, number of used features: 563
[LightGBM] [Info] Start training from score 0.000006
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 3.25731e-06	valid_1's rmse: 1.615e-06
[200]	training's rmse: 1.89626e-06	valid_1's rmse: 1.53448e-06
[300]	training's rmse: 1.47935e-06	valid_1's rmse: 1.49922e-06
[400]	training's rmse: 1.29516e-06	valid_1's rmse: 1.48315e-06
[500]	training's rmse: 1.17713e-06	valid_1's rmse: 1.45787e-06
[600]	training's rmse: 1.09078e-06	valid_1's rmse: 1.43246e-06
[700]	training's rmse: 1.02029e-06	valid_1's rmse: 1.42501e-06
[800]	training's rmse: 9.58986e-07	valid_1's rmse: 1.41591e-06
[900]	training's rmse: 9.0663e-07	valid_1's rmse: 1.41066e-06
[1000]	training's rmse: 8.59416e-07	valid_1's rmse: 1.40041e-06
[1100]	training's rmse: 8.16907e-07	valid_1's rmse: 1.38885e-06
[120

# 結果出力

In [33]:
oof

Unnamed: 0,phone,millisSinceGpsEpoch,lat_diff,lng_diff
0,2020-05-14-US-MTV-1_Pixel4,1273529463442,1.032035e-07,-7.445366e-07
1,2020-05-14-US-MTV-1_Pixel4,1273529464442,9.811197e-08,-2.181355e-07
2,2020-05-14-US-MTV-1_Pixel4,1273529465442,2.593989e-07,-2.530208e-07
3,2020-05-14-US-MTV-1_Pixel4,1273529466442,-5.922152e-08,-2.211691e-07
4,2020-05-14-US-MTV-1_Pixel4,1273529467442,-9.840499e-08,-2.177723e-07
...,...,...,...,...
117168,2021-04-29-US-SJC-2_SamsungS20Ultra,1303760315000,3.977076e-08,1.845571e-07
117169,2021-04-29-US-SJC-2_SamsungS20Ultra,1303760316000,-8.881692e-07,1.405374e-07
117170,2021-04-29-US-SJC-2_SamsungS20Ultra,1303760317000,-7.167539e-07,-3.277856e-07
117171,2021-04-29-US-SJC-2_SamsungS20Ultra,1303760318000,-4.008742e-06,3.763733e-06


In [34]:
test

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,millisSinceGpsEpoch_s1,millisSinceGpsEpoch_s1_diff,millisSinceGpsEpoch_s2,...,calc_deg_s-4_diff,calc_deg_s-5,calc_deg_s-5_diff,calc_deg_s1_diff_sum,calc_deg_s2_diff_sum,calc_deg_s3_diff_sum,calc_deg_s4_diff_sum,calc_deg_s5_diff_sum,lat_diff,lng_diff
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416628,-122.082053,-30.69,2020-05-15-US-MTV-1_Pixel4,,,,...,-3.185758,98.792953,-3.274840,-2.446308,-6.339783,-10.044898,-13.230656,-16.505496,-8.584776e-08,-9.597964e-07
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416646,-122.082040,-31.76,2020-05-15-US-MTV-1_Pixel4,1.273609e+12,1000.0,,...,-0.828532,98.735423,-0.771001,0.999141,-0.259665,-0.999116,-1.827648,-2.598649,2.184587e-08,-1.939731e-08
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416653,-122.082039,-31.65,2020-05-15-US-MTV-1_Pixel4,1.273609e+12,1000.0,1.273609e+12,...,0.676165,98.413595,0.997993,1.635527,6.236719,6.855353,7.531519,8.529512,3.618468e-08,2.008767e-08
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416607,-122.082063,-31.52,2020-05-15-US-MTV-1_Pixel4,1.273609e+12,1000.0,1.273609e+12,...,0.809633,98.290842,0.932386,0.330996,2.020078,6.212998,7.022631,7.955017,3.254796e-08,3.768812e-08
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416609,-122.082073,-28.95,2020-05-15-US-MTV-1_Pixel4,1.273609e+12,1000.0,1.273609e+12,...,0.413029,98.709588,-0.005717,-0.608439,-1.347707,-0.317980,3.280807,3.275091,4.403865e-08,3.790773e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85674,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763185000,37.334539,-121.899383,-8.39,2021-04-29-US-SJC-3_SamsungS20Ultra,1.303763e+12,1000.0,1.303763e+12,...,-0.133370,,,0.217078,1.212571,3.168931,5.896757,8.992766,3.589563e-08,8.782139e-09
85675,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763186000,37.334545,-121.899380,-7.36,2021-04-29-US-SJC-3_SamsungS20Ultra,1.303763e+12,1000.0,1.303763e+12,...,,,,0.417004,1.542207,3.511334,6.083578,9.325776,8.748485e-08,1.785777e-07
85676,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763187000,37.334551,-121.899371,-4.08,2021-04-29-US-SJC-3_SamsungS20Ultra,1.303763e+12,1000.0,1.303763e+12,...,,,,0.074116,0.630747,1.573827,3.259320,5.795563,1.943761e-07,4.080996e-07
85677,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763188000,37.334540,-121.899371,-5.70,2021-04-29-US-SJC-3_SamsungS20Ultra,1.303763e+12,1000.0,1.303763e+12,...,,,,-0.008605,-0.154725,0.080157,0.913120,2.488495,-7.897928e-07,-1.572510e-06


In [35]:
cols = ['phone', 'millisSinceGpsEpoch', 'lat_diff', 'lng_diff']
oof[cols].to_csv(OUTPUT+'/train_result.csv', index=False)
test[cols].to_csv(OUTPUT+'/test_result.csv', index=False)