# exp057
deg計算にセンサ＋baselineの特徴量加えて機械学習で予測(バミューダのみ)

In [1]:
import os
import pandas as pd
import numpy as np
import ipynb_path
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
import plotly
import plotly.express as px
%matplotlib inline
pd.set_option('display.max_rows', 200)
from math import * 
import warnings
warnings.simplefilter('ignore')
import pathlib
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score

In [2]:
def get_nb_name():
    nb_path = ipynb_path.get()
    nb_name = nb_path.rsplit('/',1)[1].replace('.ipynb','')
    return nb_name

In [3]:
# ground_truth
def get_ground_truth():
    p = pathlib.Path(INPUT)
    gt_files = list(p.glob('train/*/*/ground_truth.csv'))

    gts = []
    for gt_file in gt_files:
        gts.append(pd.read_csv(gt_file))
    ground_truth = pd.concat(gts)

    return ground_truth

In [4]:
def get_data():
    base_train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
    base_test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')
    sample_sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv')
    ground_truth = get_ground_truth()
    return base_train, base_test, sample_sub, ground_truth

In [5]:
def visualize_trafic(df, center, zoom=9):
    fig = px.scatter_mapbox(df,
                            
                            # Here, plotly gets, (x,y) coordinates
                            lat="latDeg",
                            lon="lngDeg",
                            
                            #Here, plotly detects color of series
                            color="phoneName",
                            labels="phoneName",
                            
                            zoom=zoom,
                            center=center,
                            height=1000,
                            width=2000)
    fig.update_layout(mapbox_style='stamen-terrain')
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_layout(title_text="GPS trafic")
    fig.show()

In [6]:
def visualize_collection(df, collection):
    target_df = df[df['collectionName']==collection].copy()
    lat_center = target_df['latDeg'].mean()
    lng_center = target_df['lngDeg'].mean()
    center = {"lat":lat_center, "lon":lng_center}
    
    visualize_trafic(target_df, center)

In [7]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

In [8]:
# lowpass filter

from scipy.signal import butter, lfilter

def butter_lowpass(cutoff, fs, order=5):
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return b, a

def butter_lowpass_filter(data, cutoff, fs, order=5):
    b, a = butter_lowpass(cutoff, fs, order=order)
    y = lfilter(b, a, data)
    return y

order = 3
fs = 50.0
cutoff = 2.5

In [9]:
# Offset correction
# refarence https://github.com/J-ROCKET-BOY/SS-Fitting

def SS_fit(data) : 

    x = data[:,[0]]
    y = data[:,[1]]
    z = data[:,[2]]

    data_len = len(x)
    
    x2 = np.power(x,2)
    y2 = np.power(y,2)
    z2 = np.power(z,2)

    r1 = -x*(x2+y2+z2)
    r2= -y*(x2+y2+z2)
    r3 = -z*(x2+y2+z2)
    r4 = -(x2+y2+z2)

    left = np.array([[np.sum(x2),np.sum(x*y),np.sum(x*z),np.sum(x)],
                     [np.sum(x*y),np.sum(y2),np.sum(y*z),np.sum(y)],
                     [np.sum(x*z),np.sum(y*z),np.sum(z2),np.sum(z)],
                     [np.sum(x), np.sum(y), np.sum(z), data_len]])
    
    right = np.array([np.sum(r1),
                      np.sum(r2),
                      np.sum(r3),
                      np.sum(r4)])
    
    si = np.dot(np.linalg.inv(left),right)

    x0 = (-1/2)* si[0]
    y0 = (-1/2)* si[1]
    z0 = (-1/2)* si[2]
    
    return np.array([x0,y0,z0])

In [10]:
# Vincenty's formulae
# refarence https://qiita.com/r-fuji/items/99ca549b963cedc106ab

def vincenty_inverse(lat1, lon1, lat2, lon2):

    # Not advanced
    if isclose(lat1, lat2) and isclose(lon1, lon2):
        return False
    
    # WGS84
    a = 6378137.0
    ƒ = 1 / 298.257223563
    b = (1 - ƒ) * a

    lat_1 = atan((1 - ƒ) * tan(radians(lat1)))
    lat_2 = atan((1 - ƒ) * tan(radians(lat2)))
    
    lon_diff = radians(lon2) - radians(lon1)
    λ = lon_diff

    for i in range(1000):
        sinλ = sin(λ)
        cosλ = cos(λ)
        sinσ = sqrt((cos(lat_2) * sinλ) ** 2 + (cos(lat_1) * sin(lat_2) - sin(lat_1) * cos(lat_2) * cosλ) ** 2)
        cosσ = sin(lat_1) * sin(lat_2) + cos(lat_1) * cos(lat_2) * cosλ
        σ = atan2(sinσ, cosσ)
        sinα = cos(lat_1) * cos(lat_2) * sinλ / sinσ
        cos2α = 1 - sinα ** 2
        cos2σm = cosσ - 2 * sin(lat_1) * sin(lat_2) / cos2α
        C = ƒ / 16 * cos2α * (4 + ƒ * (4 - 3 * cos2α))
        λʹ = λ
        λ = lon_diff + (1 - C) * ƒ * sinα * (σ + C * sinσ * (cos2σm + C * cosσ * (-1 + 2 * cos2σm ** 2)))
        
        if abs(λ - λʹ) <= 1e-12:
            break
    else:
        return None

    α = atan2(cos(lat_2) * sinλ, cos(lat_1) * sin(lat_2) - sin(lat_1) * cos(lat_2) * cosλ)

    if α < 0:
        α = α + pi * 2

    return degrees(α)

In [11]:
def calc3(row):
    deg = - degrees(atan2(-1*row['calc2'],row['calc1']))
    if deg < 0:
        deg += 360
    return deg 

In [12]:
def calc_degree_by_gt(df):
    phones = df['phone'].unique()
    df['deg'] = np.nan
    
    for idx in range(len(df)-1):
        if df.at[idx, 'phone'] != df.at[idx+1, 'phone']:
            continue
            
        lat = df.at[idx, 'latDeg_gt']
        lng = df.at[idx, 'lngDeg_gt']
        lat_next = df.at[idx+1, 'latDeg_gt']
        lng_next = df.at[idx+1, 'lngDeg_gt']
        
        res = vincenty_inverse(lat, lng, lat_next, lng_next)
        if res:
            df.at[idx, 'deg'] = res
    
    return df

In [13]:
def calc_degree_by_imu(df, accel, mag):
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    accel['phone'] = accel['collectionName'] + '_' + accel['phoneName']
    mag['phone'] = mag['collectionName'] + '_' + mag['phoneName']
    
    # utc -> gps
    accel['millisSinceGpsEpoch'] = accel['utcTimeMillis'] - 315964800000 + 18000
    mag['millisSinceGpsEpoch'] = mag['utcTimeMillis'] - 315964800000 + 18000
    
    # resampling追加
    df['secondSinceGpsEpoch'] = df['millisSinceGpsEpoch'] // 1000
    accel['secondSinceGpsEpoch'] = accel['millisSinceGpsEpoch'] // 1000
    mag['secondSinceGpsEpoch'] = mag['millisSinceGpsEpoch'] // 1000
    
    # clipping
    accel[['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2']] = accel.groupby('phone')['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    mag[['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT']] = mag.groupby('phone')['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    
    #     acce filtering and smooting
    accel["global_x"] = accel["UncalAccelZMps2"]
    accel["global_y"] = accel["UncalAccelXMps2"]
    accel["global_z"] = accel["UncalAccelYMps2"]
    accel["x_f"] = butter_lowpass_filter(accel["global_x"], cutoff, fs, order)
    accel["y_f"] = butter_lowpass_filter(accel["global_y"], cutoff, fs, order)
    accel["z_f"] = butter_lowpass_filter(accel["global_z"], cutoff, fs, order)
    
    mag["global_mx"] = mag["UncalMagZMicroT"]
    mag["global_my"] = mag["UncalMagYMicroT"]
    mag["global_mz"] = mag["UncalMagXMicroT"]

    
    output_df = pd.DataFrame()
    for phone in mag['phone'].unique():
        df_tmp = df[df['phone']==phone]
        accel_tmp = accel[accel['phone']==phone].copy()
        mag_tmp = mag[mag['phone']==phone].copy()
    
        smooth_range = 1000
        accel_tmp["x_f"] = accel_tmp.groupby('phone')["x_f"].rolling(smooth_range, center=True, min_periods=1).mean().values
        accel_tmp["y_f"] = accel_tmp.groupby('phone')["y_f"].rolling(smooth_range, center=True, min_periods=1).mean().values
        accel_tmp["z_f"] = accel_tmp.groupby('phone')["z_f"].rolling(smooth_range, center=True, min_periods=1).mean().values

        mag_tmp["global_mx"] = mag_tmp.groupby('phone')["global_mx"].rolling(smooth_range,  min_periods=1).mean().values
        mag_tmp["global_my"] = mag_tmp.groupby('phone')["global_mz"].rolling(smooth_range,  min_periods=1).mean().values
        mag_tmp["global_mz"] = mag_tmp.groupby('phone')["global_my"].rolling(smooth_range,  min_periods=1).mean().values

        offset = SS_fit(np.array(mag_tmp[["global_mx","global_my","global_mz"]]))
        mag_tmp["global_mx"] = (mag_tmp["global_mx"] - offset[0])*-1
        mag_tmp["global_my"] = mag_tmp["global_my"] - offset[1]
        mag_tmp["global_mz"] = mag_tmp["global_mz"] - offset[2]
        
        accel_tmp = accel_tmp.groupby(['phone', 'secondSinceGpsEpoch'])['x_f', 'y_f', 'z_f'].mean().reset_index()
        accel_tmp.columns = ['phone', 'secondSinceGpsEpoch', 'x_f', 'y_f', 'z_f']
    
        mag_tmp = mag_tmp.groupby(['phone', 'secondSinceGpsEpoch'])['global_mx', 'global_my', 'global_mz'].mean().reset_index()
        mag_tmp.columns = ['phone', 'secondSinceGpsEpoch', 'global_mx', 'global_my', 'global_mz']    

        df_tmp = df_tmp.merge(accel_tmp, on=['phone', 'secondSinceGpsEpoch'], how='left')
        df_tmp = df_tmp.merge(mag_tmp, on=['phone', 'secondSinceGpsEpoch'], how='left')
        
        start_mean_range = 10
        x_start_mean = df_tmp[:start_mean_range]["x_f"].mean()
        y_start_mean = df_tmp[:start_mean_range]["y_f"].mean()
        z_start_mean = df_tmp[:start_mean_range]["z_f"].mean() 

        #     roll and picth, device tilt
        r = atan(y_start_mean/z_start_mean)
        p = atan(x_start_mean/(y_start_mean**2 + z_start_mean**2)**0.5)

    #     calculation　degrees

        df_tmp["calc1"] = df_tmp["global_mx"]*cos(p) + df_tmp["global_my"]*sin(r)*sin(p) + df_tmp["global_mz"]*sin(p)*cos(r)
        df_tmp["calc2"] = df_tmp["global_mz"]*sin(r) - df_tmp["global_my"]*cos(r)
        df_tmp["calc_deg"] = df_tmp.apply(calc3, axis=1)
        output_df = output_df.append(df_tmp)
    return output_df

In [14]:
# directory setting
nb_name = get_nb_name()
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = '../output/' + nb_name
os.makedirs(OUTPUT, exist_ok=True)

# データ読み込み

In [15]:
accel_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalAccel.csv')
mag_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalMag.csv')
accel_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalAccel.csv')
mag_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalMag.csv')
train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')
ground_truth = pd.read_csv(INPUT + '/prep/ground_truth_train.csv')

ground_truth = ground_truth.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
gt = ground_truth[['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg_gt', 'lngDeg_gt', 'speedMps', 'courseDegree']].copy()
train = train.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')

In [16]:
train = train[train['collectionName'].isin(['2021-04-22-US-SJC-1', '2021-04-28-US-SJC-1', '2021-04-29-US-SJC-2'])].copy()
accel_train = accel_train[accel_train['collectionName'].isin(['2021-04-22-US-SJC-1', '2021-04-28-US-SJC-1', '2021-04-29-US-SJC-2'])].copy()
mag_train = mag_train[mag_train['collectionName'].isin(['2021-04-22-US-SJC-1', '2021-04-28-US-SJC-1', '2021-04-29-US-SJC-2'])].copy()

# IMUからdegを算出

In [17]:
train = calc_degree_by_imu(train, accel_train, mag_train)
test = calc_degree_by_imu(test, accel_test, mag_test)

# 座標移動からdegを算出

In [18]:
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

train = calc_degree_by_gt(train)

In [19]:
train

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,latDeg_gt,lngDeg_gt,speedMps,...,x_f,y_f,z_f,global_mx,global_my,global_mz,calc1,calc2,calc_deg,deg
0,2021-04-22-US-SJC-1,Pixel4,1303160575446,37.334582,-121.899408,-10.66,2021-04-22-US-SJC-1_Pixel4,37.334581,-121.899447,0.0,...,-0.883052,-1.092551,9.623801,17.365500,-8.428595,-13.403579,18.417843,9.885714,28.224527,
1,2021-04-22-US-SJC-1,Pixel4,1303160576446,37.334603,-121.899416,-6.76,2021-04-22-US-SJC-1_Pixel4,37.334581,-121.899447,0.0,...,-0.884626,-1.093956,9.636221,17.372530,-8.406914,-13.409430,18.425594,9.864831,28.164035,
2,2021-04-22-US-SJC-1,Pixel4,1303160577446,37.334579,-121.899418,-10.73,2021-04-22-US-SJC-1_Pixel4,37.334581,-121.899447,0.0,...,-0.886610,-1.095938,9.653419,16.811996,-8.394304,-13.394147,17.866129,9.850578,28.870361,
3,2021-04-22-US-SJC-1,Pixel4,1303160578446,37.334595,-121.899419,-6.35,2021-04-22-US-SJC-1_Pixel4,37.334581,-121.899447,0.0,...,-0.887805,-1.097101,9.666013,16.556716,-8.393764,-13.389936,17.611532,9.849568,29.216892,
4,2021-04-22-US-SJC-1,Pixel4,1303160579446,37.334590,-121.899406,-19.56,2021-04-22-US-SJC-1_Pixel4,37.334581,-121.899447,0.0,...,-0.888977,-1.097872,9.675446,16.458759,-8.395289,-13.388312,17.513818,9.850899,29.356205,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14508,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760315000,37.334460,-121.899600,-8.09,2021-04-29-US-SJC-2_SamsungS20Ultra,37.334475,-121.899613,0.0,...,-1.672781,-1.526907,9.521529,21.327522,-14.066301,-16.699575,23.334331,16.524637,35.304922,
14509,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760316000,37.334472,-121.899583,-7.59,2021-04-29-US-SJC-2_SamsungS20Ultra,37.334475,-121.899613,0.0,...,,,,21.285161,-13.987993,-17.042068,23.348543,16.501340,35.250362,
14510,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760317000,37.334491,-121.899597,-8.35,2021-04-29-US-SJC-2_SamsungS20Ultra,37.334475,-121.899613,0.0,...,,,,21.284410,-13.937363,-17.277281,23.386202,16.488450,35.185764,176.524924
14511,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760318000,37.334495,-121.899583,-8.73,2021-04-29-US-SJC-2_SamsungS20Ultra,37.334475,-121.899613,0.0,...,,,,21.293950,-13.877580,-17.416249,23.419060,16.451340,35.087141,


In [20]:
train_calc_deg = train[['phone', 'millisSinceGpsEpoch', 'x_f', 'y_f', 'z_f', 'global_mx', 'global_my', 'global_mz', 'calc_deg', 'courseDegree']].copy()
test_calc_deg = test[['phone', 'millisSinceGpsEpoch', 'x_f', 'y_f', 'z_f', 'global_mx', 'global_my', 'global_mz', 'calc_deg']].copy()

In [21]:
train = train[train['speedMps']>0].copy()

# ML

In [22]:
def add_features(df):
    for c,i in itertools.product(['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM', 'calc_deg'], [1,2,3,4,5,-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        df[col] = df[c].shift(i)
        df[col+'_diff'] = df[c] - df[col]
        df.loc[df['phone']!=df['phone'].shift(i), [col, col+'_diff']] = np.nan
    
    for c in ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM', 'calc_deg']:
        df[c+'_s1_diff_sum'] = df[c+'_s1_diff'].fillna(0) + df[c+'_s-1_diff'].fillna(0)
        df[c+'_s2_diff_sum'] = df[c+'_s1_diff_sum'] + df[c+'_s2_diff'].fillna(0) + df[c+'_s-2_diff'].fillna(0)
        df[c+'_s3_diff_sum'] = df[c+'_s2_diff_sum'] + df[c+'_s3_diff'].fillna(0) + df[c+'_s-3_diff'].fillna(0)
        df[c+'_s4_diff_sum'] = df[c+'_s3_diff_sum'] + df[c+'_s4_diff'].fillna(0) + df[c+'_s-4_diff'].fillna(0)
        df[c+'_s5_diff_sum'] = df[c+'_s4_diff_sum'] + df[c+'_s5_diff'].fillna(0) + df[c+'_s-5_diff'].fillna(0)
    
    return df

In [23]:
def add_sensor_features(df, accel, gyro, mag, ori):
    # phoneを追加
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    accel['phone'] = accel['collectionName'] + '_' + accel['phoneName']
    gyro['phone'] = gyro['collectionName'] + '_' + gyro['phoneName']
    mag['phone'] = mag['collectionName'] + '_' + mag['phoneName']
    ori['phone'] = ori['collectionName'] + '_' + ori['phoneName']
    
     # 一定の値しか入っていないphoneを除外しておく
    ori = ori[~ori['phone'].isin(['2021-04-29-US-MTV-1_SamsungS20Ultra', '2021-04-28-US-MTV-1_SamsungS20Ultra', '2021-04-28-US-SJC-1_SamsungS20Ultra', '2021-04-29-US-SJC-2_SamsungS20Ultra',
                                 '2021-04-28-US-MTV-2_SamsungS20Ultra', '2021-04-29-US-SJC-3_SamsungS20Ultra', '2021-04-29-US-MTV-2_SamsungS20Ultra'])]
    
    
    # phonenameをラベルエンコーディング
    phoneName_map = {'Pixel4':1, 'Pixel4XLModded':2, 'Pixel4XL':3, 'Mi8':4, 'Pixel4Modded':5, 'Pixel5':6, 'SamsungS20Ultra':7}
    df['phoneName_le'] = df['phoneName'].map(phoneName_map)
    
    # utc -> gps
    accel['millisSinceGpsEpoch'] = accel['utcTimeMillis'] - 315964800000 + 18000
    gyro['millisSinceGpsEpoch'] = gyro['utcTimeMillis'] - 315964800000 + 18000
    mag['millisSinceGpsEpoch'] = mag['utcTimeMillis'] - 315964800000 + 18000
    ori['millisSinceGpsEpoch'] = ori['utcTimeMillis'] - 315964800000 + 18000
    
    # resampling追加
    df['secondSinceGpsEpoch'] = df['millisSinceGpsEpoch'] // 1000
    accel['secondSinceGpsEpoch'] = accel['millisSinceGpsEpoch'] // 1000
    gyro['secondSinceGpsEpoch'] = gyro['millisSinceGpsEpoch'] // 1000
    mag['secondSinceGpsEpoch'] = mag['millisSinceGpsEpoch'] // 1000
    ori['secondSinceGpsEpoch'] = ori['millisSinceGpsEpoch'] // 1000
    
    # clipping
    accel[['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2']] = accel.groupby('phone')['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    gyro[['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec']] = gyro.groupby('phone')['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    mag[['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT']] = mag.groupby('phone')['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    
    accel = accel.groupby(['phone', 'secondSinceGpsEpoch'])['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2'].agg(['mean', 'std']).reset_index()
    accel.columns = ['phone', 'secondSinceGpsEpoch', 'UncalAccelXMps2_mean', 'UncalAccelXMps2_std', 'UncalAccelYMps2_mean', 'UncalAccelYMps2_std', 'UncalAccelZMps2_mean', 'UncalAccelZMps2_std']
    gyro = gyro.groupby(['phone', 'secondSinceGpsEpoch'])['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec'].agg(['mean', 'std']).reset_index()
    gyro.columns = ['phone', 'secondSinceGpsEpoch', 'UncalGyroXRadPerSec_mean', 'UncalGyroXRadPerSec_std', 'UncalGyroYRadPerSec_mean', 'UncalGyroYRadPerSec_std', 'UncalGyroZRadPerSec_mean', 'UncalGyroZRadPerSec_std' ]
    mag = mag.groupby(['phone', 'secondSinceGpsEpoch'])['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT'].agg(['mean', 'std']).reset_index()
    mag.columns = ['phone', 'secondSinceGpsEpoch', 'UncalMagXMicroT_mean', 'UncalMagXMicroT_std', 'UncalMagYMicroT_mean', 'UncalMagYMicroT_std', 'UncalMagZMicroT_mean', 'UncalMagZMicroT_std']
    ori = ori.groupby(['phone', 'secondSinceGpsEpoch'])['yawDeg', 'rollDeg', 'pitchDeg'].agg(['mean', 'std']).reset_index()
    ori.columns = ['phone', 'secondSinceGpsEpoch', 'yawDeg_mean', 'yawDeg_std', 'rollDeg_mean', 'rollDeg_std', 'pitchDeg_mean', 'pitchDeg_std']
    
    
    # shift特徴量
    for c, i in itertools.product(['UncalAccelXMps2_mean', 'UncalAccelXMps2_std', 'UncalAccelYMps2_mean', 'UncalAccelYMps2_std', 'UncalAccelZMps2_mean', 'UncalAccelZMps2_std'], [1,2,3,4,5-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        accel[col] = accel[c].shift(i)
        accel[col+'_diff'] = accel[c] - accel[col]
        accel.loc[accel['phone']!=accel['phone'].shift(i), [col, col+'_diff']] = np.nan
    for c, i in itertools.product(['UncalGyroXRadPerSec_mean', 'UncalGyroXRadPerSec_std', 'UncalGyroYRadPerSec_mean', 'UncalGyroYRadPerSec_std', 'UncalGyroZRadPerSec_mean', 'UncalGyroZRadPerSec_std'], [1,2,3,4,5-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        gyro[col] = gyro[c].shift(i)
        gyro[col+'_diff'] = gyro[c] - gyro[col]
        gyro.loc[gyro['phone']!=gyro['phone'].shift(i), [col, col+'_diff']] = np.nan
    for c, i in itertools.product(['UncalMagXMicroT_mean', 'UncalMagXMicroT_std', 'UncalMagYMicroT_mean', 'UncalMagYMicroT_std', 'UncalMagZMicroT_mean', 'UncalMagZMicroT_std'], [1,2,3,4,5-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        mag[col] = mag[c].shift(i)
        mag[col+'_diff'] = mag[c] - mag[col]
        mag.loc[mag['phone']!=mag['phone'].shift(i), [col, col+'_diff']] = np.nan
    for c, i in itertools.product(['yawDeg_mean', 'yawDeg_std', 'rollDeg_mean', 'rollDeg_std', 'pitchDeg_mean', 'pitchDeg_std'], [1,2,3,-1,-2,-3]):
        col = c+ '_s' + str(i)
        ori[col] = ori[c].shift(i)
        ori[col+'_diff'] = ori[c] - ori[col]
        ori.loc[ori['phone']!=ori['phone'].shift(i), [col, col+'_diff']] = np.nan        
    
    df = df.merge(accel, on=['phone', 'secondSinceGpsEpoch'], how='left')
    df = df.merge(gyro, on=['phone', 'secondSinceGpsEpoch'], how='left')
    df = df.merge(mag, on=['phone', 'secondSinceGpsEpoch'], how='left')
    df = df.merge(ori, on=['phone', 'secondSinceGpsEpoch'], how='left')
    
    df.drop(['secondSinceGpsEpoch'], axis=1, inplace=True)
    
    return df

In [24]:
train, test, sub, gt = get_data()
accel_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalAccel.csv')
gyro_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalGyro.csv')
mag_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalMag.csv')
ori_train = pd.read_csv(INPUT + '/prep/gnss/train/OrientationDeg.csv')
#accel_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalAccel.csv')
#gyro_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalGyro.csv')
#mag_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalMag.csv')
#ori_test = pd.read_csv(INPUT + '/prep/gnss/test/OrientationDeg.csv')

In [25]:
train = train.merge(train_calc_deg, on=['phone', 'millisSinceGpsEpoch'], how='inner')
#test = test.merge(test_calc_deg, on='phone', how='left')
train = add_features(train)
train = add_sensor_features(train, accel_train, gyro_train, mag_train, ori_train)
#test = add_features(test)
#test = add_sensor_features(test, accel_test, gyro_test, mag_test, ori_test)

In [26]:
# calc_degの結果があまりに悪いものは除外

reject_phone = []

# lgb_tuner

In [27]:
target1 = 'courseDegree'
not_use_cols = ['speedMps', 'courseDegree', 'collectionName', 'phoneName', 'phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM',
                'latDeg_s1', 'latDeg_s2', 'latDeg_s3', 'latDeg_s-1', 'latDeg_s-2',
                'latDeg_s-3', 'lngDeg_s1', 'lngDeg_s2', 'lngDeg_s3', 'lngDeg_s-1',
                'lngDeg_s-2', 'lngDeg_s-3', 'heightAboveWgs84EllipsoidM_s1',
                'heightAboveWgs84EllipsoidM_s2', 'heightAboveWgs84EllipsoidM_s3',
                'heightAboveWgs84EllipsoidM_s-1', 'heightAboveWgs84EllipsoidM_s-2',
                'heightAboveWgs84EllipsoidM_s-3', target1]

features = [c for c in train.columns if c not in not_use_cols]

opt_params ={'objective': 'regression',
 'metric': 'rmse',
 'learning_rate': 0.1,
 'seed': 42,
 'feature_pre_filter': False,
 'lambda_l1': 5.842025753503934e-07,
 'lambda_l2': 8.524849124678289,
 'num_leaves': 2,
 'feature_fraction': 0.6,
 'bagging_fraction': 0.6666246786476029,
 'bagging_freq': 7,
 'min_child_samples': 100,
 'num_iterations': 20000,
 'early_stopping_round': 100}

In [29]:
collections = ['2021-04-22-US-SJC-1', '2021-04-29-US-SJC-2', '2021-04-28-US-SJC-1']

oof1 = pd.DataFrame()
imp1 = pd.DataFrame()
test_preds = np.zeros(len(test))
n = len(collections)

for collection in collections:
    print('valid : ', collection)
    tr_idx = train[train['collectionName']!=collection].index
    vl_idx = train[train['collectionName']==collection].index
    tr_x, tr_y = train[features].iloc[tr_idx], train[target1].iloc[tr_idx]
    vl_x, vl_y = train[features].iloc[vl_idx], train[target1].iloc[vl_idx]
    tr_data = lgb.Dataset(tr_x, label=tr_y)
    vl_data = lgb.Dataset(vl_x, label=vl_y)
    
    model = lgb.train(opt_params, tr_data, valid_sets=[tr_data, vl_data],
                      num_boost_round=20000, early_stopping_rounds=100, verbose_eval=100)
    vl_pred = model.predict(vl_x, num_iteration=model.best_iteration)
    
    oof_tmp = train.iloc[vl_idx].copy()
    oof_tmp['pred'] = vl_pred
    oof1 = oof1.append(oof_tmp)
    
    imp_tmp = pd.DataFrame()
    imp_tmp['feature'] = model.feature_name()
    imp_tmp['importance'] = model.feature_importance()
    imp_tmp['valid_collection'] = collection
    imp1 = imp1.append(imp_tmp)
    
    #pred = model.predict(test[features], num_iteration=model.best_iteration)
    #test_preds += pred / n
#test['pred'] = test_preds

valid :  2021-04-22-US-SJC-1
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120618
[LightGBM] [Info] Number of data points in the train set: 8797, number of used features: 474
[LightGBM] [Info] Start training from score 191.415975
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 10.2088	valid_1's rmse: 193.112
Early stopping, best iteration is:
[1]	training's rmse: 98.1656	valid_1's rmse: 118.959
valid :  2021-04-29-US-SJC-2
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120618
[LightGBM] [Info] Number of data points in the train set: 9813, number of used features: 474
[LightGBM] [Info] Start training from score 214.910233
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 63.6026	valid_1's rmse: 79.3467
[200]	training's rmse: 54.9681	valid_1's rmse: 78.2564
[300]	training's rmse: 49.9	valid_1's rmse: 76.4254
[400]	training's rmse: 46.480

# 可視化・評価

In [30]:
for phone in oof1['phone'].unique():

    fig, axes = plt.subplots(figsize=(30, 10), nrows=1,sharex=True)
    tmp = oof1[oof1['phone']==phone].copy()
    
    axes.plot(tmp['millisSinceGpsEpoch'], tmp['courseDegree'], label='gt')
    axes.plot(tmp['millisSinceGpsEpoch'], tmp['calc_deg'], label='calc')
    axes.plot(tmp['millisSinceGpsEpoch'], tmp['pred'], label='pred')
    axes.legend(loc='upper right')
    axes.grid(color='g', linestyle=':', linewidth=0.3)

    fig.suptitle(phone, fontsize=16)
    fig.savefig(OUTPUT + '/' + phone + '.png')
    plt.close()