# exp051
deg計算にセンサ＋baselineの特徴量加えて機械学習で予測

In [1]:
import os
import pandas as pd
import numpy as np
import ipynb_path
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
import plotly
import plotly.express as px
%matplotlib inline
pd.set_option('display.max_rows', 200)
from math import * 
import warnings
warnings.simplefilter('ignore')
import pathlib
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score

In [2]:
def get_nb_name():
    nb_path = ipynb_path.get()
    nb_name = nb_path.rsplit('/',1)[1].replace('.ipynb','')
    return nb_name

In [3]:
# ground_truth
def get_ground_truth():
    p = pathlib.Path(INPUT)
    gt_files = list(p.glob('train/*/*/ground_truth.csv'))

    gts = []
    for gt_file in gt_files:
        gts.append(pd.read_csv(gt_file))
    ground_truth = pd.concat(gts)

    return ground_truth

In [4]:
def get_data():
    base_train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
    base_test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')
    sample_sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv')
    ground_truth = get_ground_truth()
    return base_train, base_test, sample_sub, ground_truth

In [5]:
def visualize_trafic(df, center, zoom=9):
    fig = px.scatter_mapbox(df,
                            
                            # Here, plotly gets, (x,y) coordinates
                            lat="latDeg",
                            lon="lngDeg",
                            
                            #Here, plotly detects color of series
                            color="phoneName",
                            labels="phoneName",
                            
                            zoom=zoom,
                            center=center,
                            height=1000,
                            width=2000)
    fig.update_layout(mapbox_style='stamen-terrain')
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_layout(title_text="GPS trafic")
    fig.show()

In [6]:
def visualize_collection(df, collection):
    target_df = df[df['collectionName']==collection].copy()
    lat_center = target_df['latDeg'].mean()
    lng_center = target_df['lngDeg'].mean()
    center = {"lat":lat_center, "lon":lng_center}
    
    visualize_trafic(target_df, center)

In [7]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

In [8]:
# lowpass filter

from scipy.signal import butter, lfilter

def butter_lowpass(cutoff, fs, order=5):
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return b, a

def butter_lowpass_filter(data, cutoff, fs, order=5):
    b, a = butter_lowpass(cutoff, fs, order=order)
    y = lfilter(b, a, data)
    return y

order = 3
fs = 50.0
cutoff = 2.5

In [9]:
# Offset correction
# refarence https://github.com/J-ROCKET-BOY/SS-Fitting

def SS_fit(data) : 

    x = data[:,[0]]
    y = data[:,[1]]
    z = data[:,[2]]

    data_len = len(x)
    
    x2 = np.power(x,2)
    y2 = np.power(y,2)
    z2 = np.power(z,2)

    r1 = -x*(x2+y2+z2)
    r2= -y*(x2+y2+z2)
    r3 = -z*(x2+y2+z2)
    r4 = -(x2+y2+z2)

    left = np.array([[np.sum(x2),np.sum(x*y),np.sum(x*z),np.sum(x)],
                     [np.sum(x*y),np.sum(y2),np.sum(y*z),np.sum(y)],
                     [np.sum(x*z),np.sum(y*z),np.sum(z2),np.sum(z)],
                     [np.sum(x), np.sum(y), np.sum(z), data_len]])
    
    right = np.array([np.sum(r1),
                      np.sum(r2),
                      np.sum(r3),
                      np.sum(r4)])
    
    si = np.dot(np.linalg.inv(left),right)

    x0 = (-1/2)* si[0]
    y0 = (-1/2)* si[1]
    z0 = (-1/2)* si[2]
    
    return np.array([x0,y0,z0])

In [10]:
# Vincenty's formulae
# refarence https://qiita.com/r-fuji/items/99ca549b963cedc106ab

def vincenty_inverse(lat1, lon1, lat2, lon2):

    # Not advanced
    if isclose(lat1, lat2) and isclose(lon1, lon2):
        return False
    
    # WGS84
    a = 6378137.0
    ƒ = 1 / 298.257223563
    b = (1 - ƒ) * a

    lat_1 = atan((1 - ƒ) * tan(radians(lat1)))
    lat_2 = atan((1 - ƒ) * tan(radians(lat2)))
    
    lon_diff = radians(lon2) - radians(lon1)
    λ = lon_diff

    for i in range(1000):
        sinλ = sin(λ)
        cosλ = cos(λ)
        sinσ = sqrt((cos(lat_2) * sinλ) ** 2 + (cos(lat_1) * sin(lat_2) - sin(lat_1) * cos(lat_2) * cosλ) ** 2)
        cosσ = sin(lat_1) * sin(lat_2) + cos(lat_1) * cos(lat_2) * cosλ
        σ = atan2(sinσ, cosσ)
        sinα = cos(lat_1) * cos(lat_2) * sinλ / sinσ
        cos2α = 1 - sinα ** 2
        cos2σm = cosσ - 2 * sin(lat_1) * sin(lat_2) / cos2α
        C = ƒ / 16 * cos2α * (4 + ƒ * (4 - 3 * cos2α))
        λʹ = λ
        λ = lon_diff + (1 - C) * ƒ * sinα * (σ + C * sinσ * (cos2σm + C * cosσ * (-1 + 2 * cos2σm ** 2)))
        
        if abs(λ - λʹ) <= 1e-12:
            break
    else:
        return None

    α = atan2(cos(lat_2) * sinλ, cos(lat_1) * sin(lat_2) - sin(lat_1) * cos(lat_2) * cosλ)

    if α < 0:
        α = α + pi * 2

    return degrees(α)

In [11]:
def calc3(row):
    deg = - degrees(atan2(-1*row['calc2'],row['calc1']))
    if deg < 0:
        deg += 360
    return deg 

In [12]:
def calc_degree_by_gt(df):
    phones = df['phone'].unique()
    df['deg'] = np.nan
    
    for idx in range(len(df)-1):
        if df.at[idx, 'phone'] != df.at[idx+1, 'phone']:
            continue
            
        lat = df.at[idx, 'latDeg_gt']
        lng = df.at[idx, 'lngDeg_gt']
        lat_next = df.at[idx+1, 'latDeg_gt']
        lng_next = df.at[idx+1, 'lngDeg_gt']
        
        res = vincenty_inverse(lat, lng, lat_next, lng_next)
        if res:
            df.at[idx, 'deg'] = res
    
    return df

In [13]:
def calc_degree_by_imu(df, accel, mag):
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    accel['phone'] = accel['collectionName'] + '_' + accel['phoneName']
    mag['phone'] = mag['collectionName'] + '_' + mag['phoneName']
    
    # utc -> gps
    accel['millisSinceGpsEpoch'] = accel['utcTimeMillis'] - 315964800000 + 18000
    mag['millisSinceGpsEpoch'] = mag['utcTimeMillis'] - 315964800000 + 18000
    
    # resampling追加
    df['secondSinceGpsEpoch'] = df['millisSinceGpsEpoch'] // 1000
    accel['secondSinceGpsEpoch'] = accel['millisSinceGpsEpoch'] // 1000
    mag['secondSinceGpsEpoch'] = mag['millisSinceGpsEpoch'] // 1000
    
    # clipping
    accel[['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2']] = accel.groupby('phone')['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    mag[['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT']] = mag.groupby('phone')['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    
    #     acce filtering and smooting
    accel["global_x"] = accel["UncalAccelZMps2"]
    accel["global_y"] = accel["UncalAccelXMps2"]
    accel["global_z"] = accel["UncalAccelYMps2"]
    accel["x_f"] = butter_lowpass_filter(accel["global_x"], cutoff, fs, order)
    accel["y_f"] = butter_lowpass_filter(accel["global_y"], cutoff, fs, order)
    accel["z_f"] = butter_lowpass_filter(accel["global_z"], cutoff, fs, order)
    
    mag["global_mx"] = mag["UncalMagZMicroT"]
    mag["global_my"] = mag["UncalMagYMicroT"]
    mag["global_mz"] = mag["UncalMagXMicroT"]

    
    output_df = pd.DataFrame()
    for phone in mag['phone'].unique():
        df_tmp = df[df['phone']==phone]
        accel_tmp = accel[accel['phone']==phone].copy()
        mag_tmp = mag[mag['phone']==phone].copy()
    
        smooth_range = 1000
        accel_tmp["x_f"] = accel_tmp.groupby('phone')["x_f"].rolling(smooth_range, center=True, min_periods=1).mean().values
        accel_tmp["y_f"] = accel_tmp.groupby('phone')["y_f"].rolling(smooth_range, center=True, min_periods=1).mean().values
        accel_tmp["z_f"] = accel_tmp.groupby('phone')["z_f"].rolling(smooth_range, center=True, min_periods=1).mean().values

        mag_tmp["global_mx"] = mag_tmp.groupby('phone')["global_mx"].rolling(smooth_range,  min_periods=1).mean().values
        mag_tmp["global_my"] = mag_tmp.groupby('phone')["global_mz"].rolling(smooth_range,  min_periods=1).mean().values
        mag_tmp["global_mz"] = mag_tmp.groupby('phone')["global_my"].rolling(smooth_range,  min_periods=1).mean().values

        offset = SS_fit(np.array(mag_tmp[["global_mx","global_my","global_mz"]]))
        mag_tmp["global_mx"] = (mag_tmp["global_mx"] - offset[0])*-1
        mag_tmp["global_my"] = mag_tmp["global_my"] - offset[1]
        mag_tmp["global_mz"] = mag_tmp["global_mz"] - offset[2]
        
        accel_tmp = accel_tmp.groupby(['phone', 'secondSinceGpsEpoch'])['x_f', 'y_f', 'z_f'].mean().reset_index()
        accel_tmp.columns = ['phone', 'secondSinceGpsEpoch', 'x_f', 'y_f', 'z_f']
    
        mag_tmp = mag_tmp.groupby(['phone', 'secondSinceGpsEpoch'])['global_mx', 'global_my', 'global_mz'].mean().reset_index()
        mag_tmp.columns = ['phone', 'secondSinceGpsEpoch', 'global_mx', 'global_my', 'global_mz']    

        df_tmp = df_tmp.merge(accel_tmp, on=['phone', 'secondSinceGpsEpoch'], how='left')
        df_tmp = df_tmp.merge(mag_tmp, on=['phone', 'secondSinceGpsEpoch'], how='left')
        
        start_mean_range = 10
        x_start_mean = df_tmp[:start_mean_range]["x_f"].mean()
        y_start_mean = df_tmp[:start_mean_range]["y_f"].mean()
        z_start_mean = df_tmp[:start_mean_range]["z_f"].mean() 

        #     roll and picth, device tilt
        r = atan(y_start_mean/z_start_mean)
        p = atan(x_start_mean/(y_start_mean**2 + z_start_mean**2)**0.5)

    #     calculation　degrees

        df_tmp["calc1"] = df_tmp["global_mx"]*cos(p) + df_tmp["global_my"]*sin(r)*sin(p) + df_tmp["global_mz"]*sin(p)*cos(r)
        df_tmp["calc2"] = df_tmp["global_mz"]*sin(r) - df_tmp["global_my"]*cos(r)
        df_tmp["calc_deg"] = df_tmp.apply(calc3, axis=1)
        output_df = output_df.append(df_tmp)
    return output_df

In [14]:
# directory setting
nb_name = get_nb_name()
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = '../output/' + nb_name
os.makedirs(OUTPUT, exist_ok=True)

# データ読み込み

In [15]:
accel_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalAccel.csv')
mag_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalMag.csv')
accel_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalAccel.csv')
mag_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalMag.csv')
train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')
ground_truth = pd.read_csv(INPUT + '/prep/ground_truth_train.csv')

ground_truth = ground_truth.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
gt = ground_truth[['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg_gt', 'lngDeg_gt', 'speedMps', 'courseDegree']].copy()
train = train.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')

# IMUからdegを算出

In [16]:
train = calc_degree_by_imu(train, accel_train, mag_train)
test = calc_degree_by_imu(test, accel_test, mag_test)

# 座標移動からdegを算出

In [17]:
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

train = calc_degree_by_gt(train)

In [18]:
train

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,latDeg_gt,lngDeg_gt,speedMps,...,x_f,y_f,z_f,global_mx,global_my,global_mz,calc1,calc2,calc_deg,deg
0,2020-05-21-US-MTV-2,Pixel4,1274131364434,37.628920,-122.426283,-6.52,2020-05-21-US-MTV-2_Pixel4,37.628927,-122.426295,0.0,...,0.743162,-0.648938,9.677418,-21.300935,-2.563242,-6.308718,-21.715691,2.966373,172.221514,
1,2020-05-21-US-MTV-2,Pixel4,1274131365434,37.628876,-122.426267,1.82,2020-05-21-US-MTV-2_Pixel4,37.628927,-122.426295,0.0,...,0.756263,-0.629775,9.756054,-22.126694,-2.557740,-6.296307,-22.537969,2.960079,172.517745,
2,2020-05-21-US-MTV-2,Pixel4,1274131366434,37.628894,-122.426274,-0.82,2020-05-21-US-MTV-2_Pixel4,37.628927,-122.426295,0.0,...,0.751826,-0.629631,9.755231,-22.311710,-2.595542,-6.302302,-22.722693,2.998190,172.483416,
3,2020-05-21-US-MTV-2,Pixel4,1274131367434,37.628883,-122.426266,1.03,2020-05-21-US-MTV-2_Pixel4,37.628927,-122.426295,0.0,...,0.747335,-0.630461,9.754735,-22.513921,-2.631065,-6.313635,-22.924988,3.034373,172.460105,
4,2020-05-21-US-MTV-2,Pixel4,1274131368434,37.628909,-122.426295,-3.86,2020-05-21-US-MTV-2_Pixel4,37.628927,-122.426295,0.0,...,0.747015,-0.631099,9.755063,-22.456317,-2.654004,-6.327174,-22.868504,3.058140,172.383188,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122070,2020-05-29-US-MTV-2,Pixel4XL,1274832059447,37.416390,-122.078071,-26.83,2020-05-29-US-MTV-2_Pixel4XL,37.415923,-122.080712,0.0,...,,,,8.396728,23.438402,15.720410,11.400012,-22.770169,296.595093,
122071,2020-05-29-US-MTV-2,Pixel4XL,1274832060447,37.416390,-122.078071,-26.83,2020-05-29-US-MTV-2_Pixel4XL,37.415923,-122.080712,0.0,...,,,,8.458121,23.400291,15.718915,11.459717,-22.732151,296.753556,
122072,2020-05-29-US-MTV-2,Pixel4XL,1274832061447,37.416390,-122.078071,-26.83,2020-05-29-US-MTV-2_Pixel4XL,37.415923,-122.080712,0.0,...,,,,8.536251,23.342974,15.712876,11.534846,-22.675132,296.962452,
122073,2020-05-29-US-MTV-2,Pixel4XL,1274832062447,37.416390,-122.078071,-26.83,2020-05-29-US-MTV-2_Pixel4XL,37.415923,-122.080712,0.0,...,,,,8.650278,23.247316,15.701961,11.644003,-22.580006,297.279145,


In [19]:
train_calc_deg = train[['phone', 'millisSinceGpsEpoch', 'x_f', 'y_f', 'z_f', 'global_mx', 'global_my', 'global_mz', 'calc_deg', 'courseDegree']].copy()
test_calc_deg = test[['phone', 'millisSinceGpsEpoch', 'x_f', 'y_f', 'z_f', 'global_mx', 'global_my', 'global_mz', 'calc_deg']].copy()

In [34]:
# calc_degがあまりに悪いものは除外しておく
from sklearn.metrics import mean_squared_error
train_calc_deg = train_calc_deg.dropna(subset=['calc_deg'])
print('total : ', np.sqrt(mean_squared_error(train_calc_deg['courseDegree'], train_calc_deg['calc_deg'])))
phones = train_calc_deg['phone'].unique()
for phone in phones:
    tmp = train_calc_deg[train_calc_deg['phone']==phone].copy()
    print(phone + ' : ', np.sqrt(mean_squared_error(tmp['courseDegree'], tmp['calc_deg'])))

total :  84.5024899146548
2020-05-21-US-MTV-2_Pixel4 :  87.78717058376309
2020-05-21-US-MTV-2_Pixel4XL :  47.553805968522695
2020-06-11-US-MTV-1_Pixel4 :  53.490707520070636
2020-06-11-US-MTV-1_Pixel4XL :  37.38967507227319
2020-06-05-US-MTV-2_Pixel4 :  82.82281181335607
2020-06-05-US-MTV-2_Pixel4XL :  75.52969852883378
2020-07-17-US-MTV-1_Mi8 :  122.12343774367895
2021-04-22-US-SJC-1_Pixel4 :  197.61306859900571
2021-04-22-US-SJC-1_SamsungS20Ultra :  192.61064181914512
2020-05-14-US-MTV-2_Pixel4 :  37.986392947527335
2020-05-14-US-MTV-2_Pixel4XLModded :  165.42007293534763
2021-01-05-US-SVL-1_Pixel5 :  98.79141043277872
2021-01-05-US-SVL-1_Pixel4 :  97.3379811708587
2021-01-05-US-SVL-1_Pixel4XL :  95.50597771739793
2021-01-05-US-SVL-1_Mi8 :  52.551869475509335
2020-06-04-US-MTV-1_Pixel4 :  68.83339563474938
2020-06-04-US-MTV-1_Pixel4XL :  15.096926576958422
2020-06-04-US-MTV-1_Pixel4XLModded :  183.20118569355273
2021-01-05-US-SVL-2_Pixel4Modded :  107.99298283867732
2021-01-05-US-SVL

# ML

In [20]:
def add_features(df):
    for c,i in itertools.product(['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM', 'calc_deg'], [1,2,3,4,5,-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        df[col] = df[c].shift(i)
        df[col+'_diff'] = df[c] - df[col]
        df.loc[df['phone']!=df['phone'].shift(i), [col, col+'_diff']] = np.nan
    
    for c in ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM', 'calc_deg']:
        df[c+'_s1_diff_sum'] = df[c+'_s1_diff'].fillna(0) + df[c+'_s-1_diff'].fillna(0)
        df[c+'_s2_diff_sum'] = df[c+'_s1_diff_sum'] + df[c+'_s2_diff'].fillna(0) + df[c+'_s-2_diff'].fillna(0)
        df[c+'_s3_diff_sum'] = df[c+'_s2_diff_sum'] + df[c+'_s3_diff'].fillna(0) + df[c+'_s-3_diff'].fillna(0)
        df[c+'_s4_diff_sum'] = df[c+'_s3_diff_sum'] + df[c+'_s4_diff'].fillna(0) + df[c+'_s-4_diff'].fillna(0)
        df[c+'_s5_diff_sum'] = df[c+'_s4_diff_sum'] + df[c+'_s5_diff'].fillna(0) + df[c+'_s-5_diff'].fillna(0)
    
    return df

In [21]:
def add_sensor_features(df, accel, gyro, mag, ori):
    # phoneを追加
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    accel['phone'] = accel['collectionName'] + '_' + accel['phoneName']
    gyro['phone'] = gyro['collectionName'] + '_' + gyro['phoneName']
    mag['phone'] = mag['collectionName'] + '_' + mag['phoneName']
    ori['phone'] = ori['collectionName'] + '_' + ori['phoneName']
    
     # 一定の値しか入っていないphoneを除外しておく
    ori = ori[~ori['phone'].isin(['2021-04-29-US-MTV-1_SamsungS20Ultra', '2021-04-28-US-MTV-1_SamsungS20Ultra', '2021-04-28-US-SJC-1_SamsungS20Ultra', '2021-04-29-US-SJC-2_SamsungS20Ultra',
                                 '2021-04-28-US-MTV-2_SamsungS20Ultra', '2021-04-29-US-SJC-3_SamsungS20Ultra', '2021-04-29-US-MTV-2_SamsungS20Ultra'])]
    
    
    # phonenameをラベルエンコーディング
    phoneName_map = {'Pixel4':1, 'Pixel4XLModded':2, 'Pixel4XL':3, 'Mi8':4, 'Pixel4Modded':5, 'Pixel5':6, 'SamsungS20Ultra':7}
    df['phoneName_le'] = df['phoneName'].map(phoneName_map)
    
    # utc -> gps
    accel['millisSinceGpsEpoch'] = accel['utcTimeMillis'] - 315964800000 + 18000
    gyro['millisSinceGpsEpoch'] = gyro['utcTimeMillis'] - 315964800000 + 18000
    mag['millisSinceGpsEpoch'] = mag['utcTimeMillis'] - 315964800000 + 18000
    ori['millisSinceGpsEpoch'] = ori['utcTimeMillis'] - 315964800000 + 18000
    
    # resampling追加
    df['secondSinceGpsEpoch'] = df['millisSinceGpsEpoch'] // 1000
    accel['secondSinceGpsEpoch'] = accel['millisSinceGpsEpoch'] // 1000
    gyro['secondSinceGpsEpoch'] = gyro['millisSinceGpsEpoch'] // 1000
    mag['secondSinceGpsEpoch'] = mag['millisSinceGpsEpoch'] // 1000
    ori['secondSinceGpsEpoch'] = ori['millisSinceGpsEpoch'] // 1000
    
    # clipping
    accel[['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2']] = accel.groupby('phone')['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    gyro[['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec']] = gyro.groupby('phone')['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    mag[['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT']] = mag.groupby('phone')['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT'].transform(lambda x: x.clip(x.quantile(0.001), x.quantile(0.999)))
    
    accel = accel.groupby(['phone', 'secondSinceGpsEpoch'])['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2'].agg(['mean', 'std']).reset_index()
    accel.columns = ['phone', 'secondSinceGpsEpoch', 'UncalAccelXMps2_mean', 'UncalAccelXMps2_std', 'UncalAccelYMps2_mean', 'UncalAccelYMps2_std', 'UncalAccelZMps2_mean', 'UncalAccelZMps2_std']
    gyro = gyro.groupby(['phone', 'secondSinceGpsEpoch'])['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec'].agg(['mean', 'std']).reset_index()
    gyro.columns = ['phone', 'secondSinceGpsEpoch', 'UncalGyroXRadPerSec_mean', 'UncalGyroXRadPerSec_std', 'UncalGyroYRadPerSec_mean', 'UncalGyroYRadPerSec_std', 'UncalGyroZRadPerSec_mean', 'UncalGyroZRadPerSec_std' ]
    mag = mag.groupby(['phone', 'secondSinceGpsEpoch'])['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT'].agg(['mean', 'std']).reset_index()
    mag.columns = ['phone', 'secondSinceGpsEpoch', 'UncalMagXMicroT_mean', 'UncalMagXMicroT_std', 'UncalMagYMicroT_mean', 'UncalMagYMicroT_std', 'UncalMagZMicroT_mean', 'UncalMagZMicroT_std']
    ori = ori.groupby(['phone', 'secondSinceGpsEpoch'])['yawDeg', 'rollDeg', 'pitchDeg'].agg(['mean', 'std']).reset_index()
    ori.columns = ['phone', 'secondSinceGpsEpoch', 'yawDeg_mean', 'yawDeg_std', 'rollDeg_mean', 'rollDeg_std', 'pitchDeg_mean', 'pitchDeg_std']
    
    
    # shift特徴量
    for c, i in itertools.product(['UncalAccelXMps2_mean', 'UncalAccelXMps2_std', 'UncalAccelYMps2_mean', 'UncalAccelYMps2_std', 'UncalAccelZMps2_mean', 'UncalAccelZMps2_std'], [1,2,3,4,5-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        accel[col] = accel[c].shift(i)
        accel[col+'_diff'] = accel[c] - accel[col]
        accel.loc[accel['phone']!=accel['phone'].shift(i), [col, col+'_diff']] = np.nan
    for c, i in itertools.product(['UncalGyroXRadPerSec_mean', 'UncalGyroXRadPerSec_std', 'UncalGyroYRadPerSec_mean', 'UncalGyroYRadPerSec_std', 'UncalGyroZRadPerSec_mean', 'UncalGyroZRadPerSec_std'], [1,2,3,4,5-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        gyro[col] = gyro[c].shift(i)
        gyro[col+'_diff'] = gyro[c] - gyro[col]
        gyro.loc[gyro['phone']!=gyro['phone'].shift(i), [col, col+'_diff']] = np.nan
    for c, i in itertools.product(['UncalMagXMicroT_mean', 'UncalMagXMicroT_std', 'UncalMagYMicroT_mean', 'UncalMagYMicroT_std', 'UncalMagZMicroT_mean', 'UncalMagZMicroT_std'], [1,2,3,4,5-1,-2,-3,-4,-5]):
        col = c+ '_s' + str(i)
        mag[col] = mag[c].shift(i)
        mag[col+'_diff'] = mag[c] - mag[col]
        mag.loc[mag['phone']!=mag['phone'].shift(i), [col, col+'_diff']] = np.nan
    for c, i in itertools.product(['yawDeg_mean', 'yawDeg_std', 'rollDeg_mean', 'rollDeg_std', 'pitchDeg_mean', 'pitchDeg_std'], [1,2,3,-1,-2,-3]):
        col = c+ '_s' + str(i)
        ori[col] = ori[c].shift(i)
        ori[col+'_diff'] = ori[c] - ori[col]
        ori.loc[ori['phone']!=ori['phone'].shift(i), [col, col+'_diff']] = np.nan        
    
    df = df.merge(accel, on=['phone', 'secondSinceGpsEpoch'], how='left')
    df = df.merge(gyro, on=['phone', 'secondSinceGpsEpoch'], how='left')
    df = df.merge(mag, on=['phone', 'secondSinceGpsEpoch'], how='left')
    df = df.merge(ori, on=['phone', 'secondSinceGpsEpoch'], how='left')
    
    df.drop(['secondSinceGpsEpoch'], axis=1, inplace=True)
    
    return df

In [22]:
train, test, sub, gt = get_data()
accel_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalAccel.csv')
gyro_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalGyro.csv')
mag_train = pd.read_csv(INPUT + '/prep/gnss/train/UncalMag.csv')
ori_train = pd.read_csv(INPUT + '/prep/gnss/train/OrientationDeg.csv')
#accel_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalAccel.csv')
#gyro_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalGyro.csv')
#mag_test = pd.read_csv(INPUT + '/prep/gnss/test/UncalMag.csv')
#ori_test = pd.read_csv(INPUT + '/prep/gnss/test/OrientationDeg.csv')

In [23]:
train = train.merge(train_calc_deg, on=['phone', 'millisSinceGpsEpoch'], how='inner')
#test = test.merge(test_calc_deg, on='phone', how='left')
train = add_features(train)
train = add_sensor_features(train, accel_train, gyro_train, mag_train, ori_train)
#test = add_features(test)
#test = add_sensor_features(test, accel_test, gyro_test, mag_test, ori_test)

KeyError: 'calc_deg'

In [None]:
# calc_degの結果があまりに悪いものは除外

reject_phone = []

# lgb_tuner

In [24]:
target1 = 'courseDegree'
not_use_cols = ['speedMps', 'courseDegree', 'collectionName', 'phoneName', 'phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM',
                'latDeg_s1', 'latDeg_s2', 'latDeg_s3', 'latDeg_s-1', 'latDeg_s-2',
                'latDeg_s-3', 'lngDeg_s1', 'lngDeg_s2', 'lngDeg_s3', 'lngDeg_s-1',
                'lngDeg_s-2', 'lngDeg_s-3', 'heightAboveWgs84EllipsoidM_s1',
                'heightAboveWgs84EllipsoidM_s2', 'heightAboveWgs84EllipsoidM_s3',
                'heightAboveWgs84EllipsoidM_s-1', 'heightAboveWgs84EllipsoidM_s-2',
                'heightAboveWgs84EllipsoidM_s-3', target1]

features = [c for c in train.columns if c not in not_use_cols]

params = {'objective': 'regression',
          'metric': 'rmse',
          'learning_rate': 0.1, 
          'seed': 42}

In [25]:
collections = ['2021-04-22-US-SJC-1', '2021-04-29-US-SJC-2', '2021-04-28-US-SJC-1']

oof1 = pd.DataFrame()
imp1 = pd.DataFrame()
test_preds = np.zeros(len(test))
n = len(collections)

for collection in collections:
    print('valid : ', collection)
    tr_idx = train[train['collectionName']!=collection].index
    vl_idx = train[train['collectionName']==collection].index
    tr_x, tr_y = train[features].iloc[tr_idx], train[target1].iloc[tr_idx]
    vl_x, vl_y = train[features].iloc[vl_idx], train[target1].iloc[vl_idx]
    tr_data = lgb.Dataset(tr_x, label=tr_y)
    vl_data = lgb.Dataset(vl_x, label=vl_y)
    break
    
study = optuna_lgb.train(params, tr_data, valid_sets=[tr_data, vl_data],
                         num_boost_round=20000, early_stopping_rounds=100, verbose_eval=100)

valid :  2021-04-22-US-SJC-1


[32m[I 2021-06-22 05:46:43,558][0m A new study created in memory with name: no-name-f1cb41d8-4088-4e90-87e4-fd512faf1394[0m
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 119.089312:  14%|#4        | 1/7 [00:07<00:42,  7.16s/it][32m[I 2021-06-22 05:46:50,723][0m Trial 0 finished with value: 119.08931225229111 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 119.08931225229111.[0m
feature_fraction, val_score: 119.089312:  14%|#4        | 1/7 [00:07<00:42,  7.16s/it]

[100]	valid_0's rmse: 25.53	valid_1's rmse: 195.172
Early stopping, best iteration is:
[1]	valid_0's rmse: 102.029	valid_1's rmse: 119.089
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 119.089312:  29%|##8       | 2/7 [00:16<00:42,  8.47s/it][32m[I 2021-06-22 05:47:00,109][0m Trial 1 finished with value: 119.68967546995832 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 119.08931225229111.[0m
feature_fraction, val_score: 119.089312:  29%|##8       | 2/7 [00:16<00:42,  8.47s/it]

[100]	valid_0's rmse: 25.7436	valid_1's rmse: 190.854
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.965	valid_1's rmse: 119.69
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 119.089312:  43%|####2     | 3/7 [00:23<00:31,  7.77s/it][32m[I 2021-06-22 05:47:07,040][0m Trial 2 finished with value: 120.19179063240325 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 119.08931225229111.[0m
feature_fraction, val_score: 119.089312:  43%|####2     | 3/7 [00:23<00:31,  7.77s/it]

[100]	valid_0's rmse: 25.7724	valid_1's rmse: 189.906
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.776	valid_1's rmse: 120.192
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 119.089312:  57%|#####7    | 4/7 [00:28<00:20,  6.85s/it][32m[I 2021-06-22 05:47:12,486][0m Trial 3 finished with value: 119.28772421169076 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 119.08931225229111.[0m
feature_fraction, val_score: 119.089312:  57%|#####7    | 4/7 [00:28<00:20,  6.85s/it]

[100]	valid_0's rmse: 25.9449	valid_1's rmse: 195.456
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.955	valid_1's rmse: 119.288
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 119.089312:  71%|#######1  | 5/7 [00:34<00:12,  6.48s/it][32m[I 2021-06-22 05:47:18,307][0m Trial 4 finished with value: 119.93610314801192 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 119.08931225229111.[0m
feature_fraction, val_score: 119.089312:  71%|#######1  | 5/7 [00:34<00:12,  6.48s/it]

[100]	valid_0's rmse: 25.7954	valid_1's rmse: 190.445
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.997	valid_1's rmse: 119.936
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 119.047561:  86%|########5 | 6/7 [00:42<00:06,  6.80s/it][32m[I 2021-06-22 05:47:25,742][0m Trial 5 finished with value: 119.0475613771437 and parameters: {'feature_fraction': 0.4}. Best is trial 5 with value: 119.0475613771437.[0m
feature_fraction, val_score: 119.047561:  86%|########5 | 6/7 [00:42<00:06,  6.80s/it]

[100]	valid_0's rmse: 25.8771	valid_1's rmse: 192.651
Early stopping, best iteration is:
[1]	valid_0's rmse: 102.029	valid_1's rmse: 119.048
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 119.047561: 100%|##########| 7/7 [00:50<00:00,  7.45s/it][32m[I 2021-06-22 05:47:34,535][0m Trial 6 finished with value: 119.08341216980659 and parameters: {'feature_fraction': 0.7}. Best is trial 5 with value: 119.0475613771437.[0m
feature_fraction, val_score: 119.047561: 100%|##########| 7/7 [00:50<00:00,  7.28s/it]
num_leaves, val_score: 119.047561:   0%|          | 0/20 [00:00<?, ?it/s]

[100]	valid_0's rmse: 25.7504	valid_1's rmse: 191.582
Early stopping, best iteration is:
[1]	valid_0's rmse: 102.035	valid_1's rmse: 119.083
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:   5%|5         | 1/20 [00:09<03:08,  9.90s/it][32m[I 2021-06-22 05:47:44,435][0m Trial 7 finished with value: 119.68833360926388 and parameters: {'num_leaves': 72}. Best is trial 7 with value: 119.68833360926388.[0m
num_leaves, val_score: 119.047561:   5%|5         | 1/20 [00:09<03:08,  9.90s/it]

[100]	valid_0's rmse: 18.2776	valid_1's rmse: 194.19
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.416	valid_1's rmse: 119.688
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  10%|#         | 2/20 [00:24<03:47, 12.62s/it][32m[I 2021-06-22 05:47:58,958][0m Trial 8 finished with value: 119.75215044747308 and parameters: {'num_leaves': 189}. Best is trial 7 with value: 119.68833360926388.[0m
num_leaves, val_score: 119.047561:  10%|#         | 2/20 [00:24<03:47, 12.62s/it]

[100]	valid_0's rmse: 10.9174	valid_1's rmse: 192.205
Early stopping, best iteration is:
[1]	valid_0's rmse: 100.784	valid_1's rmse: 119.752
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  15%|#5        | 3/20 [00:39<03:50, 13.56s/it][32m[I 2021-06-22 05:48:13,642][0m Trial 9 finished with value: 119.75215044747308 and parameters: {'num_leaves': 187}. Best is trial 7 with value: 119.68833360926388.[0m
num_leaves, val_score: 119.047561:  15%|#5        | 3/20 [00:39<03:50, 13.56s/it]

[100]	valid_0's rmse: 10.8943	valid_1's rmse: 192.126
Early stopping, best iteration is:
[1]	valid_0's rmse: 100.789	valid_1's rmse: 119.752
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  20%|##        | 4/20 [00:55<03:53, 14.60s/it][32m[I 2021-06-22 05:48:29,841][0m Trial 10 finished with value: 119.70619842611067 and parameters: {'num_leaves': 229}. Best is trial 7 with value: 119.68833360926388.[0m
num_leaves, val_score: 119.047561:  20%|##        | 4/20 [00:55<03:53, 14.60s/it]

[100]	valid_0's rmse: 9.42069	valid_1's rmse: 192.776
Early stopping, best iteration is:
[1]	valid_0's rmse: 100.674	valid_1's rmse: 119.706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  25%|##5       | 5/20 [01:08<03:33, 14.21s/it][32m[I 2021-06-22 05:48:43,353][0m Trial 11 finished with value: 119.81065064719363 and parameters: {'num_leaves': 156}. Best is trial 7 with value: 119.68833360926388.[0m
num_leaves, val_score: 119.047561:  25%|##5       | 5/20 [01:08<03:33, 14.21s/it]

[100]	valid_0's rmse: 12.3207	valid_1's rmse: 191.102
Early stopping, best iteration is:
[1]	valid_0's rmse: 100.886	valid_1's rmse: 119.811
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  30%|###       | 6/20 [01:15<02:41, 11.57s/it][32m[I 2021-06-22 05:48:49,797][0m Trial 12 finished with value: 120.1469962058098 and parameters: {'num_leaves': 22}. Best is trial 7 with value: 119.68833360926388.[0m
num_leaves, val_score: 119.047561:  30%|###       | 6/20 [01:15<02:41, 11.57s/it]

[100]	valid_0's rmse: 29.0677	valid_1's rmse: 191.85
Early stopping, best iteration is:
[1]	valid_0's rmse: 102.348	valid_1's rmse: 120.147
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  35%|###5      | 7/20 [01:31<02:49, 13.01s/it][32m[I 2021-06-22 05:49:05,784][0m Trial 13 finished with value: 119.69375598891355 and parameters: {'num_leaves': 222}. Best is trial 7 with value: 119.68833360926388.[0m
num_leaves, val_score: 119.047561:  35%|###5      | 7/20 [01:31<02:49, 13.01s/it]

[100]	valid_0's rmse: 9.65384	valid_1's rmse: 192.069
Early stopping, best iteration is:
[1]	valid_0's rmse: 100.687	valid_1's rmse: 119.694
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  40%|####      | 8/20 [01:40<02:21, 11.77s/it][32m[I 2021-06-22 05:49:14,902][0m Trial 14 finished with value: 119.7213987357888 and parameters: {'num_leaves': 60}. Best is trial 7 with value: 119.68833360926388.[0m
num_leaves, val_score: 119.047561:  40%|####      | 8/20 [01:40<02:21, 11.77s/it]

[100]	valid_0's rmse: 20.0223	valid_1's rmse: 195.175
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.558	valid_1's rmse: 119.721
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  45%|####5     | 9/20 [01:51<02:07, 11.63s/it][32m[I 2021-06-22 05:49:26,229][0m Trial 15 finished with value: 119.68512094327106 and parameters: {'num_leaves': 78}. Best is trial 15 with value: 119.68512094327106.[0m
num_leaves, val_score: 119.047561:  45%|####5     | 9/20 [01:51<02:07, 11.63s/it]

[100]	valid_0's rmse: 17.8917	valid_1's rmse: 195.141
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.356	valid_1's rmse: 119.685
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 9.1676	valid_1's rmse: 193.004


num_leaves, val_score: 119.047561:  50%|#####     | 10/20 [02:25<03:04, 18.49s/it][32m[I 2021-06-22 05:50:00,078][0m Trial 16 finished with value: 119.70358533945152 and parameters: {'num_leaves': 238}. Best is trial 15 with value: 119.68512094327106.[0m
num_leaves, val_score: 119.047561:  50%|#####     | 10/20 [02:25<03:04, 18.49s/it]

Early stopping, best iteration is:
[1]	valid_0's rmse: 100.659	valid_1's rmse: 119.704
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  55%|#####5    | 11/20 [02:38<02:31, 16.83s/it][32m[I 2021-06-22 05:50:13,152][0m Trial 17 finished with value: 119.39968047199699 and parameters: {'num_leaves': 111}. Best is trial 17 with value: 119.39968047199699.[0m
num_leaves, val_score: 119.047561:  55%|#####5    | 11/20 [02:38<02:31, 16.83s/it]

[100]	valid_0's rmse: 14.9679	valid_1's rmse: 192.038
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.098	valid_1's rmse: 119.4
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  60%|######    | 12/20 [02:50<02:02, 15.25s/it][32m[I 2021-06-22 05:50:24,796][0m Trial 18 finished with value: 119.3978752343847 and parameters: {'num_leaves': 110}. Best is trial 18 with value: 119.3978752343847.[0m
num_leaves, val_score: 119.047561:  60%|######    | 12/20 [02:50<02:02, 15.25s/it]

[100]	valid_0's rmse: 15.1189	valid_1's rmse: 194.175
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.104	valid_1's rmse: 119.398
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  65%|######5   | 13/20 [03:02<01:40, 14.30s/it][32m[I 2021-06-22 05:50:36,900][0m Trial 19 finished with value: 119.38564377352468 and parameters: {'num_leaves': 117}. Best is trial 19 with value: 119.38564377352468.[0m
num_leaves, val_score: 119.047561:  65%|######5   | 13/20 [03:02<01:40, 14.30s/it]

[100]	valid_0's rmse: 14.3621	valid_1's rmse: 190.057
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.066	valid_1's rmse: 119.386
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  70%|#######   | 14/20 [03:15<01:23, 13.86s/it][32m[I 2021-06-22 05:50:49,741][0m Trial 20 finished with value: 119.50426904320656 and parameters: {'num_leaves': 131}. Best is trial 19 with value: 119.38564377352468.[0m
num_leaves, val_score: 119.047561:  70%|#######   | 14/20 [03:15<01:23, 13.86s/it]

[100]	valid_0's rmse: 13.7466	valid_1's rmse: 191.765
Early stopping, best iteration is:
[1]	valid_0's rmse: 100.991	valid_1's rmse: 119.504
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  75%|#######5  | 15/20 [03:27<01:07, 13.49s/it][32m[I 2021-06-22 05:51:02,389][0m Trial 21 finished with value: 119.399680471997 and parameters: {'num_leaves': 111}. Best is trial 19 with value: 119.38564377352468.[0m
num_leaves, val_score: 119.047561:  75%|#######5  | 15/20 [03:27<01:07, 13.49s/it]

[100]	valid_0's rmse: 14.9679	valid_1's rmse: 192.038
Early stopping, best iteration is:
[1]	valid_0's rmse: 101.098	valid_1's rmse: 119.4
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474
[LightGBM] [Info] Start training from score 192.148234
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 119.047561:  80%|########  | 16/20 [03:59<01:15, 18.91s/it][32m[I 2021-06-22 05:51:33,868][0m Trial 22 finished with value: 119.53584776716588 and parameters: {'num_leaves': 149}. Best is trial 19 with value: 119.38564377352468.[0m
num_leaves, val_score: 119.047561:  80%|########  | 16/20 [03:59<01:15, 18.91s/it]

[100]	valid_0's rmse: 12.6985	valid_1's rmse: 190.844
Early stopping, best iteration is:
[1]	valid_0's rmse: 100.912	valid_1's rmse: 119.536
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120623
[LightGBM] [Info] Number of data points in the train set: 125626, number of used features: 474


KeyboardInterrupt: 

In [None]:
study.params

In [None]:
opt_params = study.params

In [None]:
collections = ['2021-04-22-US-SJC-1', '2021-04-29-US-SJC-2', '2021-04-28-US-SJC-1']

oof1 = pd.DataFrame()
imp1 = pd.DataFrame()
test_preds = np.zeros(len(test))
n = len(collections)

for collection in collections:
    print('valid : ', collection)
    tr_idx = train[train['collectionName']!=collection].index
    vl_idx = train[train['collectionName']==collection].index
    tr_x, tr_y = train[features].iloc[tr_idx], train[target1].iloc[tr_idx]
    vl_x, vl_y = train[features].iloc[vl_idx], train[target1].iloc[vl_idx]
    tr_data = lgb.Dataset(tr_x, label=tr_y)
    vl_data = lgb.Dataset(vl_x, label=vl_y)
    
    model = lgb.train(opt_params, tr_data, valid_sets=[tr_data, vl_data],
                      num_boost_round=20000, early_stopping_rounds=100, verbose_eval=100)
    vl_pred = model.predict(vl_x, num_iteration=model.best_iteration)
    
    oof_tmp = train.iloc[vl_idx].copy()
    oof_tmp['pred'] = vl_pred
    oof1 = oof1.append(oof_tmp)
    
    imp_tmp = pd.DataFrame()
    imp_tmp['feature'] = model.feature_name()
    imp_tmp['importance'] = model.feature_importance()
    imp_tmp['valid_collection'] = collection
    imp1 = imp1.append(imp_tmp)
    
    #pred = model.predict(test[features], num_iteration=model.best_iteration)
    #test_preds += pred / n
#test['pred'] = test_preds

# 可視化・評価

In [None]:
for phone in oof1['phone'].unique():

    fig, axes = plt.subplots(figsize=(30, 10), nrows=1,sharex=True)
    tmp = oof1[oof1['phone']==phone].copy()
    
    axes.plot(tmp['millisSinceGpsEpoch'], tmp['deg'], label='gt')
    axes.plot(tmp['millisSinceGpsEpoch'], tmp['calc_deg'], label='calc')
    axes.plot(tmp['millisSinceGpsEpoch'], tmp['pred'], label='pred')
    axes.legend(loc='upper right')
    axes.grid(color='g', linestyle=':', linewidth=0.3)

    fig.suptitle(phone, fontsize=16)
    fig.savefig(OUTPUT + '/' + phone + '.png')
    plt.close()