# eda015
gnss_logの特徴量検討(cumsum)

In [1]:
import os
import pandas as pd
import numpy as np
import ipynb_path
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
import plotly
import plotly.express as px
%matplotlib inline
pd.set_option('display.max_rows', 200)

In [2]:
def get_nb_name():
    """Return the running notebook's file name without the ``.ipynb`` suffix.

    The notebook path is obtained from ``ipynb_path.get()``.

    Only a trailing ``.ipynb`` extension is stripped, so a name that merely
    contains the text ``.ipynb`` somewhere in the middle is left intact, and
    a path without any directory component is handled instead of raising
    ``IndexError``.

    Returns
    -------
    str
        The notebook's base name (e.g. ``'eda015'`` for ``.../eda015.ipynb``).
    """
    nb_path = ipynb_path.get()
    nb_file = os.path.basename(nb_path)
    stem, ext = os.path.splitext(nb_file)
    # Keep the full file name when the extension is not '.ipynb'.
    return stem if ext == '.ipynb' else nb_file

In [3]:
def visualize_trafic(df, center, zoom=9):
    """Show an interactive Mapbox scatter plot of positions, coloured by phone.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``latDeg``, ``lngDeg`` and ``phoneName``.
    center : dict
        Map centre passed straight to plotly; the caller in this notebook
        builds it as ``{"lat": ..., "lon": ...}``.
    zoom : int, default 9
        Initial zoom level passed to plotly.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    fig = px.scatter_mapbox(
        df,
        # (x, y) coordinates of each point
        lat="latDeg",
        lon="lngDeg",
        # one colour series per phone
        color="phoneName",
        # NOTE: the former `labels="phoneName"` argument was removed. plotly
        # express expects `labels` to be a dict mapping column names to
        # display names, so a bare string was not a valid value.
        zoom=zoom,
        center=center,
        height=1000,
        width=2000,
    )
    fig.update_layout(
        mapbox_style='stamen-terrain',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()

In [4]:
def visualize_collection(df, collection):
    """Plot every row of one collection, centring the map on its mean position.

    Rows whose ``collectionName`` equals ``collection`` are selected, the
    mean of their ``latDeg`` / ``lngDeg`` becomes the map centre, and the
    selection is handed to ``visualize_trafic`` with its default zoom.
    """
    rows = df.loc[df['collectionName'] == collection].copy()
    map_center = {
        "lat": rows['latDeg'].mean(),
        "lon": rows['lngDeg'].mean(),
    }
    visualize_trafic(rows, map_center)

In [5]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine great-circle distance between two points on a sphere.

    All four inputs are array-like (or scalars) in decimal degrees and are
    combined element-wise. The result is expressed in the same length unit
    as the sphere radius used below (6,367,000 - presumably metres).
    """
    RADIUS = 6_367_000

    # Convert every coordinate from degrees to radians.
    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    sin_half_dphi = np.sin((phi2 - phi1) / 2)
    sin_half_dlam = np.sin((lam2 - lam1) / 2)

    # Haversine term: squared half-chord length between the two points.
    a = sin_half_dphi**2 + np.cos(phi1) * np.cos(phi2) * sin_half_dlam**2
    return 2 * RADIUS * np.arcsin(a**0.5)

In [6]:
# Directory settings: inputs are read from the competition data folder and
# outputs are written to a folder named after this notebook.
INPUT = '../input/google-smartphone-decimeter-challenge'
nb_name = get_nb_name()
OUTPUT = f'../output/{nb_name}'
os.makedirs(OUTPUT, exist_ok=True)

## 異常値のクリッピングを加えた特徴量生成

In [7]:
# Load the three pre-processed sensor logs (accelerometer, gyroscope,
# magnetometer) for the training set.
accel_train, gyro_train, mag_train = (
    pd.read_csv(f'{INPUT}/prep/gnss/train/{sensor_file}.csv')
    for sensor_file in ('UncalAccel', 'UncalGyro', 'UncalMag')
)
train = pd.read_csv(f'{INPUT}/baseline_locations_train.csv')
ground_truth = pd.read_csv(f'{INPUT}/prep/ground_truth_train.csv')

# Copy the speed onto the baseline frame. The assignment aligns on the row
# index only. NOTE(review): this assumes both files list the same rows in
# the same order - confirm against the preprocessing step.
train['speedMps'] = ground_truth['speedMps']

In [8]:
# Milliseconds between the Unix epoch (1970-01-01) and the GPS epoch (1980-01-06).
_UNIX_TO_GPS_EPOCH_MS = 315964800000
# Offset added when converting UTC to GPS time (18 s, in milliseconds).
_GPS_UTC_OFFSET_MS = 18000


def _per_second_sensor_stats(sensor, value_cols):
    """Clip one sensor log per phone and aggregate it to per-second mean/std.

    ``sensor`` is modified in place (as the original implementation did): it
    gains the columns ``phone``, ``millisSinceGpsEpoch`` and
    ``secondSinceGpsEpoch``, and ``value_cols`` are overwritten with their
    clipped values.

    Returns a new frame with one row per (phone, second) and flat columns
    ``phone``, ``secondSinceGpsEpoch``, then ``<col>_mean`` / ``<col>_std``
    for each entry of ``value_cols`` in order.
    """
    sensor['phone'] = sensor['collectionName'] + '_' + sensor['phoneName']

    # utc -> gps
    sensor['millisSinceGpsEpoch'] = (
        sensor['utcTimeMillis'] - _UNIX_TO_GPS_EPOCH_MS + _GPS_UTC_OFFSET_MS
    )

    # resampling key: whole seconds since the GPS epoch
    sensor['secondSinceGpsEpoch'] = sensor['millisSinceGpsEpoch'] // 1000

    # Clip outliers per phone to the [0.1 %, 99.9 %] quantile range.
    # Columns are selected with a list: selecting with a tuple of keys
    # (groupby(...)['a', 'b']) is no longer supported by recent pandas.
    sensor[value_cols] = sensor.groupby('phone')[value_cols].transform(
        lambda x: x.clip(x.quantile(0.001), x.quantile(0.999))
    )

    stats = (
        sensor.groupby(['phone', 'secondSinceGpsEpoch'])[value_cols]
        .agg(['mean', 'std'])
        .reset_index()
    )
    # Flatten the (column, statistic) MultiIndex; the order matches agg():
    # for every value column first its mean, then its std.
    stats.columns = ['phone', 'secondSinceGpsEpoch'] + [
        f'{col}_{stat}' for col in value_cols for stat in ('mean', 'std')
    ]
    return stats


def add_sensor_features(df, accel, gyro, mag):
    """Join per-second accelerometer/gyroscope/magnetometer statistics onto ``df``.

    Steps
    -----
    1. Build the ``phone`` key (``collectionName + '_' + phoneName``) and a
       whole-second timestamp on ``df`` and on every sensor frame.
    2. Per phone, clip each raw sensor axis to its 0.1 % - 99.9 % quantiles.
    3. Aggregate each sensor to mean and std per (phone, second) and
       left-merge the result onto ``df``.
    4. For the accelerometer and gyroscope means, subtract the median taken
       over the whole merged frame (all phones together) and add a per-phone
       cumulative sum in ``<col>_cumcum``. The ``_cumcum`` suffix is kept
       as-is because it is the column name consumers already see.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``collectionName``, ``phoneName`` and ``millisSinceGpsEpoch``.
    accel, gyro, mag : pandas.DataFrame
        Raw sensor logs with ``collectionName``, ``phoneName``,
        ``utcTimeMillis`` and the three ``Uncal*`` axis columns.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus 18 ``*_mean`` / ``*_std`` columns and six
        ``*_mean_cumcum`` columns, without the helper ``secondSinceGpsEpoch``.

    Notes
    -----
    The inputs are modified in place: ``df`` gains ``phone`` and
    ``secondSinceGpsEpoch``; the sensor frames gain key/time columns and
    have their axis columns clipped.
    """
    accel_cols = ['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2']
    gyro_cols = ['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec']
    mag_cols = ['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT']
    merge_keys = ['phone', 'secondSinceGpsEpoch']

    # add the phone key and the resampling key to the target frame
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    df['secondSinceGpsEpoch'] = df['millisSinceGpsEpoch'] // 1000

    for sensor, value_cols in ((accel, accel_cols), (gyro, gyro_cols), (mag, mag_cols)):
        stats = _per_second_sensor_stats(sensor, value_cols)
        df = df.merge(stats, on=merge_keys, how='left')

    # Centre the accel/gyro means and accumulate them per phone.
    for c in [f'{col}_mean' for col in accel_cols + gyro_cols]:
        df[c] = df[c] - df[c].median()
        df[c + '_cumcum'] = df.groupby('phone')[c].cumsum()

    return df.drop(columns=['secondSinceGpsEpoch'])

In [9]:
# Attach the per-second accel/gyro/mag statistics and the cumulative-sum
# features to `train`.
# NOTE: this rebinds `train` to the function's result, so re-running this cell
# without first re-running the loading cell merges the sensor statistics onto
# an already-augmented frame.
train = add_sensor_features(train, accel_train, gyro_train, mag_train)



In [10]:
def viz(df, cols, output_dir):
    """Save one stacked time-series figure per phone.

    For every distinct value of ``df['phone']`` a figure with one subplot
    per entry of ``cols`` (sharing the x axis) is drawn against
    ``millisSinceGpsEpoch`` and written to ``<output_dir>/<phone>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``phone``, ``millisSinceGpsEpoch`` and every column in ``cols``.
    cols : sequence of str
        Columns to plot, one subplot each (top to bottom).
    output_dir : str
        Target directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    cols_n = len(cols)

    for phone in df['phone'].unique():
        df_tmp = df[df['phone'] == phone]

        # squeeze=False always yields a 2-D array of Axes, so a single-column
        # request no longer fails on `axes[i]` with a bare Axes object.
        fig, axes = plt.subplots(
            figsize=(5 * cols_n, 20), nrows=cols_n, sharex=True, squeeze=False
        )
        for ax, c in zip(axes[:, 0], cols):
            ax.plot(df_tmp['millisSinceGpsEpoch'], df_tmp[c], label=c)
            ax.legend(loc='upper right')
            ax.grid(color='g', linestyle=':', linewidth=0.3)
        fig.suptitle(phone, fontsize=16)
        fig.savefig(output_dir + '/' + phone + '.png')
        # Close this specific figure so figures do not pile up in memory.
        plt.close(fig)

In [11]:
# List every column of the feature frame, one name per line.
print(*train.columns, sep='\n')

collectionName
phoneName
millisSinceGpsEpoch
latDeg
lngDeg
heightAboveWgs84EllipsoidM
phone
speedMps
UncalAccelXMps2_mean
UncalAccelXMps2_std
UncalAccelYMps2_mean
UncalAccelYMps2_std
UncalAccelZMps2_mean
UncalAccelZMps2_std
UncalGyroXRadPerSec_mean
UncalGyroXRadPerSec_std
UncalGyroYRadPerSec_mean
UncalGyroYRadPerSec_std
UncalGyroZRadPerSec_mean
UncalGyroZRadPerSec_std
UncalMagXMicroT_mean
UncalMagXMicroT_std
UncalMagYMicroT_mean
UncalMagYMicroT_std
UncalMagZMicroT_mean
UncalMagZMicroT_std
UncalAccelXMps2_mean_cumcum
UncalAccelYMps2_mean_cumcum
UncalAccelZMps2_mean_cumcum
UncalGyroXRadPerSec_mean_cumcum
UncalGyroYRadPerSec_mean_cumcum
UncalGyroZRadPerSec_mean_cumcum


In [12]:
# Inspect the per-phone cumulative sum of the centred X-axis acceleration mean
# ('_cumcum' is the suffix assigned by add_sensor_features).
train['UncalAccelXMps2_mean_cumcum']

0           0.488407
1           0.977332
2           1.466586
3           1.957340
4           2.451204
             ...    
131337   -607.846659
131338   -609.132258
131339   -610.419149
131340   -611.704795
131341   -612.994943
Name: UncalAccelXMps2_mean_cumcum, Length: 131342, dtype: float64

In [13]:
# Per phone, plot speed next to each acceleration axis (centred mean and its
# cumulative sum) and save the figures under the notebook's output folder.
accel_plot_cols = [
    'speedMps',
    'UncalAccelXMps2_mean',
    'UncalAccelXMps2_mean_cumcum',
    'UncalAccelYMps2_mean',
    'UncalAccelYMps2_mean_cumcum',
    'UncalAccelZMps2_mean',
    'UncalAccelZMps2_mean_cumcum',
]
accel_plot_dir = OUTPUT + '/gnss_ts/train/accel_clip'
viz(train, cols=accel_plot_cols, output_dir=accel_plot_dir)