# eda022
rawの確認

In [1]:
import os
import pandas as pd
import numpy as np
import ipynb_path
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
import plotly
import plotly.express as px
%matplotlib inline
pd.set_option('display.max_rows', 200)
from math import * 
import warnings
warnings.simplefilter('ignore')

In [2]:
def get_nb_name():
    """Return the file name of the running notebook without its ``.ipynb`` suffix.

    The notebook path is obtained from ``ipynb_path.get()``.

    Returns:
        str: the last path component with a trailing ``.ipynb`` removed.
    """
    nb_path = ipynb_path.get()
    # basename copes with a bare file name (no separator) and with the
    # platform's own separators, where rsplit('/', 1)[1] raised IndexError.
    nb_file = os.path.basename(nb_path)
    # Strip the extension only when it is the suffix, not anywhere in the name.
    if nb_file.endswith('.ipynb'):
        nb_file = nb_file[:-len('.ipynb')]
    return nb_file

In [3]:
def visualize_trafic(df, center, zoom=9, height=1000, width=2000,
                     mapbox_style='stamen-terrain'):
    """Show the points of ``df`` on an interactive map, coloured by phone.

    Args:
        df: DataFrame with ``latDeg``, ``lngDeg`` and ``phoneName`` columns.
        center: dict with ``"lat"`` and ``"lon"`` keys giving the map centre.
        zoom: initial map zoom level.
        height: figure height in pixels.
        width: figure width in pixels.
        mapbox_style: base-map style passed to ``update_layout``.
            NOTE(review): the Stamen-hosted styles may now need a Stadia Maps
            token - confirm against the installed plotly version.

    The former ``labels="phoneName"`` argument was dropped: plotly expects a
    dict mapping column names to display labels there, not a string.
    """
    fig = px.scatter_mapbox(
        df,
        # (x, y) coordinates of each point
        lat="latDeg",
        lon="lngDeg",
        # one colour series per phone
        color="phoneName",
        zoom=zoom,
        center=center,
        height=height,
        width=width,
    )
    fig.update_layout(
        mapbox_style=mapbox_style,
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()

In [4]:
def visualize_collection(df, collection):
    """Map the rows of one collection, centred on their mean position.

    Args:
        df: DataFrame with ``collectionName``, ``latDeg`` and ``lngDeg`` columns
            (plus whatever ``visualize_trafic`` reads).
        collection: value of ``collectionName`` to select.
    """
    subset = df.loc[df['collectionName'] == collection].copy()
    # Centre the map on the mean latitude / longitude of the selected rows.
    map_center = {
        "lat": subset['latDeg'].mean(),
        "lon": subset['lngDeg'].mean(),
    }
    visualize_trafic(subset, map_center)

In [5]:
def calc_haversine(lat1, lon1, lat2, lon2, radius=6_367_000):
    """Great-circle distance between two points on a sphere (haversine formula).

    Args:
        lat1, lon1: latitude / longitude of the first point(s), decimal degrees.
        lat2, lon2: latitude / longitude of the second point(s), decimal degrees.
            Scalars and array-likes are accepted and broadcast by numpy.
        radius: sphere radius; the result is in the same unit. The default
            6 367 000 is an Earth radius in metres.

    Returns:
        Distance(s) along the sphere, scalar or array matching the inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make arcsin return NaN; cap it at the mathematical maximum.
    a = np.minimum(a, 1.0)
    dist = 2 * radius * np.arcsin(a**0.5)
    return dist

In [6]:
# Directory setup: inputs come from the competition folder, outputs go to a
# folder named after this notebook (created if it does not exist yet).
INPUT = '../input/google-smartphone-decimeter-challenge'
nb_name = get_nb_name()
OUTPUT = f'../output/{nb_name}'
os.makedirs(OUTPUT, exist_ok=True)

# データ読み込み

In [10]:
# Load the raw GNSS table for the training set.
train_raw = pd.read_csv(f'{INPUT}/prep/gnss/train/Raw.csv')

In [15]:
# Write the first 1,000,000 rows to check.csv for manual inspection.
train_raw.iloc[:1_000_000].to_csv('check.csv')

In [8]:
# Display train_raw as this cell's output.
# NOTE(review): the saved output is a path string, not a DataFrame, so it
# appears to predate the read_csv cell - re-run to refresh it.
train_raw

'../input/google-smartphone-decimeter-challenge/prep/gnss/train/raw.csv'

In [7]:
# Load the baseline positions and the ground truth, then attach the
# ground-truth columns to every baseline row with a matching key.
train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
ground_truth = pd.read_csv(INPUT + '/prep/ground_truth_train.csv')

# Rename the ground-truth coordinates so they do not collide with the
# baseline's own latDeg / lngDeg after the merge.
ground_truth = ground_truth.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
gt = ground_truth[['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg_gt', 'lngDeg_gt', 'speedMps', 'courseDegree']].copy()
# Merge the trimmed `gt` frame. The full `ground_truth` was merged before,
# leaving `gt` unused and pulling in all of its other columns
# (suffixing any that overlap with the baseline).
train = train.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')

# 連続静止範囲をグルーピング

In [9]:
# Flag rows whose ground-truth speed is exactly zero: 1 = stationary, 0 = otherwise.
train['speed0'] = (train['speedMps'] == 0).astype('int64')

In [11]:
# Give every run of consecutive stationary rows its own id in train['g'].
# A run starts on a stationary row whose previous row was moving, or on the
# first row of a phone (so a run never spans two phones).
is_stopped = train['speed0'] == 1
after_moving = train['speed0'].shift(1) == 0
phone_first_row = ~train['phone'].duplicated()
run_start = is_stopped & (after_moving | phone_first_row)

# Run id = number of run starts seen so far (1, 2, 3, ...); moving rows get NaN.
# 'g' is rebuilt from scratch on every execution, so re-running this cell is
# safe; the previous version cumsum-ed over an already filled column.
train['g'] = run_start.cumsum().where(is_stopped).astype('float64')

In [14]:
# Grab the first stationary run into `tmp` for ad-hoc inspection.
# dropna() skips the NaN id of moving rows; comparing against NaN matches
# nothing and would leave `tmp` empty.
for g in train['g'].dropna().unique():
    tmp = train[train['g']==g].copy()
    break

In [39]:
# For every stationary run: compare the mean baseline position with the mean
# ground-truth position, print the haversine error and save a scatter plot.
# groupby splits the frame once (instead of re-filtering it per run), drops
# the NaN ids of moving rows and visits the run ids in ascending order.
for g, tmp in train.groupby('g'):

    lat_mean = tmp['latDeg'].mean()
    lng_mean = tmp['lngDeg'].mean()
    lat_gt_mean = tmp['latDeg_gt'].mean()
    lng_gt_mean = tmp['lngDeg_gt'].mean()
    err = calc_haversine(lat_mean, lng_mean, lat_gt_mean, lng_gt_mean)
    g_str = str(int(g)).zfill(3)
    phone = tmp['phone'].iloc[0]
    name = phone + '_' + g_str

    print(name, err)

    fig, axes = plt.subplots(figsize=(10, 10), nrows=1)
    axes.scatter(tmp['latDeg'], tmp['lngDeg'], label='baseline')
    axes.scatter(lat_mean, lng_mean, label='base_mean')
    axes.scatter(tmp['latDeg_gt'], tmp['lngDeg_gt'], label='gt')
    # Latitude is on the x axis and longitude on the y axis; label them so
    # the saved figure can be read on its own.
    axes.set_xlabel('latDeg')
    axes.set_ylabel('lngDeg')
    axes.legend(loc='upper right')
    axes.grid(color='g', linestyle=':', linewidth=0.3)
    fig.suptitle(name + '  |  ERR : ' + str(err), fontsize=16)
    fig.savefig(OUTPUT + '/' + name + '.png')
    # Close this specific figure so figures do not pile up in memory.
    plt.close(fig)

2020-05-14-US-MTV-1_Pixel4_001 0.7968259335360792
2020-05-14-US-MTV-1_Pixel4_002 0.315299240249682
2020-05-14-US-MTV-1_Pixel4_003 0.7204820022640503
2020-05-14-US-MTV-1_Pixel4XLModded_004 0.36342924636565954
2020-05-14-US-MTV-1_Pixel4XLModded_005 1.498833339996636
2020-05-14-US-MTV-1_Pixel4XLModded_006 0.9381591065624958
2020-05-14-US-MTV-2_Pixel4_007 0.7618527757792031
2020-05-14-US-MTV-2_Pixel4_008 2.0963383058693688
2020-05-14-US-MTV-2_Pixel4_009 2.8391696035488008
2020-05-14-US-MTV-2_Pixel4XLModded_010 2.067088185039384
2020-05-21-US-MTV-1_Pixel4_011 3.1418636477494926
2020-05-21-US-MTV-1_Pixel4_012 2.8969398392927634
2020-05-21-US-MTV-1_Pixel4_013 3.708995696038802
2020-05-21-US-MTV-1_Pixel4_014 1.8048587511202236
2020-05-21-US-MTV-2_Pixel4_015 1.9224503743351888
2020-05-21-US-MTV-2_Pixel4_016 1.8403367678304479
2020-05-21-US-MTV-2_Pixel4_017 3.7061000257087477
2020-05-21-US-MTV-2_Pixel4_018 3.35881222314819
2020-05-21-US-MTV-2_Pixel4_019 7.879817394965593
2020-05-21-US-MTV-2_Pixe

In [21]:
# Display the current value of `err`, which is assigned inside the loop above.
# NOTE(review): the saved output equals the first value printed by that loop,
# so it looks stale relative to the loop's last iteration - re-run to confirm.
err

0.7968259335360792