# eda024_collection's day
osrデータとの突合を考えるために各collectionの時刻を確認する

In [21]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)
from math import * 
import scipy.optimize as opt
import multiprocessing
pd.set_option('display.max_columns', 50)
import datetime

In [2]:
import ipynb_path

def get_nb_name():
    """Return the running notebook's file name without the ``.ipynb`` extension.

    The path is obtained from ``ipynb_path.get()``. Only the final path
    component is used, and only a trailing ``.ipynb`` extension is removed, so
    a path without any directory part (or one using the platform's native
    separator) and a name that merely contains ``.ipynb`` somewhere in the
    middle are both handled.

    Returns
    -------
    str
        The notebook name, e.g. ``'eda024'`` for ``'/work/eda024.ipynb'``.
    """
    nb_path = ipynb_path.get()
    file_name = os.path.basename(nb_path)
    stem, ext = os.path.splitext(file_name)
    # Strip the extension only when it really is the notebook suffix; any
    # other file name is returned untouched.
    nb_name = stem if ext == '.ipynb' else file_name
    return nb_name

In [3]:
# directory setting
# Inputs are read from the competition folder; outputs go to a folder named
# after this notebook, created on demand.
INPUT = '../input/google-smartphone-decimeter-challenge'
nb_name = get_nb_name()
OUTPUT = '../output/' + nb_name
os.makedirs(OUTPUT, exist_ok=True)

# utils

In [5]:
def calc_haversine(lat1, lon1, lat2, lon2, radius=6_367_000):
    """Great-circle (haversine) distance between two points on a sphere.

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : scalar or array-like
        Coordinates in decimal degrees. Array-like inputs are combined
        element-wise.
    radius : float, default 6_367_000
        Sphere radius. The result is expressed in the same length unit
        (the default is the value this notebook has always used;
        presumably metres).

    Returns
    -------
    Scalar or array of distances, matching the shape of the inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    dist = 2 * radius * np.arcsin(a ** 0.5)
    return dist

In [6]:
def visualize_trafic(df, center, zoom=9):
    """Display the GPS points in ``df`` on an interactive Plotly map.

    Parameters
    ----------
    df : DataFrame
        Needs ``latDeg`` and ``lngDeg`` columns for the point positions and a
        ``phoneName`` column that selects the colour of each series.
    center : dict
        Map centre, forwarded unchanged to Plotly (the caller in this notebook
        passes ``{"lat": ..., "lon": ...}``).
    zoom : int, default 9
        Initial zoom level, forwarded unchanged to Plotly.

    The figure is shown with ``fig.show()``; nothing is returned.
    """
    scatter_options = dict(
        # (x, y) coordinates of every point
        lat="latDeg",
        lon="lngDeg",
        # one colour per phone
        color="phoneName",
        # NOTE(review): Plotly Express documents `labels` as a dict mapping
        # column names to display names; a plain string is passed here —
        # confirm this is intended.
        labels="phoneName",
        zoom=zoom,
        center=center,
        height=600,
        width=800,
    )
    fig = px.scatter_mapbox(df, **scatter_options)
    fig.update_layout(
        mapbox_style='stamen-terrain',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()
    
def visualize_collection(df, collection):
    """Plot every row of one collection on a map centred on its mean position.

    Parameters
    ----------
    df : DataFrame
        Must contain ``collectionName``, ``latDeg`` and ``lngDeg`` columns
        (plus whatever ``visualize_trafic`` needs).
    collection : str
        Value of ``collectionName`` to select.
    """
    is_selected = df['collectionName'] == collection
    subset = df[is_selected].copy()

    # Centre the map on the average latitude/longitude of the selected rows.
    map_center = {
        "lat": subset['latDeg'].mean(),
        "lon": subset['lngDeg'].mean(),
    }

    visualize_trafic(subset, map_center)

In [7]:
# ground_truth
def get_ground_truth():
    """Read every ``train/*/*/ground_truth.csv`` under ``INPUT`` into one frame.

    Returns
    -------
    DataFrame
        The row-wise concatenation of all matching CSV files, in the order the
        glob yields them.
    """
    input_root = pathlib.Path(INPUT)
    frames = [
        pd.read_csv(csv_path)
        for csv_path in input_root.glob('train/*/*/ground_truth.csv')
    ]
    return pd.concat(frames)

In [8]:
def percentile50(x):
    """Return the 50th percentile of ``x`` as computed by ``np.percentile``."""
    result = np.percentile(x, q=50)
    return result
def percentile95(x):
    """Return the 95th percentile of ``x`` as computed by ``np.percentile``."""
    result = np.percentile(x, q=95)
    return result

In [10]:
def get_data():
    """Load the four CSV inputs used by this notebook from ``INPUT``.

    Returns
    -------
    tuple of DataFrame
        ``(base_train, base_test, sample_sub, ground_truth)`` read from
        ``baseline_locations_train.csv``, ``baseline_locations_test.csv``,
        ``sample_submission.csv`` and ``prep/ground_truth_train.csv``.
    """
    input_prefix = INPUT + '/'
    train_df = pd.read_csv(input_prefix + 'baseline_locations_train.csv')
    test_df = pd.read_csv(input_prefix + 'baseline_locations_test.csv')
    submission_df = pd.read_csv(input_prefix + 'sample_submission.csv')
    truth_df = pd.read_csv(INPUT + '/prep/ground_truth_train.csv')
    return train_df, test_df, submission_df, truth_df

In [14]:
# Load the baseline train/test locations, the sample submission and the
# prepared ground truth in one call.
train, test, sub, gt = get_data()

In [16]:
# Shift GPS-epoch milliseconds onto the Unix-epoch millisecond scale.
# NOTE(review): 315964800000 matches the Unix-ms timestamp of the GPS epoch
# (1980-01-06) and 18000 ms matches an 18 s GPS-UTC leap-second offset — confirm.
GPS_EPOCH_OFFSET_MS = 315964800000
GPS_UTC_OFFSET_MS = 18000
train['utc'] = train['millisSinceGpsEpoch'] + GPS_EPOCH_OFFSET_MS - GPS_UTC_OFFSET_MS
test['utc'] = test['millisSinceGpsEpoch'] + GPS_EPOCH_OFFSET_MS - GPS_UTC_OFFSET_MS

In [25]:
# Turn the millisecond values in `utc` into pandas datetimes.
train = train.assign(timestamp=pd.to_datetime(train['utc'], unit='ms'))
test = test.assign(timestamp=pd.to_datetime(test['utc'], unit='ms'))

In [27]:
# First and last timestamp of every collection, one row per collection.
train_grouped = (
    train
    .groupby(by='collectionName')['timestamp']
    .agg(['min', 'max'])
    .reset_index()
)
test_grouped = (
    test
    .groupby(by='collectionName')['timestamp']
    .agg(['min', 'max'])
    .reset_index()
)

In [30]:
# Remember which split each collection came from.
train_grouped = train_grouped.assign(train_test='train')
test_grouped = test_grouped.assign(train_test='test')

In [31]:
# Stack the train and test summaries row-wise; each frame keeps its own index labels.
result = pd.concat([train_grouped, test_grouped], axis=0)

In [34]:
# Order the collections chronologically by their first timestamp and fix the column order.
result = (
    result
    .sort_values(by='min')
    .loc[:, ['collectionName', 'train_test', 'min', 'max']]
)
result

Unnamed: 0,collectionName,train_test,min,max
0,2020-05-14-US-MTV-1,train,2020-05-14 22:10:45.442,2020-05-14 22:39:53.449
1,2020-05-14-US-MTV-2,train,2020-05-15 00:46:52.442,2020-05-15 01:16:21.442
0,2020-05-15-US-MTV-1,test,2020-05-15 20:12:14.446,2020-05-15 21:10:57.632
2,2020-05-21-US-MTV-1,train,2020-05-21 18:09:35.431,2020-05-21 18:43:29.431
3,2020-05-21-US-MTV-2,train,2020-05-21 21:22:23.444,2020-05-21 21:55:10.434
1,2020-05-28-US-MTV-1,test,2020-05-28 19:07:38.439,2020-05-28 19:48:05.439
2,2020-05-28-US-MTV-2,test,2020-05-28 21:23:21.439,2020-05-28 22:02:00.446
4,2020-05-29-US-MTV-1,train,2020-05-29 22:44:27.440,2020-05-29 23:16:32.440
5,2020-05-29-US-MTV-2,train,2020-05-29 23:27:20.447,2020-05-30 00:00:45.447
6,2020-06-04-US-MTV-1,train,2020-06-04 20:57:55.441,2020-06-04 21:27:26.434


In [35]:
# Persist the time table next to the other outputs of this notebook (no index column).
time_table_path = OUTPUT + '/time_table.csv'
result.to_csv(time_table_path, index=False)