# area_labeling

In [198]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)
from math import * 
import scipy.optimize as opt
import multiprocessing
pd.set_option('display.max_columns', 50)

In [199]:
import ipynb_path

def get_nb_name():
    """Return the running notebook's file name without the ``.ipynb`` suffix.

    The notebook path is obtained from ``ipynb_path.get()``. The directory part
    is removed with ``pathlib`` rather than ``rsplit('/', 1)[1]``, which raised
    ``IndexError`` whenever the path contained no ``'/'`` (a bare file name, or
    a path written with a different separator).

    Returns:
        str: the notebook name, e.g. ``'foo'`` for ``'/some/dir/foo.ipynb'``.
    """
    nb_path = ipynb_path.get()
    # str() keeps this working whether the helper hands back a str or a Path.
    file_name = pathlib.PurePath(str(nb_path)).name
    # Same suffix handling as before, so ordinary names give identical results.
    nb_name = file_name.replace('.ipynb', '')
    return nb_name

In [200]:
# directory setting
# The input root is fixed; the output folder is derived from this notebook's name.
INPUT = '../input/google-smartphone-decimeter-challenge'

nb_name = get_nb_name()
OUTPUT = f'../output/prep/{nb_name}'
os.makedirs(OUTPUT, exist_ok=True)  # no error when the folder already exists

# utils

In [201]:
# ground_truth
def get_ground_truth():
    """Load every ``ground_truth.csv`` found two levels below ``INPUT/train``.

    Files matching ``train/*/*/ground_truth.csv`` are read in sorted path
    order, so the row order of the result is reproducible from run to run
    (``Path.glob`` yields its matches in arbitrary order).

    Returns:
        pandas.DataFrame: all files concatenated. Each file keeps its own row
        index, so index values repeat across files.

    Raises:
        ValueError: if no matching file exists under ``INPUT``.
    """
    gt_files = sorted(pathlib.Path(INPUT).glob('train/*/*/ground_truth.csv'))
    if not gt_files:
        # pd.concat([]) raises the same exception type, but with a message
        # that does not say which directory was searched.
        raise ValueError(f'No ground_truth.csv files found under {INPUT}/train')

    ground_truth = pd.concat([pd.read_csv(gt_file) for gt_file in gt_files])

    return ground_truth

In [202]:
def visualize_trafic(df, center, zoom=9, mapbox_style='stamen-terrain'):
    """Show the points of ``df`` on an interactive map, one colour per phone.

    Args:
        df: frame providing the ``latDeg``, ``lngDeg`` and ``phoneName``
            columns used for position and colour.
        center: initial map centre, passed straight to plotly
            (``visualize_collection`` builds it as ``{"lat": ..., "lon": ...}``).
        zoom: initial zoom level passed to plotly.
        mapbox_style: base-map style applied to the layout. The default is the
            style this function has always used; it is exposed so callers can
            choose another one (e.g. ``'open-street-map'``).
            NOTE(review): confirm the Stamen tiles still load with the
            installed plotly version.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    fig = px.scatter_mapbox(
        df,
        # coordinates of each point
        lat="latDeg",
        lon="lngDeg",
        # one colour series per phone
        color="phoneName",
        # `labels` must be a {column name: display label} dict; the bare
        # string passed previously is not a valid value for this argument.
        labels={"phoneName": "phoneName"},
        zoom=zoom,
        center=center,
        height=600,
        width=800,
    )
    fig.update_layout(
        mapbox_style=mapbox_style,
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()
    
def visualize_collection(df, collection):
    """Plot the rows of one collection, centring the map on their mean position.

    Args:
        df: frame with ``collectionName``, ``latDeg`` and ``lngDeg`` columns
            (plus whatever ``visualize_trafic`` needs).
        collection: value of ``collectionName`` to select.
    """
    in_collection = df['collectionName'] == collection
    target_df = df[in_collection].copy()

    # The map is centred on the average latitude / longitude of the selection.
    center = {
        "lat": target_df['latDeg'].mean(),
        "lon": target_df['lngDeg'].mean(),
    }

    visualize_trafic(target_df, center)

In [213]:
def get_data():
    """Read the four CSV inputs used by this notebook from ``INPUT``.

    Returns:
        tuple: ``(base_train, base_test, sample_sub, ground_truth)`` data
        frames, read from ``baseline_locations_train.csv``,
        ``baseline_locations_test.csv``, ``sample_submission.csv`` and
        ``prep/ground_truth_train.csv`` respectively.
    """
    def _read(relative_path):
        # Every file lives below the same input root.
        return pd.read_csv(INPUT + '/' + relative_path)

    return (
        _read('baseline_locations_train.csv'),
        _read('baseline_locations_test.csv'),
        _read('sample_submission.csv'),
        _read('prep/ground_truth_train.csv'),
    )

In [214]:
# train data labeling
# Hand-assigned area groups, keyed by collection name. Order inside a list is
# irrelevant to the `.isin(...)` lookups that consume these lists.
# NOTE(review): the trailing entries of g1 (and '2021-03-16-US-RWC-2' in g4)
# show up in the test frame below, so the lists appear to mix train and test
# collections — confirm for the remaining groups.
g1 = [
    '2020-05-14-US-MTV-1',
    '2020-05-14-US-MTV-2',
    '2020-05-21-US-MTV-1',
    '2020-05-21-US-MTV-2',
    '2020-05-29-US-MTV-1',
    '2020-05-29-US-MTV-2',
    '2020-06-04-US-MTV-1',
    '2020-06-05-US-MTV-1',
    '2020-06-05-US-MTV-2',
    '2020-06-11-US-MTV-1',
    '2020-07-08-US-MTV-1',
    '2020-07-17-US-MTV-1',
    '2020-07-17-US-MTV-2',
    '2020-08-03-US-MTV-1',
    '2020-08-06-US-MTV-2',
    '2020-09-04-US-SF-1',
    '2020-09-04-US-SF-2',
    '2021-01-04-US-RWC-1',
    '2021-01-04-US-RWC-2',
    '2020-05-15-US-MTV-1',
    '2020-05-28-US-MTV-1',
    '2020-05-28-US-MTV-2',
    '2020-06-04-US-MTV-2',
    '2020-06-10-US-MTV-1',
    '2020-06-10-US-MTV-2',
    '2020-08-03-US-MTV-2',
    '2020-08-13-US-MTV-1',
    '2021-03-16-US-MTV-2',
]

g2 = [
    '2021-01-05-US-SVL-1',
    '2021-01-05-US-SVL-2',
    '2021-04-15-US-MTV-1',
    '2021-03-25-US-PAO-1',
    '2021-04-02-US-SJC-1',
    '2021-04-08-US-MTV-1',
]

g3 = [
    '2021-03-10-US-SVL-1',
    '2021-04-26-US-SVL-1',
    '2021-04-26-US-SVL-2',
]

g4 = [
    '2021-04-28-US-MTV-1',
    '2021-04-29-US-MTV-1',
    '2021-03-16-US-RWC-2',
    '2021-04-21-US-MTV-1',
    '2021-04-28-US-MTV-2',
    '2021-04-29-US-MTV-2',
]

g5 = [
    '2021-04-22-US-SJC-1',
    '2021-04-28-US-SJC-1',
    '2021-04-29-US-SJC-2',
    '2021-04-22-US-SJC-2',
    '2021-04-29-US-SJC-3',
]

In [215]:
# Load the baseline train/test locations, the sample submission and the prepared ground truth.
train, test, sub, gt = get_data()

In [216]:
def _summarise_by_collection(locations):
    """Collapse a location frame to one row per ``collectionName``.

    The result holds the mean, max and min of ``latDeg`` and ``lngDeg`` under
    flat column names (``lat_mean`` ... ``lng_min``).
    """
    stats = ['mean', 'max', 'min']
    summary = (
        locations
        .groupby('collectionName')
        .agg({'latDeg': stats, 'lngDeg': stats})
        .reset_index()
    )
    # Flatten the two-level columns produced by the multi-statistic agg.
    summary.columns = ['collectionName', 'lat_mean', 'lat_max', 'lat_min', 'lng_mean', 'lng_max', 'lng_min']
    return summary


# NOTE: `train` / `test` are replaced by their per-collection summaries, so
# this cell can only be run once per load of the raw frames.
train = _summarise_by_collection(train)
test = _summarise_by_collection(test)

In [217]:
# Hand-assigned group id -> collections belonging to it.
hand_groups = {1: g1, 2: g2, 3: g3, 4: g4, 5: g5}

# For each frame: (frame, fine label column, coarse label column).
# train gets `g` / `g_tmp`; test gets `g_hand` / `g_hand_tmp`.
for frame, fine_col, coarse_col in ((train, 'g', 'g_tmp'), (test, 'g_hand', 'g_hand_tmp')):
    # Fine label: the id of the hand list containing the collection.
    for group_id, collections in hand_groups.items():
        frame.loc[frame['collectionName'].isin(collections), fine_col] = group_id

    # Coarse label: group 1 on its own versus groups 2-5 merged into class 2.
    frame.loc[frame[fine_col].isin([1]), coarse_col] = 1
    frame.loc[frame[fine_col].isin([2, 3, 4, 5]), coarse_col] = 2

# kNN

In [218]:
from sklearn.neighbors import KNeighborsClassifier

In [219]:
# List the columns available on `train` after aggregation and labeling.
train.columns

Index(['collectionName', 'lat_mean', 'lat_max', 'lat_min', 'lng_mean',
       'lng_max', 'lng_min', 'g', 'g_tmp'],
      dtype='object')

In [220]:
# Per-collection position statistics used as kNN inputs.
features = [
    'lat_mean', 'lat_max', 'lat_min',
    'lng_mean', 'lng_max', 'lng_min',
]

In [221]:
# `target` was never defined in the notebook, so this cell raised NameError on
# a fresh kernel. The predictions are stored in `g_tmp`, so the classifier is
# trained on the coarse label built on `train`: 1 = g1, 2 = g2..g5.
target = 'g_tmp'

# 1-nearest-neighbour: each test collection takes the coarse label of the
# train collection whose position statistics are closest.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train[features], train[target])
pred = knn.predict(test[features])
test['g_tmp'] = pred

In [223]:
# Start the final label from the kNN prediction; a later cell overwrites it for rows near a group 3/4/5 centre.
test['g'] = test['g_tmp']

In [225]:
# Display the test frame with the hand labels and the predicted labels side by side.
test

Unnamed: 0,collectionName,lat_mean,lat_max,lat_min,lng_mean,lng_max,lng_min,g_hand,g_hand_tmp,g_tmp,g
0,2020-05-15-US-MTV-1,37.450502,37.633212,37.33283,-122.195567,-122.058927,-122.433247,1.0,1.0,1.0,1.0
1,2020-05-28-US-MTV-1,37.532438,37.634628,37.422308,-122.266066,-122.090293,-122.426898,1.0,1.0,1.0,1.0
2,2020-05-28-US-MTV-2,37.585822,37.778231,37.419663,-122.302672,-122.09011,-122.407232,1.0,1.0,1.0,1.0
3,2020-06-04-US-MTV-2,37.528699,37.655852,37.415596,-122.254494,-122.080118,-122.409153,1.0,1.0,1.0,1.0
4,2020-06-10-US-MTV-1,37.526831,37.634593,37.412827,-122.256377,-122.078198,-122.420161,1.0,1.0,1.0,1.0
5,2020-06-10-US-MTV-2,37.535218,37.655796,37.415444,-122.261622,-122.079942,-122.409124,1.0,1.0,1.0,1.0
6,2020-08-03-US-MTV-2,37.533158,37.63413,37.419678,-122.264104,-122.069896,-122.423899,1.0,1.0,1.0,1.0
7,2020-08-13-US-MTV-1,37.435028,37.52544,37.332913,-122.19602,-122.058889,-122.3551,1.0,1.0,1.0,1.0
8,2021-03-16-US-MTV-2,37.451928,37.498938,37.433239,-122.265406,-122.231752,-122.316337,1.0,1.0,1.0,1.0
9,2021-03-16-US-RWC-2,37.38909,37.395966,37.382667,-122.08947,-122.075752,-122.103254,4.0,2.0,2.0,2.0


In [248]:
# Tolerance on the lat_mean / lng_mean difference, in the same units as those columns.
tor = 0.001

# Relabel a test collection as group 3, 4 or 5 when its mean position lies
# within `tor` of that group's mean position in train (on both axes).
# Groups are visited in ascending order, so a later group wins on overlap.
for group_id in (3, 4, 5):
    centroid = train.loc[train['g'] == group_id, ['lat_mean', 'lng_mean']].mean()

    near_lat = (test['lat_mean'] - centroid['lat_mean']).abs() < tor
    near_lng = (test['lng_mean'] - centroid['lng_mean']).abs() < tor

    test.loc[near_lat & near_lng, 'g'] = group_id

In [250]:
# Compare the hand-assigned group with the final assigned group for each test collection.
test[['collectionName', 'g_hand', 'g']]

Unnamed: 0,collectionName,g_hand,g
0,2020-05-15-US-MTV-1,1.0,1.0
1,2020-05-28-US-MTV-1,1.0,1.0
2,2020-05-28-US-MTV-2,1.0,1.0
3,2020-06-04-US-MTV-2,1.0,1.0
4,2020-06-10-US-MTV-1,1.0,1.0
5,2020-06-10-US-MTV-2,1.0,1.0
6,2020-08-03-US-MTV-2,1.0,1.0
7,2020-08-13-US-MTV-1,1.0,1.0
8,2021-03-16-US-MTV-2,1.0,1.0
9,2021-03-16-US-RWC-2,4.0,4.0


In [251]:
# Stack the train and test labels into one collection -> group table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# row index of each frame is kept unchanged, exactly as append did.
output_df = pd.concat([
    train[['collectionName', 'g']],
    test[['collectionName', 'g']],
])

In [253]:
# Write the collection -> group table to this notebook's output folder.
# NOTE(review): the frame's index is written as well (to_csv default) — confirm downstream readers expect that column.
output_df.to_csv(f'{OUTPUT}/result.csv')

In [266]:
# Sanity check: collections that ended up in group 5, sorted for comparison with the hand list.
sorted(list(output_df[output_df['g']==5]['collectionName']))

['2021-04-22-US-SJC-1',
 '2021-04-22-US-SJC-2',
 '2021-04-28-US-SJC-1',
 '2021-04-29-US-SJC-2',
 '2021-04-29-US-SJC-3']

In [267]:
# Hand-labelled group 5, sorted the same way for a side-by-side comparison.
sorted(g5)

['2021-04-22-US-SJC-1',
 '2021-04-22-US-SJC-2',
 '2021-04-28-US-SJC-1',
 '2021-04-29-US-SJC-2',
 '2021-04-29-US-SJC-3']