# baseline_g1_v001
rawPrUncMでのフィルタリング  
キャリアスムージング  
Mi8を除外できていなかったので修正

In [1]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import itertools
import lightgbm as lgb
from optuna.integration import lightgbm as optuna_lgb
import simdkalman
import optuna
import pyproj
from pyproj import Proj, transform
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, accuracy_score
pd.set_option('display.max_rows', 100)
from math import * 
import scipy.optimize as opt
import multiprocessing
pd.set_option('display.max_columns', 50)

In [2]:
g1 = ['2020-05-14-US-MTV-1', '2020-05-14-US-MTV-2', '2020-05-21-US-MTV-1', '2020-05-21-US-MTV-2',
      '2020-05-29-US-MTV-1', '2020-05-29-US-MTV-2', '2020-06-04-US-MTV-1', '2020-06-05-US-MTV-1',
      '2020-06-05-US-MTV-2', '2020-06-11-US-MTV-1', '2020-07-08-US-MTV-1', '2020-07-17-US-MTV-1',
      '2020-07-17-US-MTV-2', '2020-08-03-US-MTV-1', '2020-08-06-US-MTV-2', '2020-09-04-US-SF-1',
      '2020-09-04-US-SF-2',  '2021-01-04-US-RWC-1', '2021-01-04-US-RWC-2',
      '2020-05-15-US-MTV-1', '2020-05-28-US-MTV-1', '2020-05-28-US-MTV-2', '2020-06-04-US-MTV-2',
      '2020-06-10-US-MTV-1', '2020-06-10-US-MTV-2', '2020-08-03-US-MTV-2', '2020-08-13-US-MTV-1',
      '2021-03-16-US-MTV-2']

g2 = ['2021-01-05-US-SVL-1', '2021-01-05-US-SVL-2', '2021-04-15-US-MTV-1', 
      '2021-03-25-US-PAO-1', '2021-04-02-US-SJC-1', '2021-04-08-US-MTV-1']

g3 = ['2021-03-10-US-SVL-1', '2021-04-26-US-SVL-1', '2021-04-26-US-SVL-2']

g4 = ['2021-04-28-US-MTV-1', '2021-04-29-US-MTV-1', 
      '2021-03-16-US-RWC-2', '2021-04-21-US-MTV-1', '2021-04-28-US-MTV-2', '2021-04-29-US-MTV-2']

g5 = ['2021-04-22-US-SJC-1', '2021-04-28-US-SJC-1', '2021-04-29-US-SJC-2', 
      '2021-04-22-US-SJC-2', '2021-04-29-US-SJC-3']

In [3]:
target = g1

In [4]:
elev_deg = 9
used_in_fix = 1
rawpruncm = 46
csr = 0.6

In [5]:
import ipynb_path

def get_nb_name():
    nb_path = ipynb_path.get()
    nb_name = nb_path.rsplit('/',1)[1].replace('.ipynb','')
    return nb_name

In [6]:
# directory setting
nb_name = get_nb_name()
INPUT = '../input/google-smartphone-decimeter-challenge'
OUTPUT = '../output/prep/' + nb_name
os.makedirs(OUTPUT, exist_ok=True)

# utils

In [7]:
def get_train_score(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score

In [8]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

In [9]:
def visualize_trafic(df, center, zoom=9):
    fig = px.scatter_mapbox(df,
                            
                            # Here, plotly gets, (x,y) coordinates
                            lat="latDeg",
                            lon="lngDeg",
                            
                            #Here, plotly detects color of series
                            color="phoneName",
                            labels="phoneName",
                            
                            zoom=zoom,
                            center=center,
                            height=600,
                            width=800)
    fig.update_layout(mapbox_style='stamen-terrain')
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_layout(title_text="GPS trafic")
    fig.show()
    
def visualize_collection(df, collection):
    target_df = df[df['collectionName']==collection].copy()
    lat_center = target_df['latDeg'].mean()
    lng_center = target_df['lngDeg'].mean()
    center = {"lat":lat_center, "lon":lng_center}
    
    visualize_trafic(target_df, center)

In [10]:
# ground_truth
def get_ground_truth():
    p = pathlib.Path(INPUT)
    gt_files = list(p.glob('train/*/*/ground_truth.csv'))

    gts = []
    for gt_file in gt_files:
        gts.append(pd.read_csv(gt_file))
    ground_truth = pd.concat(gts)

    return ground_truth

In [11]:
def percentile50(x):
    return np.percentile(x, 50)
def percentile95(x):
    return np.percentile(x, 95)

In [12]:
class train_result:
    def __init__(self, df):
        self.df = df
        self.gt = get_ground_truth()
        self.bl = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
        
        self.gt = self.gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
        self.df = self.df.merge(self.gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
        self.df['phone'] = self.df['collectionName'] + '_' + self.df['phoneName']
        self.df['err'] =  calc_haversine(self.df['latDeg_gt'], self.df['lngDeg_gt'], self.df['latDeg'], self.df['lngDeg'])
        
        self.phone_res = self.calc_err('phone')
        self.clc_res = self.calc_err('collectionName')
        self.phonename_res = self.calc_err('phoneName')
        
    def calc_err(self, by):
        res = self.df.groupby(by)['err'].agg([percentile50, percentile95])
        res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2
        return res
    
    @property
    def score(self):
        return self.phone_res['p50_p90_mean'].mean()
    @property
    def raw_data(self):
        return self.df
    @property
    def err(self):
        return self.phone_res
    @property
    def collection_err(self):
        return self.clc_res
    @property
    def phonename_err(self):
        return self.phonename_res
    
    def viz_map(self, collection, show_gt=True, show_bl=True):
        tmp = self.df[self.df['collectionName']==collection][['collectionName', 'phoneName', 'latDeg', 'lngDeg']]
        tmp2 = self.df[self.df['collectionName']==collection][['collectionName', 'phoneName', 'latDeg_gt', 'lngDeg_gt']]
        tmp2 = tmp2.rename(columns={'latDeg_gt':'latDeg', 'lngDeg_gt':'lngDeg'})
        tmp2['phoneName'] = tmp2['phoneName'] + '_GT'
        tmp3 = self.bl[self.bl['collectionName']==collection][['collectionName', 'phoneName', 'latDeg', 'lngDeg']]
        tmp3['phoneName'] = tmp3['phoneName'] + '_BL'
        
        if show_gt:
            tmp = tmp.append(tmp2)
        if show_bl:
            tmp = tmp.append(tmp3)
        visualize_collection(tmp, collection)

In [13]:
def get_data():
    base_train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
    base_test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')
    sample_sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv')
    ground_truth = pd.read_csv(INPUT + '/prep/ground_truth_train.csv')
    return base_train, base_test, sample_sub, ground_truth

In [14]:
def ecef2lla(x, y, z):
    # x, y and z are scalars or vectors in meters
    x = np.array([x]).reshape(np.array([x]).shape[-1], 1)
    y = np.array([y]).reshape(np.array([y]).shape[-1], 1)
    z = np.array([z]).reshape(np.array([z]).shape[-1], 1)

    a=6378137
    a_sq=a**2
    e = 8.181919084261345e-2
    e_sq = 6.69437999014e-3

    f = 1/298.257223563
    b = a*(1-f)

    # calculations:
    r = np.sqrt(x**2 + y**2)
    ep_sq  = (a**2-b**2)/b**2
    ee = (a**2-b**2)
    f = (54*b**2)*(z**2)
    g = r**2 + (1 - e_sq)*(z**2) - e_sq*ee*2
    c = (e_sq**2)*f*r**2/(g**3)
    s = (1 + c + np.sqrt(c**2 + 2*c))**(1/3.)
    p = f/(3.*(g**2)*(s + (1./s) + 1)**2)
    q = np.sqrt(1 + 2*p*e_sq**2)
    r_0 = -(p*e_sq*r)/(1+q) + np.sqrt(0.5*(a**2)*(1+(1./q)) - p*(z**2)*(1-e_sq)/(q*(1+q)) - 0.5*p*(r**2))
    u = np.sqrt((r - e_sq*r_0)**2 + z**2)
    v = np.sqrt((r - e_sq*r_0)**2 + (1 - e_sq)*z**2)
    z_0 = (b**2)*z/(a*v)
    h = u*(1 - b**2/(a*v))
    phi = np.arctan((z + ep_sq*z_0)/r)
    lambd = np.arctan2(y, x)

    return phi*180/np.pi, lambd*180/np.pi, h

# baselineの再作成

In [15]:
def prepare_calc_baseline(df):
    light_speed = 299_792_458
    omega_e = 7.2921151467e-5
    
    df['sat'] = df['svid'].astype('str') + '_' + df['signalType']
    df['isrbM'] = df.groupby('sat')['isrbM'].transform('median')
    
    # Corrected pseudorange according to data instructions
    df['correctedPrM'] = df['rawPrM'] + \
                         df['satClkBiasM'] - \
                         df['isrbM'] - \
                         df['ionoDelayM'] - \
                         df['tropoDelayM']
    
    # Time it took for signal to travel
    df['transmissionTimeSeconds'] = df['correctedPrM'] / light_speed
    
    # Compute true sat positions at arrival time
    df['xSatPosMRotated'] = \
        np.cos(omega_e * df['transmissionTimeSeconds']) * df['xSatPosM'] \
        + np.sin(omega_e * df['transmissionTimeSeconds']) * df['ySatPosM']

    df['ySatPosMRotated'] = \
        - np.sin(omega_e * df['transmissionTimeSeconds']) * df['xSatPosM'] \
        + np.cos(omega_e * df['transmissionTimeSeconds']) * df['ySatPosM']

    df['zSatPosMRotated'] = df['zSatPosM']
    
    # Uncertainty weight for the WLS method
    df['uncertaintyWeight'] = 1 / df['rawPrUncM']
    
    df = carrier_smoothing(df, csr)
    
    return df

In [16]:
def calc_baseline_point(df):

    def distance(sat_pos, x):
        sat_pos_diff = sat_pos.copy(deep=True)

        sat_pos_diff['xSatPosMRotated'] = sat_pos_diff['xSatPosMRotated'] - x[0]
        sat_pos_diff['ySatPosMRotated'] = sat_pos_diff['ySatPosMRotated'] - x[1]
        sat_pos_diff['zSatPosMRotated'] = sat_pos_diff['zSatPosMRotated'] - x[2]

        sat_pos_diff['d'] = sat_pos_diff['uncertaintyWeight'] * \
                            (np.sqrt((sat_pos_diff['xSatPosMRotated']**2 + sat_pos_diff['ySatPosMRotated']**2 + sat_pos_diff['zSatPosMRotated']**2)) + \
                             x[3] - sat_pos_diff['correctedPrM'])

        return sat_pos_diff['d']

    def distance_fixed_satpos(x):
        return distance(df[['xSatPosMRotated', 'ySatPosMRotated', 'zSatPosMRotated', 'correctedPrM', 'uncertaintyWeight']], x)
    
    x0 = [0,0,0,0]
    opt_res = opt.least_squares(distance_fixed_satpos, x0)
    # Optimiser yields a position in the ECEF coordinates
    opt_res_pos = opt_res.x
    
    # ECEF position to lat/long
    wls_estimated_pos = ecef2lla(*opt_res_pos[:3])
    wls_estimated_pos = np.squeeze(wls_estimated_pos)
    
    return wls_estimated_pos[0], wls_estimated_pos[1]

In [17]:
def get_derived_data(train_test, collection, phonename):
    derived = pd.read_csv(INPUT + f'/{train_test}/{collection}/{phonename}/{phonename}_derived.csv')
    raw = pd.read_csv(INPUT + f'/prep/gnss/{train_test}/{collection}/{phonename}/Raw.csv')
    status = pd.read_csv(INPUT + f'/prep/gnss/{train_test}/{collection}/{phonename}/Status.csv')
    
    # Assume we've loaded a dataframe from _GnssLog.txt for only lines beginning with "Raw", we denote this df_raw. Next, assume we've loaded a dataframe from _derived.csv. We denote this df_derived.

    # Create a new column in df_raw that corresponds to df_derived['MillisSinceGpsEpoch']
    raw['millisSinceGpsEpoch'] = np.floor( (raw['TimeNanos'] - raw['FullBiasNanos']) / 1000000.0).astype(int)
    
    # Change each value in df_derived['MillisSinceGpsEpoch'] to be the prior epoch.
    raw_timestamps = raw['millisSinceGpsEpoch'].unique()
    derived_timestamps = derived['millisSinceGpsEpoch'].unique()

    # The timestamps in derived are one epoch ahead. We need to map each epoch
    # in derived to the prior one (in Raw).
    indexes = np.searchsorted(raw_timestamps, derived_timestamps)
    from_t_to_fix_derived = dict(zip(derived_timestamps, raw_timestamps[indexes-1]))
    derived['millisSinceGpsEpoch'] = np.array(list(map(lambda v: from_t_to_fix_derived[v], derived['millisSinceGpsEpoch'])))
    
    delta_millis = derived['millisSinceGpsEpoch'] - derived['receivedSvTimeInGpsNanos'] / 1e6
    where_good_signals = (delta_millis > 0) & (delta_millis < 300)
    derived = derived[where_good_signals].copy()

    # Compute signal_type in df_raw.
    # Map from constellation id to frequencies and signals.
    CONSTEL_FREQ_TABLE = {
        0: {'UNKNOWN': (0, 999999999999)},
        1: {
            'GPS_L1': (1563000000, 1587000000),
            'GPS_L2': (1215000000, 1240000000),
            'GPS_L5': (1164000000, 1189000000)
        },
        3: {
            'GLO_G1': (1593000000, 1610000000),
            'GLO_G2': (1237000000, 1254000000)
        },
        4: {
            'QZS_J1': (1563000000, 1587000000),
            'QZS_J2': (1215000000, 1240000000),
            'QZS_J5': (1164000000, 1189000000)
        },
        5: {
            'BDS_B1C': (1569000000, 1583000000),
            'BDS_B1I': (1553000000, 1568990000),
            'BDS_B2A': (1164000000, 1189000000),
            'BDS_B2B': (1189000000, 1225000000)
        },
        6: {
            'GAL_E1': (1559000000, 1591000000),
            'GAL_E5A': (1164000000, 1189000000),
            'GAL_E5B': (1189000000, 1218000000),
            'GAL_E6': (1258000000, 1300000000)
        },
        7: {
            'IRN_S': (2472000000, 2512000000),
            'IRN_L5': (1164000000, 1189000000)
        },
    }

    def SignalTypeFromConstellationAndFequency(constel, freq_hz):
        'Returns the signal type as a string for the given constellation and frequency.'
        freqs = CONSTEL_FREQ_TABLE.get(constel, {})
        for id_freq_range in freqs.items():
            rng = id_freq_range[1]
            if rng[0] <= freq_hz <= rng[1]:
                return id_freq_range[0]
        return 'UNKNOWN'

    signal_types = itertools.chain(*[c.keys() for c in CONSTEL_FREQ_TABLE.values()])
    sig_type_cat = pd.api.types.CategoricalDtype(categories=signal_types)
    raw['signalType'] = raw.apply(lambda r: SignalTypeFromConstellationAndFequency(r.ConstellationType, r.CarrierFrequencyHz), axis=1).astype(sig_type_cat)

    # Fix QZS Svids issue. 

    # The SVID of any QZS sat in derived may be changed. Since it may be a many to one relationship, we'll need to adjust the values in Raw.
    new_to_old = {1:(183, 193), 2:(184, 194, 196), 3:(187, 189, 197, 199), 4:(185, 195, 200)}
    # Maps original svid to new svid for only ConstellationType=4.
    old_to_new={}
    for new_svid, old_svids in new_to_old.items():
        for s in old_svids:
            old_to_new[s] = new_svid
    raw['svid'] = raw.apply(lambda r: old_to_new.get(r.Svid, r.Svid) if r.ConstellationType == 4 else r.Svid, axis=1)
    del raw['collectionName']
    del raw['phoneName']
    
    derived = derived.merge(raw, on=['millisSinceGpsEpoch', 'svid', 'signalType'], how='left')
    
    # status
    status['millisSinceGpsEpoch'] = status['UnixTimeMillis'] - 315964800000 + 18000
    status['svid'] = status.apply(lambda r: old_to_new.get(r.Svid, r.Svid) if r.ConstellationType == 4 else r.Svid, axis=1)
    status['signalType'] = status.apply(lambda r: SignalTypeFromConstellationAndFequency(r.ConstellationType, r.CarrierFrequencyHz), axis=1).astype(sig_type_cat)
    del status['collectionName']
    del status['phoneName']
    del status['Cn0DbHz']
    status = status.drop_duplicates(subset=['svid', 'signalType', 'millisSinceGpsEpoch'])
    
    sv_sig = derived[['svid', 'signalType']].drop_duplicates()
    output_df = pd.DataFrame()
    for svid, signal in zip(sv_sig['svid'], sv_sig['signalType']):
        derived_tmp = derived[(derived['svid']==svid)&(derived['signalType']==signal)].copy()
        status_tmp = status[(status['svid']==svid)&(status['signalType']==signal)].copy()
        del status_tmp['svid']
        del status_tmp['signalType']
        output_tmp = pd.merge_asof(derived_tmp.sort_values('millisSinceGpsEpoch'), status_tmp.sort_values('millisSinceGpsEpoch'), on='millisSinceGpsEpoch', direction='nearest')
        output_df = output_df.append(output_tmp)
    
    return output_df

In [18]:
def calc_baseline(args):
    s_th = 10
    
    phone, df = args
    collection = phone.split('_')[0]
    phonename = phone.split('_')[1]
    derived = get_derived_data('train', collection, phonename)
    derived = prepare_calc_baseline(derived)
    
    derived = derived[derived['ElevationDegrees']>=elev_deg].copy()
    derived = derived[derived['UsedInFix']>=used_in_fix].copy()
    derived = derived[derived['rawPrUncM']<=rawpruncm].copy()
    
    idx = list(df.index)
    s_list = []
    n_list = []
    lat_list = []
    lng_list = []
    unc_mean_list = []
    unc_max_list = []
    
    for j,i in enumerate(idx):
        s = df.at[i, 'millisSinceGpsEpoch']
        tmp = derived[(derived['millisSinceGpsEpoch']>=s-s_th)&(derived['millisSinceGpsEpoch']<=s+s_th)].copy()
        n = tmp['svid'].nunique()
        s_list.append(s)
        n_list.append(n)
        
        if n < 4:    
            lat_list.append(np.nan)
            lng_list.append(np.nan)
            unc_mean_list.append(np.nan)
            unc_max_list.append(np.nan)        
        
        else:
            res = calc_baseline_point(tmp)
            lat_list.append(res[0])
            lng_list.append(res[1])
            unc_mean_list.append(tmp['uncertaintyWeight'].mean())
            unc_max_list.append(tmp['uncertaintyWeight'].max())
    
    output_df = pd.DataFrame()
    output_df['millisSinceGpsEpoch'] = s_list
    output_df['latDeg'] = lat_list
    output_df['lngDeg'] = lng_list
    output_df['n'] = n_list
    output_df['unc_mean'] = unc_mean_list
    output_df['unc_max'] = unc_max_list
    output_df['collectionName'] = collection
    output_df['phoneName'] = phonename
    output_df['phone'] = phone
    
    return output_df

In [19]:
def calc_baseline_test(args):
    s_th = 10
    
    phone, df = args
    collection = phone.split('_')[0]
    phonename = phone.split('_')[1]
    derived = get_derived_data('test', collection, phonename)
    derived = prepare_calc_baseline(derived)
    
    derived = derived[derived['ElevationDegrees']>=elev_deg].copy()
    derived = derived[derived['UsedInFix']>=used_in_fix].copy()
    derived = derived[derived['rawPrUncM']<=rawpruncm].copy()
    
    idx = list(df.index)
    s_list = []
    n_list = []
    lat_list = []
    lng_list = []
    unc_mean_list = []
    unc_max_list = []
    
    for j,i in enumerate(idx):
        s = df.at[i, 'millisSinceGpsEpoch']
        tmp = derived[(derived['millisSinceGpsEpoch']>=s-s_th)&(derived['millisSinceGpsEpoch']<=s+s_th)].copy()
        n = tmp['svid'].nunique()
        s_list.append(s)
        n_list.append(n)
        
        if n < 4:    
            lat_list.append(np.nan)
            lng_list.append(np.nan)
            unc_mean_list.append(np.nan)
            unc_max_list.append(np.nan)        
        
        else:
            res = calc_baseline_point(tmp)
            lat_list.append(res[0])
            lng_list.append(res[1])
            unc_mean_list.append(tmp['uncertaintyWeight'].mean())
            unc_max_list.append(tmp['uncertaintyWeight'].max())
    
    output_df = pd.DataFrame()
    output_df['millisSinceGpsEpoch'] = s_list
    output_df['latDeg'] = lat_list
    output_df['lngDeg'] = lng_list
    output_df['n'] = n_list
    output_df['unc_mean'] = unc_mean_list
    output_df['unc_max'] = unc_max_list
    output_df['collectionName'] = collection
    output_df['phoneName'] = phonename
    output_df['phone'] = phone
    
    return output_df

In [20]:
def carrier_smoothing(df, rel_rate):
    abs_rate = 1 - rel_rate
    df = df.sort_values(['sat', 'millisSinceGpsEpoch'])
    df = df.reset_index(drop=True)
    
    df.loc[df['AccumulatedDeltaRangeState']!=25, 'AccumulatedDeltaRangeMeters'] = np.nan
    df['ADR_d'] = df['AccumulatedDeltaRangeMeters'] - df.groupby('sat')['AccumulatedDeltaRangeMeters'].shift(1)
    df['ADR_d_prev'] = df['ADR_d'].shift(-1)
    
    df_index = list(df.index)
    for idx in df_index:
        if idx == df.index[-1]-1:
            break
        if df.at[idx, 'sat'] != df.at[idx+1, 'sat']:
            continue
            
        p_abs = df.at[idx, 'correctedPrM']
        p_adrd = df.at[idx, 'ADR_d']
        
        if not np.isnan(p_adrd):
            p_abs_prev = df.at[idx-1, 'correctedPrM']
            p_rel = p_abs_prev + p_adrd
            p_new = p_abs * abs_rate + p_rel * rel_rate
            df.at[idx, 'correctedPrM'] = p_new            

    df_index.reverse()
    for idx in df_index:
        if idx == df.index[0]:
            break
        if df.at[idx, 'sat'] != df.at[idx-1, 'sat']:
            continue

        p_abs = df.at[idx, 'correctedPrM']
        p_adrd = df.at[idx, 'ADR_d_prev']

        if not np.isnan(p_adrd):
            p_abs_prev = df.at[idx+1, 'correctedPrM']
            p_rel = p_abs_prev - p_adrd
            p_new = p_abs * abs_rate + p_rel * rel_rate
            df.at[idx, 'correctedPrM'] = p_new                
            
    return df        

In [21]:
#df = get_derived_data('train', '2021-04-28-US-MTV-1', 'Pixel5')

In [22]:
train, test, sub, gt = get_data()

In [23]:
train = train[train['collectionName'].isin(target)].copy()
train = train[train['phoneName']!='Mi8'].copy()

In [24]:
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(calc_baseline, train.groupby('phone'))
    dfs = tqdm(dfs)
    dfs = list(dfs)
result = pd.concat(dfs)

0it [00:00, ?it/s]

In [25]:
train_tmp = train[['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']].copy()
train_tmp.columns = ['phone', 'millisSinceGpsEpoch', 'latDeg_bl', 'lngDeg_bl']
result = result.merge(train_tmp, on=['phone', 'millisSinceGpsEpoch'], how='left')

In [26]:
gt['phone'] = gt['collectionName'] + '_' + gt['phoneName']
gt_tmp = gt[['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']].copy()
gt_tmp.columns = ['phone', 'millisSinceGpsEpoch', 'latDeg_gt', 'lngDeg_gt']
result = result.merge(gt_tmp, on=['phone', 'millisSinceGpsEpoch'], how='left')

In [27]:
result['rb_bl_err'] = calc_haversine(result['latDeg_bl'], result['lngDeg_bl'], result['latDeg'], result['lngDeg'])
result['rb_gt_err'] = calc_haversine(result['latDeg_gt'], result['lngDeg_gt'], result['latDeg'], result['lngDeg'])
result['bl_gt_err'] = calc_haversine(result['latDeg_gt'], result['lngDeg_gt'], result['latDeg_bl'], result['lngDeg_bl'])

In [28]:
for phone in result['phone'].unique():
    tmp = result[result['phone']==phone].copy()
    null_rate = tmp['latDeg'].isnull().sum() / len(tmp)
    print(phone, null_rate)

2020-05-14-US-MTV-2_Pixel4XLModded 0.01386481802426343
2020-06-05-US-MTV-1_Pixel4XLModded 0.0017809439002671415
2020-07-08-US-MTV-1_Pixel4XLModded 0.0016051364365971107
2020-09-04-US-SF-2_Pixel4XL 0.021445591739475776
2020-08-06-US-MTV-2_Pixel4XL 0.018306636155606407
2020-06-11-US-MTV-1_Pixel4XL 0.03269447576099211
2020-06-04-US-MTV-1_Pixel4XL 0.0029291154071470417
2020-08-06-US-MTV-2_Pixel4 0.0463428252372976
2020-06-04-US-MTV-1_Pixel4 0.002918855808523059
2020-09-04-US-SF-1_Pixel4XL 0.0011527377521613833
2020-05-21-US-MTV-2_Pixel4XL 0.028985507246376812
2020-06-04-US-MTV-1_Pixel4XLModded 0.0288135593220339
2020-09-04-US-SF-1_Pixel4 0.0022896393817973667
2020-06-05-US-MTV-2_Pixel4XL 0.003418803418803419
2020-05-14-US-MTV-2_Pixel4 0.0011299435028248588
2020-05-14-US-MTV-1_Pixel4XLModded 0.001145475372279496
2020-07-08-US-MTV-1_Pixel4XL 0.025133689839572194
2020-06-05-US-MTV-2_Pixel4 0.0022148394241417496
2020-05-14-US-MTV-1_Pixel4 0.0011494252873563218
2020-06-11-US-MTV-1_Pixel4 0.0201

In [29]:
result_grouped = result.dropna().groupby('phone')[['rb_bl_err', 'rb_gt_err', 'bl_gt_err']].agg([percentile50, percentile95]).reset_index()
result_grouped.columns = ['phone', 'rb_bl_err_p50', 'rb_bl_err_p95', 'rb_gt_err_p50', 'rb_gt_err_p95', 'bl_gt_err_p50', 'bl_gt_err_p95']
result_grouped['rb_score'] = (result_grouped['rb_gt_err_p95'] + result_grouped['rb_gt_err_p50']) / 2
result_grouped['bl_score'] = (result_grouped['bl_gt_err_p95'] + result_grouped['bl_gt_err_p50']) / 2
result_grouped

Unnamed: 0,phone,rb_bl_err_p50,rb_bl_err_p95,rb_gt_err_p50,rb_gt_err_p95,bl_gt_err_p50,bl_gt_err_p95,rb_score,bl_score
0,2020-05-14-US-MTV-1_Pixel4,0.710621,2.075249,1.116076,2.40071,1.313621,2.88798,1.758393,2.100801
1,2020-05-14-US-MTV-1_Pixel4XLModded,0.864512,3.021406,1.689839,3.678645,1.985987,4.3426,2.684242,3.164294
2,2020-05-14-US-MTV-2_Pixel4,0.668062,1.806736,1.522549,2.721156,1.367036,2.789667,2.121853,2.078351
3,2020-05-14-US-MTV-2_Pixel4XLModded,1.537603,7.000772,3.355658,9.510154,3.320089,10.456446,6.432906,6.888267
4,2020-05-21-US-MTV-1_Pixel4,0.849552,2.735504,1.954534,4.76468,1.822562,4.653667,3.359607,3.238114
5,2020-05-21-US-MTV-2_Pixel4,0.619489,2.032436,0.897327,2.804995,1.006633,3.232788,1.851161,2.11971
6,2020-05-21-US-MTV-2_Pixel4XL,0.768932,2.753715,1.085725,3.421885,1.164092,3.391852,2.253805,2.277972
7,2020-05-29-US-MTV-1_Pixel4,0.877172,2.70344,2.485724,4.290432,2.628013,4.712513,3.388078,3.670263
8,2020-05-29-US-MTV-1_Pixel4XL,0.559837,1.698374,2.235506,3.382404,2.326845,3.832572,2.808955,3.079709
9,2020-05-29-US-MTV-1_Pixel4XLModded,0.732753,2.482697,2.02932,3.839202,2.278249,4.122715,2.934261,3.200482


In [30]:
result.to_csv(OUTPUT + '/result.csv', index=False)
result_grouped.to_csv(OUTPUT + '/result_grouped.csv', index=False)

In [31]:
result_grouped[['rb_score', 'bl_score']].mean()

rb_score    3.003532
bl_score    3.144246
dtype: float64

# test

In [32]:
test = test[test['collectionName'].isin(target)].copy()
test = test[test['phoneName']!='Mi8'].copy()

In [33]:
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(calc_baseline_test, test.groupby('phone'))
    dfs = tqdm(dfs)
    dfs = list(dfs)
result = pd.concat(dfs)

0it [00:00, ?it/s]

In [34]:
result.to_csv(OUTPUT + '/result_test.csv', index=False)

In [35]:
for phone in result['phone'].unique():
    tmp = result[result['phone']==phone].copy()
    null_rate = tmp['latDeg'].isnull().sum() / len(tmp)
    print(phone, null_rate)

2020-05-28-US-MTV-2_Pixel4XLModded 0.0006858710562414266
2020-08-03-US-MTV-2_Pixel4XL 0.10672169811320754
2020-08-03-US-MTV-2_Pixel4 0.14269340974212033
2020-06-04-US-MTV-2_Pixel4XL 0.0023515579071134627
2020-06-10-US-MTV-1_Pixel4XLModded 0.030124040165386886
2020-06-04-US-MTV-2_Pixel4XLModded 0.006462984723854289
2020-06-10-US-MTV-1_Pixel4XL 0.0024067388688327317
2020-06-10-US-MTV-1_Pixel4 0.0018039687312086591
2020-06-04-US-MTV-2_Pixel4 0.0023584905660377358
2020-06-10-US-MTV-2_Pixel4 0.006688963210702341
2020-06-10-US-MTV-2_Pixel4XL 0.003878116343490305
2020-06-10-US-MTV-2_Pixel4XLModded 0.027361899845121322
2020-08-13-US-MTV-1_Pixel4 0.18307349665924277
2020-05-28-US-MTV-2_Pixel4 0.008628127696289905
2021-03-16-US-MTV-2_Pixel4Modded 0.0004899559039686428
2020-05-28-US-MTV-1_Pixel4 0.0026595744680851063
2020-05-28-US-MTV-2_Pixel4XL 0.029449978345604158
2021-03-16-US-MTV-2_SamsungS20Ultra 0.0004299226139294927
2020-05-28-US-MTV-1_Pixel4XL 0.024319868095630668
2020-05-15-US-MTV-1_Pixe