# 导入工具包

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

# 读取数据

In [2]:
dir_path = '../input/google-smartphone-decimeter-challenge'

In [3]:
file_train = os.path.join(dir_path, "baseline_locations_train.csv")
file_test = os.path.join(dir_path, "baseline_locations_test.csv")
file_sub = os.path.join(dir_path, "sample_submission.csv")

In [4]:
file_train

'../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv'

In [5]:
data_train = pd.read_csv(file_train)
data_test = pd.read_csv(file_test)
data_sub = pd.read_csv(file_sub)

In [6]:
data_train.head(3)

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4


In [7]:
path_data = Path(dir_path)
path_truth_data = (path_data/'train').rglob('ground_truth.csv')

In [8]:
path_data 

PosixPath('../input/google-smartphone-decimeter-challenge')

In [9]:
# list(path_truth_data)

In [10]:
data_truth_list =[]
for file_name in tqdm(path_truth_data, total=73):
    data_file = pd.read_csv(file_name)
    data_truth_list.append(data_file)

100%|██████████| 73/73 [00:01<00:00, 55.63it/s]


In [11]:
# data_truth_list[:1]

In [12]:
data_train_truth = pd.concat(data_truth_list, ignore_index=True)

In [13]:
data_train_truth.head(3)

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,timeSinceFirstFixSeconds,hDop,vDop,speedMps,courseDegree
0,2020-07-17-US-MTV-1,Mi8,1279059935000,37.428281,-122.072541,34.71,669.0,0.8,0.0,0.0,242.8
1,2020-07-17-US-MTV-1,Mi8,1279059936000,37.428281,-122.072541,34.71,670.0,0.8,0.0,0.0,242.8
2,2020-07-17-US-MTV-1,Mi8,1279059937000,37.428281,-122.072541,34.71,671.0,0.8,0.0,0.0,242.8


In [14]:
len(data_train), len(data_test), len(data_train_truth)

(131342, 91486, 131342)

In [15]:
data_train_truth.columns

Index(['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg',
       'lngDeg', 'heightAboveWgs84EllipsoidM', 'timeSinceFirstFixSeconds',
       'hDop', 'vDop', 'speedMps', 'courseDegree'],
      dtype='object')

In [16]:
set(data_train_truth.columns) & set(data_train.columns)

{'collectionName',
 'heightAboveWgs84EllipsoidM',
 'latDeg',
 'lngDeg',
 'millisSinceGpsEpoch',
 'phoneName'}

# 合并数据

In [17]:
train_columns = ['collectionName', 'phoneName', 'millisSinceGpsEpoch','latDeg','lngDeg']
merge_columns = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
pd_train = data_train_truth[train_columns].merge(data_train,
                       on=merge_columns,
                      suffixes=("_truth",""))

In [18]:
pd_test = data_test

In [19]:
# del pd_train['phone']
# del pd_test['phone']

In [20]:
len(pd_train), len(pd_test)

(131342, 91486)

In [21]:
pd_train.head(3)

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg_truth,lngDeg_truth,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-07-17-US-MTV-1,Mi8,1279059935000,37.428281,-122.072541,37.428288,-122.072583,-16.14,2020-07-17-US-MTV-1_Mi8
1,2020-07-17-US-MTV-1,Mi8,1279059936000,37.428281,-122.072541,37.428271,-122.072521,-24.96,2020-07-17-US-MTV-1_Mi8
2,2020-07-17-US-MTV-1,Mi8,1279059937000,37.428281,-122.072541,37.428234,-122.072572,-24.38,2020-07-17-US-MTV-1_Mi8


In [22]:
# 手机，t1, 1
# 手机，t2, 2
# 手机，t3, 3



# 手机，t1, nan
# 手机，t2, 1
# 手机，t3, 2

# 按时间排序

In [23]:
pd_train = pd_train.sort_values(['collectionName', 'phoneName', 'millisSinceGpsEpoch'])

In [24]:
pd_test = pd_test.sort_values(['collectionName', 'phoneName', 'millisSinceGpsEpoch'])

In [25]:
len(pd_train.drop_duplicates(['collectionName','phoneName','latDeg_truth','lngDeg_truth']))

118624

In [26]:
len(pd_train.drop_duplicates(['collectionName','latDeg_truth','lngDeg_truth']))

118624

In [27]:
len(pd_train)

131342

In [28]:
len(set(pd_train['millisSinceGpsEpoch']))

128295

In [29]:
len(set(pd_train['phoneName']))

7

In [30]:
len(set(pd_train['collectionName']))

29

In [31]:
len(pd_test)

91486

# 特征工程

In [32]:
pd_test.columns

Index(['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg',
       'lngDeg', 'heightAboveWgs84EllipsoidM', 'phone'],
      dtype='object')

In [33]:
pd_train.head(3)

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg_truth,lngDeg_truth,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
9124,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423576,-122.094132,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4
9125,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423576,-122.094132,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4
9126,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423576,-122.094132,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4


In [34]:
pd_test.head(3)

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416628,-122.082053,-30.69,2020-05-15-US-MTV-1_Pixel4
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416646,-122.08204,-31.76,2020-05-15-US-MTV-1_Pixel4
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416652,-122.082039,-31.65,2020-05-15-US-MTV-1_Pixel4


## 类别特征

In [35]:
set(pd_test['collectionName'])

{'2020-05-15-US-MTV-1',
 '2020-05-28-US-MTV-1',
 '2020-05-28-US-MTV-2',
 '2020-06-04-US-MTV-2',
 '2020-06-10-US-MTV-1',
 '2020-06-10-US-MTV-2',
 '2020-08-03-US-MTV-2',
 '2020-08-13-US-MTV-1',
 '2021-03-16-US-MTV-2',
 '2021-03-16-US-RWC-2',
 '2021-03-25-US-PAO-1',
 '2021-04-02-US-SJC-1',
 '2021-04-08-US-MTV-1',
 '2021-04-21-US-MTV-1',
 '2021-04-22-US-SJC-2',
 '2021-04-26-US-SVL-2',
 '2021-04-28-US-MTV-2',
 '2021-04-29-US-MTV-2',
 '2021-04-29-US-SJC-3'}

In [36]:
dict_ = dict(zip(set(pd_test['collectionName']), range(len(set(pd_test['collectionName'])))))
pd_train['collectionName_cat01'] = pd_train['collectionName'].map(dict_)
pd_test['collectionName_cat01'] = pd_test['collectionName'].map(dict_)

In [37]:
dict_

{'2021-04-21-US-MTV-1': 0,
 '2020-08-13-US-MTV-1': 1,
 '2020-05-28-US-MTV-1': 2,
 '2020-06-10-US-MTV-1': 3,
 '2021-04-26-US-SVL-2': 4,
 '2021-03-25-US-PAO-1': 5,
 '2020-06-10-US-MTV-2': 6,
 '2021-03-16-US-RWC-2': 7,
 '2021-04-28-US-MTV-2': 8,
 '2020-08-03-US-MTV-2': 9,
 '2021-04-02-US-SJC-1': 10,
 '2020-05-15-US-MTV-1': 11,
 '2021-04-08-US-MTV-1': 12,
 '2021-03-16-US-MTV-2': 13,
 '2021-04-22-US-SJC-2': 14,
 '2020-05-28-US-MTV-2': 15,
 '2021-04-29-US-MTV-2': 16,
 '2020-06-04-US-MTV-2': 17,
 '2021-04-29-US-SJC-3': 18}

In [38]:
len(pd_train), len(pd_test)

(131342, 91486)

In [39]:
set(pd_test['phoneName'])

{'Mi8',
 'Pixel4',
 'Pixel4Modded',
 'Pixel4XL',
 'Pixel4XLModded',
 'Pixel5',
 'SamsungS20Ultra'}

In [40]:
# len(list('SamsungS20Ultra'))

In [41]:
dict_ = dict(zip(set(pd_test['phoneName']), range(len(set(pd_test['phoneName'])))))
pd_train['phoneName_cat01'] = pd_train['phoneName'].map(dict_)
pd_test['phoneName_cat01'] = pd_test['phoneName'].map(dict_)

In [42]:
dict_

{'Pixel4Modded': 0,
 'Mi8': 1,
 'SamsungS20Ultra': 2,
 'Pixel5': 3,
 'Pixel4': 4,
 'Pixel4XLModded': 5,
 'Pixel4XL': 6}

In [43]:
pd_test

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,collectionName_cat01,phoneName_cat01
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416628,-122.082053,-30.69,2020-05-15-US-MTV-1_Pixel4,11,4
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416646,-122.082040,-31.76,2020-05-15-US-MTV-1_Pixel4,11,4
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416652,-122.082039,-31.65,2020-05-15-US-MTV-1_Pixel4,11,4
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416607,-122.082063,-31.52,2020-05-15-US-MTV-1_Pixel4,11,4
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416609,-122.082073,-28.95,2020-05-15-US-MTV-1_Pixel4,11,4
...,...,...,...,...,...,...,...,...,...
91481,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763185000,37.334539,-121.899383,-8.39,2021-04-29-US-SJC-3_SamsungS20Ultra,18,2
91482,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763186000,37.334545,-121.899380,-7.36,2021-04-29-US-SJC-3_SamsungS20Ultra,18,2
91483,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763187000,37.334551,-121.899371,-4.08,2021-04-29-US-SJC-3_SamsungS20Ultra,18,2
91484,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763188000,37.334540,-121.899371,-5.70,2021-04-29-US-SJC-3_SamsungS20Ultra,18,2


In [44]:
len(pd_train), len(pd_test)

(131342, 91486)

## shift做特征

In [45]:
shift_list = [-6,-5,-4,-3,-2,-1,1,2,3,4,5,6]
shift_columns = ['millisSinceGpsEpoch','latDeg','lngDeg','heightAboveWgs84EllipsoidM']
for shift_i in shift_list:
    
    tmp_test = pd_test.groupby(['collectionName','phoneName']).shift(shift_i)
    for col in shift_columns:
        pd_test[col+'_shift_'+str(shift_i)] =  tmp_test[col]
        
    tmp_train = pd_train.groupby(['collectionName','phoneName']).shift(shift_i)
    for col in shift_columns:
        pd_train[col+'_shift_'+str(shift_i)] =  tmp_train[col]

In [46]:
pd_train

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg_truth,lngDeg_truth,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,collectionName_cat01,...,lngDeg_shift_4,heightAboveWgs84EllipsoidM_shift_4,millisSinceGpsEpoch_shift_5,latDeg_shift_5,lngDeg_shift_5,heightAboveWgs84EllipsoidM_shift_5,millisSinceGpsEpoch_shift_6,latDeg_shift_6,lngDeg_shift_6,heightAboveWgs84EllipsoidM_shift_6
9124,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423576,-122.094132,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4,,...,,,,,,,,,,
9125,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423576,-122.094132,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4,,...,,,,,,,,,,
9126,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423576,-122.094132,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4,,...,,,,,,,,,,
9127,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423576,-122.094132,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4,,...,,,,,,,,,,
9128,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423576,-122.094132,37.423579,-122.094114,-34.49,2020-05-14-US-MTV-1_Pixel4,,...,-122.094091,-34.06,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54337,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760315000,37.334475,-121.899613,37.334460,-121.899600,-8.09,2021-04-29-US-SJC-2_SamsungS20Ultra,,...,-121.899615,-6.06,1.303760e+12,37.334488,-121.899626,-5.10,1.303760e+12,37.334487,-121.899641,-10.10
54338,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760316000,37.334475,-121.899613,37.334472,-121.899583,-7.59,2021-04-29-US-SJC-2_SamsungS20Ultra,,...,-121.899603,-7.62,1.303760e+12,37.334466,-121.899615,-6.06,1.303760e+12,37.334488,-121.899626,-5.10
54339,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760317000,37.334475,-121.899613,37.334491,-121.899597,-8.35,2021-04-29-US-SJC-2_SamsungS20Ultra,,...,-121.899587,-8.44,1.303760e+12,37.334468,-121.899603,-7.62,1.303760e+12,37.334466,-121.899615,-6.06
54340,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760318000,37.334475,-121.899613,37.334495,-121.899583,-8.73,2021-04-29-US-SJC-2_SamsungS20Ultra,,...,-121.899610,-5.91,1.303760e+12,37.334468,-121.899587,-8.44,1.303760e+12,37.334468,-121.899603,-7.62


In [47]:
len(pd_train), len(pd_test)

(131342, 91486)

In [48]:
len(pd_train.columns), len(pd_test.columns)

(59, 57)

## 基本groupby特征

In [49]:
groupby_columns = ['millisSinceGpsEpoch']
values_columns = ['latDeg','lngDeg','heightAboveWgs84EllipsoidM']
function_names = ['sum','mean','size','min','max']
data_list = [pd_train, pd_test]
data_out = []
for data in data_list:
    tmp = data.\
    groupby(groupby_columns)[values_columns].\
    agg(function_names).\
    reset_index()

    tmp_columns = groupby_columns.copy()
    for i in values_columns:
        for j in function_names:
            tmp_columns.append('_'.join(groupby_columns) + '_' + i + '_' + j )

    tmp.columns = tmp_columns
    
    data = data.merge(tmp,on=groupby_columns)
    data_out.append(data)
pd_train, pd_test = data_out

In [50]:
len(pd_train), len(pd_test)

(131342, 91486)

In [51]:
len(pd_train.columns), len(pd_test.columns)

(74, 72)

In [52]:
groupby_columns = ['collectionName']
values_columns = ['latDeg','lngDeg','heightAboveWgs84EllipsoidM']
function_names = ['sum','mean','size','min','max']
data_list = [pd_train, pd_test]
data_out = []
for data in data_list:
    tmp = data.\
    groupby(groupby_columns)[values_columns].\
    agg(function_names).\
    reset_index()

    tmp_columns = groupby_columns.copy()
    for i in values_columns:
        for j in function_names:
            tmp_columns.append('_'.join(groupby_columns) + '_' + i + '_' + j )

    tmp.columns = tmp_columns
    
    data = data.merge(tmp,on=groupby_columns)
    data_out.append(data)
pd_train, pd_test = data_out

In [53]:
len(pd_train), len(pd_test)

(131342, 91486)

In [54]:
len(pd_train.columns), len(pd_test.columns)

(89, 87)

In [55]:
groupby_columns = ['collectionName','phoneName']
values_columns = ['latDeg','lngDeg','heightAboveWgs84EllipsoidM']
function_names = ['sum','mean','size','min','max']
data_list = [pd_train, pd_test]
data_out = []
for data in data_list:
    tmp = data.\
    groupby(groupby_columns)[values_columns].\
    agg(function_names).\
    reset_index()

    tmp_columns = groupby_columns.copy()
    for i in values_columns:
        for j in function_names:
            tmp_columns.append('_'.join(groupby_columns) + '_' + i + '_' + j )

    tmp.columns = tmp_columns
    
    data = data.merge(tmp,on=groupby_columns)
    data_out.append(data)
pd_train, pd_test = data_out

In [56]:
len(pd_train), len(pd_test)

(131342, 91486)

In [57]:
len(pd_train.columns), len(pd_test.columns)

(104, 102)

In [58]:
groupby_columns = ['phoneName']
values_columns = ['latDeg','lngDeg','heightAboveWgs84EllipsoidM']
function_names = ['sum','mean','size','min','max']
data_list = [pd_train, pd_test]
data_out = []
for data in data_list:
    tmp = data.\
    groupby(groupby_columns)[values_columns].\
    agg(function_names).\
    reset_index()

    tmp_columns = groupby_columns.copy()
    for i in values_columns:
        for j in function_names:
            tmp_columns.append('_'.join(groupby_columns) + '_' + i + '_' + j )

    tmp.columns = tmp_columns
    
    data = data.merge(tmp,on=groupby_columns)
    data_out.append(data)
pd_train, pd_test = data_out

In [59]:
len(pd_train), len(pd_test)

(131342, 91486)

In [60]:
len(pd_train.columns), len(pd_test.columns)

(119, 117)

## 滑动平均特征

In [61]:
# df.rolling(3, center=True).mean()
# df.rolling(3, center=True).min()
# df.rolling(3, center=True).max()
# df.rolling(3, center=True).sum()
# DataFrame.rolling(window, min_periods=None, center=False, win_type=None, on=None, axis=0, closed=None)

In [62]:
len(pd_train), len(pd_test)

(131342, 91486)

In [63]:
# pd_train

In [64]:
groupby_columns = ['collectionName','phoneName']
rolling_columns = ['latDeg','lngDeg','heightAboveWgs84EllipsoidM']
window_list = [2,3,5,7,10]
for i in window_list:
    for data in [pd_train, pd_test]:
        temp = data.groupby(groupby_columns)[rolling_columns].\
        rolling(i, center=True).mean().reset_index()
        for col in rolling_columns:
            data[col+'_rolling_'+str(i)] = temp[col]

In [65]:
len(pd_train), len(pd_test)

(131342, 91486)

In [66]:
len(pd_train.columns), len(pd_test.columns)

(134, 132)

# 数据切分

In [67]:
len(list(pd_train.columns))

134

In [68]:
len(set(pd_train.columns) & set(pd_test.columns))

132

In [69]:
remove_columns = ['collectionName','phoneName','latDeg_truth','lngDeg_truth','phone']
features_columns = [col for col in pd_test.columns if col not in remove_columns]

In [70]:
X_train = pd_train[features_columns]
X_test = pd_test[features_columns]
y_latDeg = pd_train['latDeg_truth']
y_lngDeg = pd_train['lngDeg_truth']

In [71]:
xtr1,xval1,ytr1,yval1 = train_test_split(X_train, y_latDeg, test_size=0.3, random_state=10)
xtr2,xval2,ytr2,yval2 = train_test_split(X_train, y_lngDeg, test_size=0.3, random_state=10)

# 模型训练

In [72]:
params = {
    'objective': 'mae',
    'max_bin': 600,
    'learning_rate': 0.02,
    'num_leaves': 80
}

lgb_train = lgb.Dataset(xtr1, ytr1)
lgb_eval = lgb.Dataset(xval1, yval1, reference=lgb_train)

model_latDeg = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=25,
    num_boost_round=10000,
    early_stopping_rounds=10
)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 49093
[LightGBM] [Info] Number of data points in the train set: 91939, number of used features: 128
[LightGBM] [Info] Start training from score 37.424355
Training until validation scores don't improve for 10 rounds
[25]	training's l1: 0.0422959	valid_1's l1: 0.0416455
[50]	training's l1: 0.0265275	valid_1's l1: 0.0259662
[75]	training's l1: 0.0154065	valid_1's l1: 0.0149869
[100]	training's l1: 0.00888586	valid_1's l1: 0.00860898
[125]	training's l1: 0.00513338	valid_1's l1: 0.00497586
[150]	training's l1: 0.00312256	valid_1's l1: 0.00302218
[175]	training's l1: 0.00201187	valid_1's l1: 0.00195115
[200]	training's l1: 0.00137416	valid_1's l1: 0.00133127
[225]	training's l1: 0.00100375	valid_1's l1: 0.000971441
[250]	training's l1: 0.000768969	valid_1's l1: 0.000743638
[275]	training's l1: 0.00060278	valid_1's l1: 0.000582212
[300]	training's l1: 0.000498523	valid_1's l1: 0.000480875
[325]	training's 

In [73]:
params = {
    'objective': 'mae',
    'max_bin': 600,
    'learning_rate': 0.02,
    'num_leaves': 80
}

lgb_train = lgb.Dataset(xtr2, ytr2)
lgb_eval = lgb.Dataset(xval2, yval2, reference=lgb_train)

model_lngDeg = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=25,
    num_boost_round=10000,
    early_stopping_rounds=10
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 49093
[LightGBM] [Info] Number of data points in the train set: 91939, number of used features: 128
[LightGBM] [Info] Start training from score -122.119110
Training until validation scores don't improve for 10 rounds
[25]	training's l1: 0.0815119	valid_1's l1: 0.080672
[50]	training's l1: 0.0592431	valid_1's l1: 0.058579
[75]	training's l1: 0.0456765	valid_1's l1: 0.0451537
[100]	training's l1: 0.0285658	valid_1's l1: 0.0281426
[125]	training's l1: 0.0171685	valid_1's l1: 0.0168716
[150]	training's l1: 0.0101317	valid_1's l1: 0.0099371
[175]	training's l1: 0.00613213	valid_1's l1: 0.0060122
[200]	training's l1: 0.00366092	valid_1's l1: 0.00360639
[225]	training's l1: 0.00215314	valid_1's l1: 0.00213878
[250]	training's l1: 0.00134747	valid_1's l1: 0.0013459
[275]	training's l1: 0.000887057	valid_1's l1: 0.000886974
[300]	training's l1: 0.000612808	valid_1's l1: 0.0006126
[325]	training's l1: 0.000452

# 模型验证

In [74]:
y_pred = model_latDeg.predict(xval1)
y_true = np.array(yval1)
mean_absolute_error(y_true, y_pred)

9.011009998305294e-05

In [75]:
y_pred = model_lngDeg.predict(xval1)
y_true = np.array(yval2)
mean_absolute_error(y_true, y_pred)

0.0001654727777542729

In [76]:
# ytr2,yval2

# 模型预测

In [77]:
y_latDeg = model_latDeg.predict(X_test)

In [78]:
y_lngDeg = model_lngDeg.predict(X_test)

# 提交结果

In [79]:
submission = pd_test[['phone','millisSinceGpsEpoch']]
submission['latDeg'] = list(y_latDeg)
submission['lngDeg'] = list(y_lngDeg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [80]:
df_sub = data_sub[['phone', 'millisSinceGpsEpoch']].merge(submission,on=['phone', 'millisSinceGpsEpoch'],how='inner')

In [81]:
df_sub.to_csv("./submission.csv",index=False)

In [82]:
!head ./submission.csv

phone,millisSinceGpsEpoch,latDeg,lngDeg
2020-05-15-US-MTV-1_Pixel4,1273608785432,37.41653180109113,-122.08180662657895
2020-05-15-US-MTV-1_Pixel4,1273608786432,37.41641363496,-122.08189289766551
2020-05-15-US-MTV-1_Pixel4,1273608787432,37.41656116216651,-122.08202282966545
2020-05-15-US-MTV-1_Pixel4,1273608788432,37.41657264197658,-122.08202296028668
2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416580059782554,-122.0820016316261
2020-05-15-US-MTV-1_Pixel4,1273608790432,37.416564160973884,-122.08200707242443
2020-05-15-US-MTV-1_Pixel4,1273608791432,37.41657714101826,-122.08201973326467
2020-05-15-US-MTV-1_Pixel4,1273608792432,37.41657986248146,-122.08201975181262
2020-05-15-US-MTV-1_Pixel4,1273608793432,37.41658057149589,-122.08202448103182
