In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from geopy.distance import geodesic
import tensorflow as tf

from sklearn import linear_model, model_selection, metrics

## Load train and test data

In [2]:
# train set
df_mess_train = pd.read_csv('mess_train_list.csv')

# test set
df_mess_test = pd.read_csv('mess_test_list.csv')



In [3]:
df_mess_train["datetime"]=pd.to_datetime(df_mess_train.time_ux, unit='ms')
df_mess_test["datetime"]=pd.to_datetime(df_mess_test.time_ux, unit='ms')
df_mess_train.head()


Unnamed: 0,messid,bsid,did,nseq,rssi,time_ux,bs_lat,bs_lng,datetime
0,573bf1d9864fce1a9af8c5c9,2841,473335.0,0.5,-121.5,1463546000000.0,39.617794,-104.954917,2016-05-18 04:38:49
1,573bf1d9864fce1a9af8c5c9,3526,473335.0,2.0,-125.0,1463546000000.0,39.677251,-104.952721,2016-05-18 04:38:49
2,573bf3533e952e19126b256a,2605,473335.0,1.0,-134.0,1463547000000.0,39.612745,-105.008827,2016-05-18 04:45:07
3,573c0cd0f0fe6e735a699b93,2610,473953.0,2.0,-132.0,1463553000000.0,39.797969,-105.07346,2016-05-18 06:33:52
4,573c0cd0f0fe6e735a699b93,3574,473953.0,1.0,-120.0,1463553000000.0,39.723151,-104.956216,2016-05-18 06:33:52


In [4]:
df_mess_train.dtypes

messid              object
bsid                 int64
did                float64
nseq               float64
rssi               float64
time_ux            float64
bs_lat             float64
bs_lng             float64
datetime    datetime64[ns]
dtype: object

In [5]:
print(df_mess_train.shape)
df_mess_train.describe()

(39250, 9)


Unnamed: 0,bsid,did,nseq,rssi,time_ux,bs_lat,bs_lng
count,39250.0,39250.0,39250.0,39250.0,39250.0,39250.0,39250.0
mean,4533.774752,483593.2,1.043541,-125.86358,1473368000000.0,42.554998,-100.969685
std,2570.495638,98931.4,0.56354,9.2265,5506910000.0,7.722787,11.522732
min,879.0,473288.0,0.0,-155.0,1463546000000.0,36.053394,-107.463146
25%,2790.0,476123.0,1.0,-132.5,1468410000000.0,39.747448,-105.074287
50%,3559.0,476315.0,1.0,-127.333333,1473444000000.0,39.793585,-105.043685
75%,7456.0,476609.0,1.5,-121.0,1479118000000.0,39.83874,-104.998429
max,11951.0,1747448.0,2.0,-64.0,1481143000000.0,64.3,-68.5


In [6]:
# position associated to train set
pos_train = pd.read_csv('pos_train_list.csv')
pos_train = pos_train.join(df_mess_train[["messid"]])


# filtering df_mess_train, remove outliers
mask = df_mess_train["bs_lng"] < -80
df_mess_train = df_mess_train[mask]
pos_train = pos_train[mask]

In [7]:
pos_train.messid.nunique()

6068

# Extra datas

In [8]:
humidity = pd.read_csv('humidity.csv')
humidity.datetime = pd.to_datetime(humidity.datetime)
humidity_denver = humidity[['datetime',"Denver"]]
humidity_denver.set_index("datetime", inplace=True)

In [9]:
temperature = pd.read_csv('temperature.csv')
temperature.datetime = pd.to_datetime(temperature.datetime)
temp_denver = temperature[['datetime',"Denver"]]
temp_denver.set_index("datetime", inplace=True)

In [10]:
datetime = df_mess_train.datetime
datetime.drop_duplicates(inplace=True)

In [11]:
city_attributes = pd.read_csv("city_attributes.csv")



In [12]:
def top5(g):
    
    #if len(g["bsid"].unique()) != len(g) :
    #    g=g.groupby("bsid").apply(lambda g: g.sort_values('rssi')[-1:][["did","rssi","nseq","bs_lat","bs_lng"]].reset_index()).reset_index().drop(columns=["level_1","index"])
    
    
    temp= g.dropna().sort_values('rssi')[-5:][["did","rssi","nseq","bs_lat","bs_lng"]]
    zeros = pd.DataFrame([[temp.iloc[0]["did"],0,0,0,0]], columns=["did","rssi","nseq","bs_lat","bs_lng"])
    for i in range(len(temp)-1, 5):    
        temp = pd.concat([temp, zeros])        
    # transform to columns
    temp.reset_index(inplace=True)
    temp["index"] = temp.index
    #print(temp.head())
    
    temp = temp.pivot_table(index=temp.did, 
                                 values=["rssi","nseq","bs_lat","bs_lng"], columns=["index"] ,                                   
                                    fill_value=0)
    temp = temp.reorder_levels([1, 0], axis=1).sort_index(level=0, axis=1).reset_index().drop(columns=["did"])
    return temp
        
    

df_train_group = df_mess_train.groupby(["messid"]).apply(top5)
df_test_group = df_mess_test.groupby(["messid"]).apply(top5)

  new_axis = axis.drop(labels, errors=errors)


In [13]:
df_train_group.head()

Unnamed: 0_level_0,index,0,0,0,0,1,1,1,1,2,2,...,3,3,4,4,4,4,5,5,5,5
Unnamed: 0_level_1,Unnamed: 1_level_1,bs_lat,bs_lng,nseq,rssi,bs_lat,bs_lng,nseq,rssi,bs_lat,bs_lng,...,nseq,rssi,bs_lat,bs_lng,nseq,rssi,bs_lat,bs_lng,nseq,rssi
messid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
573bf1d9864fce1a9af8c5c9,0,39.677251,-104.952721,2.0,-125.0,39.617794,-104.954917,0.5,-121.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
573bf3533e952e19126b256a,0,39.612745,-105.008827,1.0,-134.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
573c0cd0f0fe6e735a699b93,0,39.797969,-105.07346,2.0,-132.0,39.723151,-104.956216,1.0,-120.0,39.732045,-104.973651,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
573c1272f0fe6e735a6cb8bd,0,39.495225,-105.053109,1.0,-133.0,39.612745,-105.008827,1.0,-129.666667,39.654682,-105.043685,...,1.0,-123.333333,0.0,0.0,0.0,0.0,0,0,0,0
573c8ea8864fce1a9a5fbf7a,0,39.771915,-105.009416,1.0,-114.0,39.75539,-105.056406,1.0,-108.666667,39.781464,-105.040763,...,1.0,-106.666667,39.755019,-105.043315,1.0,-98.0,0,0,0,0


In [14]:
# Let's begin with Denver only!
def interpolate_at(df, new_idxs):
    new_idxs = pd.Index(new_idxs)
    df = df.reindex(df.index.append(new_idxs).unique())    
    df = df.sort_index()
    df = df.interpolate()
    return df.loc[new_idxs]



temp_datetime = interpolate_at(temp_denver, pd.concat([df_mess_train.datetime,df_mess_test.datetime]).unique()).reset_index().rename(columns={"index":"datetime"})
humidity_datetime = interpolate_at(humidity_denver, pd.concat([df_mess_train.datetime,df_mess_test.datetime]).unique()).reset_index().rename(columns={"index":"datetime"})


In [15]:
humidity_datetime

Unnamed: 0,datetime,Denver
0,2016-05-18 04:38:49,91.000000
1,2016-05-18 04:45:07,89.000000
2,2016-05-18 06:33:52,98.444444
3,2016-05-18 06:57:54,99.222222
4,2016-05-18 15:47:52,64.750000
...,...,...
11355,2016-12-07 17:53:44,77.285714
11356,2016-12-07 17:53:58,77.214286
11357,2016-12-07 17:58:33,77.142857
11358,2016-12-07 19:06:13,72.000000


In [16]:
df_train_group.reset_index()

index,messid,level_1,0,0,0,0,1,1,1,1,...,3,3,4,4,4,4,5,5,5,5
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,bs_lat,bs_lng,nseq,rssi,bs_lat,bs_lng,nseq,rssi,...,nseq,rssi,bs_lat,bs_lng,nseq,rssi,bs_lat,bs_lng,nseq,rssi
0,573bf1d9864fce1a9af8c5c9,0,39.677251,-104.952721,2.0,-125.0,39.617794,-104.954917,0.5,-121.500000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0,0,0,0
1,573bf3533e952e19126b256a,0,39.612745,-105.008827,1.0,-134.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0,0,0,0
2,573c0cd0f0fe6e735a699b93,0,39.797969,-105.073460,2.0,-132.0,39.723151,-104.956216,1.0,-120.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0,0,0,0
3,573c1272f0fe6e735a6cb8bd,0,39.495225,-105.053109,1.0,-133.0,39.612745,-105.008827,1.0,-129.666667,...,1.0,-123.333333,0.000000,0.000000,0.0,0.000000,0,0,0,0
4,573c8ea8864fce1a9a5fbf7a,0,39.771915,-105.009416,1.0,-114.0,39.755390,-105.056406,1.0,-108.666667,...,1.0,-106.666667,39.755019,-105.043315,1.0,-98.000000,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6063,5848551912f14360d786ede6,0,39.757034,-104.976127,2.0,-132.0,39.777690,-105.002424,1.5,-128.000000,...,2.0,-124.000000,39.759396,-105.001415,1.0,-117.000000,0,0,0,0
6064,58485a25e541cd0e1329b8d6,0,39.706436,-105.099323,1.0,-137.0,39.677251,-104.952721,1.0,-132.333333,...,1.0,-125.500000,39.612745,-105.008827,1.0,-123.000000,0,0,0,0
6065,58485bd412f14360d78bebdb,0,64.300000,-68.500000,1.5,-130.5,64.300000,-68.500000,0.5,-122.000000,...,1.0,-121.000000,39.793585,-105.018251,1.0,-112.333333,0,0,0,0
6066,5848672e12f14360d7942374,0,39.704887,-105.016392,1.0,-129.0,39.777690,-105.002424,1.0,-128.000000,...,0.0,-121.000000,64.300000,-68.500000,1.5,-115.000000,0,0,0,0


In [17]:
#df_train_group.join()
features = df_train_group.reset_index()
features = features.set_index("messid").join(df_mess_train[["datetime","messid","did"]].set_index("messid")).drop_duplicates().reset_index().\
        merge(temp_datetime, on='datetime').rename(columns={'Denver':"Temperature"}).merge(humidity_datetime, on='datetime').rename(columns={'Denver':"Humidity"}).sort_values(by="datetime")
#.drop(columns=["datetime"])

#df_train_group['temp'] = datetime.apply(lambda x: temp_datetime.loc[x['datetime']], axis=1)
#df_test_group['temp'] = df_test_group.apply(lambda x: temp_datetime.loc[x['datetime']], axis=1)



In [18]:
#features.set_index("messid").join(df_mess_train[["datetime","messid"]].set_index("messid")).drop_duplicates()
features

Unnamed: 0,messid,"(level_1, )","(0, bs_lat)","(0, bs_lng)","(0, nseq)","(0, rssi)","(1, bs_lat)","(1, bs_lng)","(1, nseq)","(1, rssi)",...,"(4, nseq)","(4, rssi)","(5, bs_lat)","(5, bs_lng)","(5, nseq)","(5, rssi)",datetime,did,Temperature,Humidity
0,573bf1d9864fce1a9af8c5c9,0,39.677251,-104.952721,2.0,-125.0,39.617794,-104.954917,0.5,-121.500000,...,0.0,0.000000,0,0,0,0,2016-05-18 04:38:49,473335.0,279.760000,91.000000
1,573bf3533e952e19126b256a,0,39.612745,-105.008827,1.0,-134.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.000000,0,0,0,0,2016-05-18 04:45:07,473335.0,279.750000,89.000000
2,573c0cd0f0fe6e735a699b93,0,39.797969,-105.073460,2.0,-132.0,39.723151,-104.956216,1.0,-120.000000,...,0.0,0.000000,0,0,0,0,2016-05-18 06:33:52,473953.0,279.290000,98.444444
3,573c1272f0fe6e735a6cb8bd,0,39.495225,-105.053109,1.0,-133.0,39.612745,-105.008827,1.0,-129.666667,...,0.0,0.000000,0,0,0,0,2016-05-18 06:57:54,476512.0,279.260000,99.222222
4,573c8ea8864fce1a9a5fbf7a,0,39.771915,-105.009416,1.0,-114.0,39.755390,-105.056406,1.0,-108.666667,...,1.0,-98.000000,0,0,0,0,2016-05-18 15:47:52,476286.0,283.207500,64.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6063,5848551912f14360d786ede6,0,39.757034,-104.976127,2.0,-132.0,39.777690,-105.002424,1.5,-128.000000,...,1.0,-117.000000,0,0,0,0,2016-12-07 18:29:45,476207.0,261.427065,75.750000
6064,58485a25e541cd0e1329b8d6,0,39.706436,-105.099323,1.0,-137.0,39.677251,-104.952721,1.0,-132.333333,...,1.0,-123.000000,0,0,0,0,2016-12-07 18:51:17,476512.0,261.431377,74.500000
6065,58485bd412f14360d78bebdb,0,64.300000,-68.500000,1.5,-130.5,64.300000,-68.500000,0.5,-122.000000,...,1.0,-112.333333,0,0,0,0,2016-12-07 18:58:28,476207.0,261.435688,73.250000
6066,5848672e12f14360d7942374,0,39.704887,-105.016392,1.0,-129.0,39.777690,-105.002424,1.0,-128.000000,...,1.5,-115.000000,0,0,0,0,2016-12-07 19:46:54,476257.0,261.820000,72.000000


In [19]:
features.did.nunique()

113

In [20]:
# On recupere les positions 
y = pos_train.drop_duplicates().sort_values(by="messid").drop(columns=["messid"])
X = features.drop(columns=["datetime","did"]).sort_values(by="messid")

In [21]:
X

Unnamed: 0,messid,"(level_1, )","(0, bs_lat)","(0, bs_lng)","(0, nseq)","(0, rssi)","(1, bs_lat)","(1, bs_lng)","(1, nseq)","(1, rssi)",...,"(4, bs_lat)","(4, bs_lng)","(4, nseq)","(4, rssi)","(5, bs_lat)","(5, bs_lng)","(5, nseq)","(5, rssi)",Temperature,Humidity
0,573bf1d9864fce1a9af8c5c9,0,39.677251,-104.952721,2.0,-125.0,39.617794,-104.954917,0.5,-121.500000,...,0.000000,0.000000,0.0,0.000000,0,0,0,0,279.760000,91.000000
1,573bf3533e952e19126b256a,0,39.612745,-105.008827,1.0,-134.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0,0,0,0,279.750000,89.000000
2,573c0cd0f0fe6e735a699b93,0,39.797969,-105.073460,2.0,-132.0,39.723151,-104.956216,1.0,-120.000000,...,0.000000,0.000000,0.0,0.000000,0,0,0,0,279.290000,98.444444
3,573c1272f0fe6e735a6cb8bd,0,39.495225,-105.053109,1.0,-133.0,39.612745,-105.008827,1.0,-129.666667,...,0.000000,0.000000,0.0,0.000000,0,0,0,0,279.260000,99.222222
4,573c8ea8864fce1a9a5fbf7a,0,39.771915,-105.009416,1.0,-114.0,39.755390,-105.056406,1.0,-108.666667,...,39.755019,-105.043315,1.0,-98.000000,0,0,0,0,283.207500,64.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6063,5848551912f14360d786ede6,0,39.757034,-104.976127,2.0,-132.0,39.777690,-105.002424,1.5,-128.000000,...,39.759396,-105.001415,1.0,-117.000000,0,0,0,0,261.427065,75.750000
6064,58485a25e541cd0e1329b8d6,0,39.706436,-105.099323,1.0,-137.0,39.677251,-104.952721,1.0,-132.333333,...,39.612745,-105.008827,1.0,-123.000000,0,0,0,0,261.431377,74.500000
6065,58485bd412f14360d78bebdb,0,64.300000,-68.500000,1.5,-130.5,64.300000,-68.500000,0.5,-122.000000,...,39.793585,-105.018251,1.0,-112.333333,0,0,0,0,261.435688,73.250000
6066,5848672e12f14360d7942374,0,39.704887,-105.016392,1.0,-129.0,39.777690,-105.002424,1.0,-128.000000,...,64.300000,-68.500000,1.5,-115.000000,0,0,0,0,261.820000,72.000000


In [22]:
y

Unnamed: 0,lat,lng
0,39.606690,-104.958490
2,39.637741,-104.958554
3,39.730417,-104.968940
6,39.693102,-105.006995
10,39.758167,-105.051016
...,...,...
39223,39.764915,-105.003985
39228,39.658804,-105.008299
39233,39.778872,-105.019285
39243,39.773264,-105.014052


# Loi de distance

L'idée est d'avoir une fonction rssi -> distance en prenant en compte des différents parametres
Calcul de la distance par ligne dans train

Pas fait, necessaire???


In [None]:
df_mess_train.

# Prepare GBM

In [24]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=200)

In [31]:
columns = X.columns[2:]
import lightgbm

d_train = lightgbm.Dataset(X_train[columns].values, y_train.lat)
d_valid = lightgbm.Dataset(X_test[columns].values, y_test.lat)
    
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

model = lightgbm.train(params,
                       d_train,
                       valid_sets=d_valid,
                       num_boost_round=10000,
                       early_stopping_rounds=100,verbose_eval=50) 


Training until validation scores don't improve for 100 rounds
[50]	valid_0's l2: 0.0137076	valid_0's l1: 0.0333674
[100]	valid_0's l2: 0.00619611	valid_0's l1: 0.0269167
[150]	valid_0's l2: 0.00475662	valid_0's l1: 0.0253039
[200]	valid_0's l2: 0.00380858	valid_0's l1: 0.0236859
[250]	valid_0's l2: 0.00320609	valid_0's l1: 0.0228604
[300]	valid_0's l2: 0.00298153	valid_0's l1: 0.0225004
[350]	valid_0's l2: 0.00280357	valid_0's l1: 0.0221772
[400]	valid_0's l2: 0.00266732	valid_0's l1: 0.0218976
[450]	valid_0's l2: 0.00261077	valid_0's l1: 0.0218279
[500]	valid_0's l2: 0.0025039	valid_0's l1: 0.0214738
[550]	valid_0's l2: 0.00238487	valid_0's l1: 0.0213595
[600]	valid_0's l2: 0.00235586	valid_0's l1: 0.0213624
Early stopping, best iteration is:
[545]	valid_0's l2: 0.00236198	valid_0's l1: 0.0213379


In [32]:
lat_pred = model.predict(X_test[columns].values)

In [33]:
d_train = lightgbm.Dataset(X_train[columns].values, y_train.lng)
d_valid = lightgbm.Dataset(X_test[columns].values, y_test.lng)
    
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

model = lightgbm.train(params,
                       d_train,
                       valid_sets=d_valid,
                       num_boost_round=10000,
                       early_stopping_rounds=100, verbose_eval=50) 

Training until validation scores don't improve for 100 rounds
[50]	valid_0's l2: 0.00730967	valid_0's l1: 0.0271493
[100]	valid_0's l2: 0.00323204	valid_0's l1: 0.0218185
[150]	valid_0's l2: 0.00255854	valid_0's l1: 0.0206163
[200]	valid_0's l2: 0.00222863	valid_0's l1: 0.0199362
[250]	valid_0's l2: 0.00186154	valid_0's l1: 0.019116
[300]	valid_0's l2: 0.00170247	valid_0's l1: 0.0187559
[350]	valid_0's l2: 0.00159124	valid_0's l1: 0.018457
[400]	valid_0's l2: 0.00152111	valid_0's l1: 0.0183645
[450]	valid_0's l2: 0.00150534	valid_0's l1: 0.0184046
[500]	valid_0's l2: 0.00143695	valid_0's l1: 0.0182502
[550]	valid_0's l2: 0.00139495	valid_0's l1: 0.0182254
[600]	valid_0's l2: 0.00138747	valid_0's l1: 0.018202
Early stopping, best iteration is:
[515]	valid_0's l2: 0.00138343	valid_0's l1: 0.0181094


In [34]:
long_pred = model.predict(X_test[columns].values)

In [35]:
lat_pred

array([39.74113564, 39.65039904, 39.79673217, ..., 39.4093566 ,
       39.53169146, 39.77897416])

In [36]:
values = y_test
values["pred_lat"] = lat_pred
values["pred_long"] = long_pred

values['distance'] = values.apply(lambda x: geodesic([x['pred_lat'], x['pred_long']], [x['lat'], x['lng']]).meters, axis=1)
values.distance.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


count      1214.000000
mean       3045.682301
std        5507.185970
min          56.644839
25%         905.605920
50%        1823.171985
75%        3425.598534
max      127277.312070
Name: distance, dtype: float64

# Try DNN
Not Working. Not enough layers?

In [49]:


model = tf.keras.models.Sequential([  
  tf.keras.layers.Dense(64, activation='relu'), 
  tf.keras.layers.Dense(32, activation='relu'),   
  tf.keras.layers.Dense(2, activation='linear')
])


model.compile(optimizer='adam',
              loss='mse',
              metrics=['mse'])

model.fit(X_train[columns].values, y_train.values, epochs=100, batch_size=64)

Train on 4854 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100

<tensorflow.python.keras.callbacks.History at 0x1d249e57188>

In [50]:
y_pred = model.predict(X_test[columns].values)



In [51]:
values = y_test
values["pred_lat_dnn"] = y_pred[:,0]
values["pred_long_dnn"] = y_pred[:,1]
values.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,lat,lng,pred_lat,pred_long,distance,pred_lat_dnn,pred_long_dnn,distance_dnn
count,1214.0,1214.0,1214.0,1214.0,1214.0,1214.0,1214.0,1214.0
mean,39.795883,-105.068784,39.7996,-105.072136,3045.682301,40.027306,-104.953423,30506.689802
std,0.393543,0.302347,0.381591,0.294192,5507.18597,0.391183,0.605827,27584.159325
min,35.770683,-105.963263,36.223819,-106.091684,56.644839,37.570866,-107.430733,1032.150662
25%,39.705535,-105.067846,39.711644,-105.066177,905.60592,39.780395,-105.068174,14595.588879
50%,39.774541,-105.042263,39.77393,-105.041608,1823.171985,39.961327,-104.837135,22775.373748
75%,39.81534,-104.995722,39.810705,-105.0011,3425.598534,40.206509,-104.69939,39356.321595
max,40.905761,-102.094289,41.234118,-102.456585,127277.31207,41.440384,-100.074608,264215.536284


In [52]:

values['distance_dnn'] = values.apply(lambda x: geodesic([x['pred_lat_dnn'], x['pred_long_dnn']], [x['lat'], x['lng']]).meters, axis=1)
#geodesic([y_pred[:,0], y_pred[:,1]], [y_test['lat'].values, y_test['lng'].values])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [53]:
values.distance_dnn.describe()

count      1214.000000
mean      45645.041231
std       33996.261332
min        3197.533110
25%       27317.110600
50%       37854.162200
75%       55586.753599
max      376109.090026
Name: distance_dnn, dtype: float64

In [None]:
y_pred

In [None]:
def regressor_and_predict(df_feat, ground_truth_lat, ground_truth_lng, df_test):
    
    """ train regressor and make prediction in the train set
      Input: df_feat: feature matrix used to train regressor
             ground_truth_lat: df_feat associated latitude
             ground_truth_lng: df_feat associated longitude
             df_test: data frame used for prediction
      Output: y_pred_lat, y_pred_lng
    """

    X_train = np.array(df_feat);
    reg = linear_model.LinearRegression()

    reg.fit(X_train, np.c_[tan_degree(ground_truth_lat), ground_truth_lng]);
    y_pred = reg.predict(df_test) 

    return arctan_degree(y_pred[:,0]), y_pred[:,1]

In [None]:
#y_pred_lat, y_pred_lng = regressor_and_predict(df_feat, ground_truth_lat, ground_truth_lng, df_feat)

## Cross validation

In [None]:
reg = linear_model.LinearRegression()

y_pred = model_selection.cross_val_predict(reg, df_feat, np.c_[tan_degree(ground_truth_lat), ground_truth_lng], cv=5)
y_pred_lat = arctan_degree(y_pred[:,0])
y_pred_lng = y_pred[:,1]

## Evaluate result

In [None]:
def vincenty_vec(vec_coord):
    """ Now using geodesic distance instead of Vincenty """
    vin_vec_dist = np.zeros(vec_coord.shape[0])
    if vec_coord.shape[1] != 4:
        print('ERROR: Bad number of columns (shall be = 4)')
    else:
        vin_vec_dist = [geodesic(vec_coord[m, 0:2], vec_coord[m, 2:]).meters for m in range(vec_coord.shape[0])]

    return vin_vec_dist

In [None]:
# evaluate distance error for each predicted point
def eval_geoloc(y_train_lat , y_train_lng, y_pred_lat, y_pred_lng):
    vec_coord = np.array([y_train_lat , y_train_lng, y_pred_lat, y_pred_lng])
    err_vec = vincenty_vec(np.transpose(vec_coord))
    
    return err_vec

Remove asburd values of latitude and longitude

Clip latitudes to [-90, 90]

In [None]:
err_vec = eval_geoloc(ground_truth_lat, ground_truth_lng, y_pred_lat, y_pred_lng)

## Plot error distribution

In [None]:
def plotError(err_vec):
    
    print(f"error @ 80% = {np.percentile(err_vec, 80):.1f} m")
    
    values, base = np.histogram(err_vec, bins=50000)
    cumulative = np.cumsum(values) 

    plt.figure()
    plt.plot(base[:-1]/1000, cumulative / np.float(np.sum(values))  * 100.0,
             label="Opt LLR", c='blue')

    # plot error @ 80%
    plt.axvline(x=np.percentile(err_vec, 80)/1000., ymin=0, ymax=100,
                linestyle='dashed', color='red')

    plt.xlabel('Distance Error (km)')
    plt.ylabel('Cum proba (%)')
    plt.axis([0, 30, 0, 100]) 

    plt.title('Error Cumulative Probability')
    plt.legend()

    plt.grid()

In [None]:
plotError(err_vec)

#### Analysis

In [None]:
metrics.mean_squared_error(ground_truth_lat, y_pred_lat), \
metrics.mean_squared_error(ground_truth_lng, y_pred_lng)

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(15, 9))
axes[0].hist(ground_truth_lat, label='ref', bins=range(-91, 92), density=True)
axes[0].hist(y_pred_lat, label='pred', alpha=0.7, bins=range(-91, 92), density=True)
axes[0].set_title('Latitude histo')
axes[0].grid()
axes[1].hist(ground_truth_lng, label='ref', bins=range(-181, 182), density=True)
axes[1].hist(y_pred_lng, label='pred', alpha=0.7, bins=range(-181, 182), density=True)
axes[1].set_title('Longitude histo')
axes[1].grid()

In [None]:
plausible_lat = (y_pred_lat > -80) & (y_pred_lat < 80)

metrics.mean_squared_error(ground_truth_lat[plausible_lat], y_pred_lat[plausible_lat]), plausible_lat.mean()

In [None]:
plausible_lng = (y_pred_lng > -180) & (y_pred_lng < 180)

metrics.mean_squared_error(ground_truth_lng[plausible_lng], y_pred_lng[plausible_lng]), plausible_lng.mean()

In [None]:
(y_pred_lat < -85).sum(), (y_pred_lng < -180).sum(), (y_pred_lat > 85).sum(), (y_pred_lng > 180).sum()

In [None]:
(plausible_lat ^ plausible_lng).sum(), (plausible_lat & plausible_lng).sum() # XOR

Outliers are the same on latitude and longitude

Let's compute the error without the outliers:

In [None]:
err_vec_plausible = eval_geoloc(ground_truth_lat[plausible_lat], ground_truth_lng[plausible_lat], 
                            y_pred_lat[plausible_lat], y_pred_lng[plausible_lat])
plotError(err_vec_plausible)

## Construct test prediction

In [None]:
df_mess_test.head()

In [None]:
df_feat_test = feat_mat_const(df_mess_test, listOfBs)
df_feat.shape, df_feat_test.shape

In [None]:
y_pred_test_lat, y_pred_test_lng = regressor_and_predict(df_feat, ground_truth_lat, ground_truth_lng, df_feat_test)

In [None]:
test_res = pd.DataFrame(np.array([y_pred_test_lat, y_pred_test_lng]).T, columns = ['lat', 'lng'])
test_res = pd.concat([df_mess_test['messid'], test_res], axis=1)

In [None]:
test_res.head()

In [None]:
test_res.to_csv('pred_pos_test_list.csv', index=False)