In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from geopy.distance import geodesic

from sklearn import linear_model, model_selection, metrics, preprocessing

# Load train and test data

In [2]:
# train set
df_mess_train = pd.read_csv('mess_train_list.csv')

# test set
df_mess_test = pd.read_csv('mess_test_list.csv')

# position associated to train set
pos_train = pd.read_csv('pos_train_list.csv') 

In [3]:
df_mess_train.head()

Unnamed: 0,messid,bsid,did,nseq,rssi,time_ux,bs_lat,bs_lng
0,573bf1d9864fce1a9af8c5c9,2841,473335.0,0.5,-121.5,1463546000000.0,39.617794,-104.954917
1,573bf1d9864fce1a9af8c5c9,3526,473335.0,2.0,-125.0,1463546000000.0,39.677251,-104.952721
2,573bf3533e952e19126b256a,2605,473335.0,1.0,-134.0,1463547000000.0,39.612745,-105.008827
3,573c0cd0f0fe6e735a699b93,2610,473953.0,2.0,-132.0,1463553000000.0,39.797969,-105.07346
4,573c0cd0f0fe6e735a699b93,3574,473953.0,1.0,-120.0,1463553000000.0,39.723151,-104.956216


# Prepare data

In [4]:
# pip install utm or conda install _c conda-forge utm
import utm


def latlon_to_xy(lat, lon):

    x, y, utm_zone, utm_letter = utm.from_latlon(lat, lon)

    return x, y, utm_zone, utm_letter


def xy_to_latlon(x, y, utm_zone, utm_letter):

    lat, lon = utm.from_latlon(x, y, utm_zone, utm_letter)

    return lat, lon


pos_train[['x', 'y', 'utm_zone', 'utm_letter']] = pos_train.apply(lambda row: pd.Series(latlon_to_xy(row['lat'], row['lng'])),
                                                                  axis=1)

df_mess_train[['bs_x', 'bs_y', 'bs_utm_zone', 'bs_utm_letter']] = df_mess_train.apply(lambda row: pd.Series(latlon_to_xy(row['bs_lat'], row['bs_lng'])),
                                                                                      axis=1)

df_mess_test[['bs_x', 'bs_y', 'bs_utm_zone', 'bs_utm_letter']] = df_mess_test.apply(lambda row: pd.Series(latlon_to_xy(row['bs_lat'], row['bs_lng'])),
                                                                                    axis=1)

In [8]:
# determine all Base stations that received at least 1 message
trainBs  = np.unique(df_mess_train['bsid'])
testBs   = np.unique(df_mess_test['bsid'])
listOfBs = np.union1d(trainBs, testBs) 
testOnlyBs = np.lib.arraysetops.setdiff1d(testBs, trainBs)

print(f"Number of stations: {len(listOfBs):d}, test only {len(testOnlyBs):d}")

Number of stations: 259, test only 8


In [9]:
df_mess_train['did'].unique().shape

(113,)

## Filtering on outliers?


In [10]:
print(f"Nombres de messages : {len(df_mess_train.messid.unique())}");
print(f"Nombres de messages sans les stations du groenland : {len(df_mess_train[df_mess_train.bs_lat<60].messid.unique())}");

Nombres de messages : 6068
Nombres de messages sans les stations du groenland : 5918


In [None]:
def feat_mat_const2(df, listOfBs, keepMax=5):
    """ Feature Matrix construction """
    
    aggCols = ['pivot_lat', 'pivot_lng']
    for i in range(keepMax):
        bsCols =['bs%d_deltalat' % i, 'bs%d_deltalng' % i, 'bs%d_rssi' % i] #'bs%d_active' % i, 
        aggCols = aggCols + bsCols
        
    def aggregateBaseStations(groupBy):
        """ From a RSSI sorted DataFrameGroupBy
            create a dataframe with the 3 best BS 
        """
        
        bsSet = groupBy[groupBy["bs_lat"]<60].iloc[:keepMax]
        
        if(np.sum(groupBy["bs_lat"]<60) <np.sum(groupBy["bs_lat"]>60)):
            bsSet = groupBy[groupBy["bs_lat"]>60].iloc[:keepMax]        
        
        # Barycentre
        w = bsSet['rssi'] / np.sum(bsSet['rssi'])
        lat = np.average(bsSet['bs_lat'], weights=w)
        lng = np.average(bsSet['bs_lng'], weights=w)
        bss = []
        for i in range(keepMax):
            if len(bsSet) > i:
                b = bsSet.iloc[0]
                bss.append([b['bs_lat'] - lat, b['bs_lng'] - lng, b['rssi']])
            else:
                bss.append([0, 0, -1e3])
        return pd.DataFrame(np.concatenate([[lat, lng], np.array(bss).ravel()]).reshape(1, -1), 
                            columns=aggCols)
            
    
    # Keep at max keepMax base-stations per message
    df = df.groupby('messid'). \
        apply(lambda x: x.sort_values(['rssi'], ascending=False)). \
        reset_index(drop=True).groupby('messid').apply(aggregateBaseStations)
    
    return df

In [29]:
def feat_mat_const3(df, listOfBs, keepMax=5):
    """ Feature Matrix construction """
    
    aggCols = ['pivot_lat', 'pivot_lng']
    for i in range(keepMax):
        bsCols =['bs%d_deltay' % i, 'bs%d_deltax' % i, 'bs%d_rssi' % i] #'bs%d_active' % i, 
        aggCols = aggCols + bsCols
        
    def aggregateBaseStations(groupBy):
        """ From a RSSI sorted DataFrameGroupBy
            create a dataframe with the 3 best BS 
        """
        
        
        #bsSet = groupBy[groupBy["bs_lat"]<60].iloc[:keepMax]
        
        #if(np.sum(groupBy["bs_lat"]<60) <np.sum(groupBy["bs_lat"]>60)):
        #    bsSet = groupBy[groupBy["bs_lat"]>60].iloc[:keepMax]    
        
        
        zone = groupBy["bs_utm_zone"].value_counts(ascending=False).index[0]
        bsSet = groupBy[groupBy["bs_utm_zone"] ==zone].iloc[:keepMax]
             
        
        # Barycentre
        w = np.exp(bsSet['rssi']) / np.sum(np.exp(bsSet['rssi']))
        
        x = np.exp(np.sum(w * np.log(bsSet['bs_x'])) / np.sum(w))
        y = np.exp(np.sum(w * np.log(bsSet['bs_y'])) / np.sum(w))
        #lat = np.exp(np.sum(w * np.log(bsSet['bs_lat'])) / np.sum(w))
        #lng = np.exp(np.sum(w * np.log(bsSet['bs_lng'])) / np.sum(w))
        
        #lat = np.average(bsSet['bs_lat'], weights=w)
        #lng = np.average(bsSet['bs_lng'], weights=w)
        
        bss = []
        for i in range(keepMax):
            if len(bsSet) > i:
                b = bsSet.iloc[0]
                
                dx = b['bs_x'] - x
                dy = b['bs_y'] - y
                
 #               if b['bs_lat'] > lat:
#                   dx = geodesic([b['bs_lat'], lng], [lat, lng]).meters
#                else:
#                    dx = -geodesic([b['bs_lat'], lng], [lat, lng]).meters
#                
#                if b['bs_lng'] > lng:
#                    dy = geodesic([lat, b['bs_lng']], [lat, lng]).meters
#                else:
#                    dy = -geodesic([lat, b['bs_lng']], [lat, lng]).meters
        
                #bss.append([b['bs_lat'] - lat, b['bs_lng'] - lng, b['rssi']])
                bss.append([dy, dx, b['rssi']])
            else:
                bss.append([0, 0, -1e3])
        return pd.DataFrame(np.concatenate([[y, x], np.array(bss).ravel()]).reshape(1, -1), 
                            columns=aggCols)
            
    
    # Keep at max keepMax base-stations per message
    df = df.groupby('messid'). \
        apply(lambda x: x.sort_values(['rssi'], ascending=False)). \
        reset_index(drop=True).groupby('messid').apply(aggregateBaseStations)
    
    return df

In [30]:
def ground_truth_const(df_mess_train, pos_train):
    """ Ground truth construction """
    
    df = pd.concat([df_mess_train[['messid']], pos_train], axis=1)
    df_mean = df.groupby('messid').mean()

    return df_mean['x'], df_mean['y']

In [31]:
df_feat = feat_mat_const3(df_mess_train, listOfBs, 5)
df_feat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pivot_lat,pivot_lng,bs0_deltay,bs0_deltax,bs0_rssi,bs1_deltay,bs1_deltax,bs1_rssi,bs2_deltay,bs2_deltax,bs2_rssi,bs3_deltay,bs3_deltax,bs3_rssi,bs4_deltay,bs4_deltax,bs4_rssi
messid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
573bf1d9864fce1a9af8c5c9,0,4385532.0,503875.059261,-193.2865,-5.423667,-121.5,-193.28649,-5.423667,-121.5,0.0,0.0,-1000.0,0.0,0.0,-1000.0,0.0,0.0,-1000.0
573bf3533e952e19126b256a,0,4384777.0,499242.32701,3.72529e-09,-2.328306e-10,-134.0,0.0,0.0,-1000.0,0.0,0.0,-1000.0,0.0,0.0,-1000.0,0.0,0.0,-1000.0
573c0cd0f0fe6e735a699b93,0,4398018.0,502257.92384,2.033077e-06,-3.075635e-06,-100.0,2e-06,-3e-06,-100.0,2e-06,-3e-06,-100.0,0.0,0.0,-1000.0,0.0,0.0,-1000.0
573c1272f0fe6e735a6cb8bd,0,4392668.0,497617.210698,2335.382,977.5454,-123.333333,2335.382479,977.54537,-123.333333,2335.382479,977.54537,-123.333333,2335.382479,977.54537,-123.333333,0.0,0.0,-1000.0
573c8ea8864fce1a9a5fbf7a,0,4400569.0,496289.728728,-0.2606393,-0.2428322,-98.0,-0.260639,-0.242832,-98.0,-0.260639,-0.242832,-98.0,-0.260639,-0.242832,-98.0,-0.260639,-0.242832,-98.0


In [15]:
df_feat.shape

(6068, 17)

In [16]:
df_mess_train.head(1)

Unnamed: 0,messid,bsid,did,nseq,rssi,time_ux,bs_lat,bs_lng,bs_x,bs_y,bs_utm_zone,bs_utm_letter
0,573bf1d9864fce1a9af8c5c9,2841,473335.0,0.5,-121.5,1463546000000.0,39.617794,-104.954917,503869.635594,4385338.0,13,S


In [17]:
ground_truth_x, ground_truth_y = ground_truth_const(df_mess_train, pos_train)
ground_truth_x.head()



messid
573bf1d9864fce1a9af8c5c9    503563.532188
573bf3533e952e19126b256a    503556.428875
573c0cd0f0fe6e735a699b93    502661.655787
573c1272f0fe6e735a6cb8bd    499400.220948
573c8ea8864fce1a9a5fbf7a    495629.964268
Name: x, dtype: float64

## Make regressor and prediction using the train set

In [None]:
def regressor_and_predict(df_feat, ground_truth_lat, ground_truth_lng, df_test, trigo):
    
    """ train regressor and make prediction in the train set
      Input: df_feat: feature matrix used to train regressor
             ground_truth_lat: df_feat associated latitude
             ground_truth_lng: df_feat associated longitude
             df_test: data frame used for prediction
      Output: y_pred_lat, y_pred_lng
    """

    X_train = np.array(df_feat);
    reg = linear_model.LinearRegression()

    if trigo:
        ground_truth_tanlat = tan_degree(ground_truth_lat)
        ground_truth_lng_r, ground_truth_lng_i = deg_to_complex(ground_truth_lng)
        ground_truth = np.c_[ground_truth_tanlat, ground_truth_lng_r, ground_truth_lng_i]
        reg.fit(X_train, ground_truth);
        y_pred = reg.predict(df_test) 
        return arctan_degree(y_pred[:,0]), complex_to_deg(y_pred[:,1:]), reg

    else:
        ground_truth = np.c_[ground_truth_lat, ground_truth_lng]
        reg.fit(X_train, ground_truth);
        y_pred = reg.predict(df_test) 
        return y_pred[:,0], y_pred[:,1], reg

In [None]:
y_pred_lat, y_pred_lng, reg = regressor_and_predict(df_feat, 
                                        ground_truth_lat, ground_truth_lng, df_feat, False)

In [None]:
reg.coef_, reg.intercept_

## Cross validation

In [None]:
def cross_val_fit_predict(df_feat, ground_truth_lat, ground_truth_lng, trigo):
    reg = linear_model.LinearRegression()
    
    if trigo:
        ground_truth_lng_r, ground_truth_lng_i = deg_to_complex(ground_truth_lng)
        ground_truth = np.c_[tan_degree(ground_truth_lat), ground_truth_lng_r, ground_truth_lng_i]

        y_pred = model_selection.cross_val_predict(reg, df_feat, ground_truth, cv=5)

        return arctan_degree(y_pred[:,0]), complex_to_deg(y_pred[:,1:]), reg

    else:
        ground_truth = np.c_[ground_truth_lat, ground_truth_lng]

        y_pred = model_selection.cross_val_predict(reg, df_feat, ground_truth, cv=5)

        return y_pred[:,0], y_pred[:,1]

In [None]:
#y_pred_lat, y_pred_lng = cross_val_fit_predict(df_feat, 
# ground_truth_lat, ground_truth_lng, True)

y_pred_lat, y_pred_lng = cross_val_fit_predict(df_feat, 
                                    ground_truth_lat, ground_truth_lng, False)

## Evaluate result

In [None]:
def vincenty_vec(vec_coord):
    """ Now using geodesic distance instead of Vincenty """
    vin_vec_dist = np.zeros(vec_coord.shape[0])
    if vec_coord.shape[1] != 4:
        print('ERROR: Bad number of columns (shall be = 4)')
    else:
        vin_vec_dist = [geodesic(v[0:2], v[2:]).meters for v in vec_coord]

    return vin_vec_dist

In [None]:
# evaluate distance error for each predicted point
def eval_geoloc(y_train_lat , y_train_lng, y_pred_lat, y_pred_lng):
    vec_coord = np.array([y_train_lat , y_train_lng, y_pred_lat, y_pred_lng])
    err_vec = vincenty_vec(np.transpose(vec_coord))
    
    return err_vec

## Plot error distribution

In [18]:
def plotError(err_vec):
    
    err80 = np.percentile(err_vec, 80)
    
    print(f"error @ 80% = {err80:.1f} m")
    
    values, base = np.histogram(err_vec, bins=50000)
    cumulative = np.cumsum(values) 

    plt.figure()
    plt.plot(base[:-1]/1000, cumulative / np.float(np.sum(values))  * 100.0,
             label="Opt LLR", c='blue')

    # plot error @ 80%
    plt.axvline(x=err80/1000., ymin=0, ymax=100,
                linestyle='dashed', color='red')

    plt.xlabel('Distance Error (km)')
    plt.ylabel('Cum proba (%)')
    plt.axis([0, 30, 0, 100]) 

    plt.title('Error Cumulative Probability')
    plt.legend()

    plt.grid()

In [None]:
err_vec = eval_geoloc(ground_truth_lat, ground_truth_lng, y_pred_lat, y_pred_lng)

In [None]:
plotError(err_vec)

### Analysis

##### Mean square error

In [None]:
metrics.mean_squared_error(ground_truth_lat, y_pred_lat), \
metrics.mean_squared_error(ground_truth_lng, y_pred_lng)

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(15, 9))
latBins = np.arange(34, 45, 0.02) #range(-91, 92)
axes[0].hist(ground_truth_lat, label='ref', bins=latBins, density=True)
axes[0].hist(y_pred_lat, label='pred', alpha=0.7, bins=latBins, density=True)
axes[0].set_title('Latitude histo')
axes[0].legend()
axes[0].grid()
lngBins = np.arange(-110, -100, 0.02) #range(-181, 182)
axes[1].hist(ground_truth_lng, label='ref', bins=lngBins, density=True)
axes[1].hist(y_pred_lng, label='pred', alpha=0.7, bins=lngBins, density=True)
axes[1].set_title('Longitude histo')
axes[1].legend()
axes[1].grid()

## Study implausible outliers

May happened when regressing on latitude and longitude angles:
- Latitude lower than -90 degrees or greater than 90 degrees
- Longitude lower than -180 degrees or greater than 180 degrees (which is an indicator more than an issue since we could wrap these longitudes)

In [None]:
(y_pred_lat < -85).sum(), (y_pred_lng < -180).sum(), (y_pred_lat > 85).sum(), (y_pred_lng > 180).sum()

In [None]:
plausible_lat = (y_pred_lat > -80) & (y_pred_lat < 80)
has_implausible_lat = (plausible_lat.sum() != len(plausible_lat))
(plausible_lat.mean(),
metrics.mean_squared_error(ground_truth_lat[plausible_lat], y_pred_lat[plausible_lat]))

In [None]:
if has_implausible_lat:
    plt.scatter(ground_truth_lng[~plausible_lat], ground_truth_lat[~plausible_lat])
    plt.scatter(y_pred_lng[~plausible_lat], y_pred_lat[~plausible_lat])

In [None]:
if has_implausible_lat:
    firstInplausible = list(plausible_lat).index(False)
    firstInplausible, y_pred_lat[firstInplausible], y_pred_lng[firstInplausible]

In [None]:
if has_implausible_lat:
    df4 = df_feat.iloc[firstInplausible]
    firstImplausibleMessid = df4.name
    df4 = df4.unstack()
    df4[df4['active'] == 1]

In [None]:
plausible_lng = (y_pred_lng > -180) & (y_pred_lng < 180)

(plausible_lng.mean(),
metrics.mean_squared_error(ground_truth_lng[plausible_lng], y_pred_lng[plausible_lng]) )

In [None]:
(plausible_lat ^ plausible_lng).sum(), (plausible_lat & plausible_lng).sum() # XOR

Outliers are the same on latitude and longitude

Let's compute the error without the outliers:

In [None]:
if has_implausible_lat:
    err_vec_plausible = eval_geoloc(ground_truth_lat[plausible_lat], ground_truth_lng[plausible_lat], 
                                y_pred_lat[plausible_lat], y_pred_lng[plausible_lat])
    plotError(err_vec_plausible)

# Deep neural Network

In [None]:
from tensorflow.keras import activations, datasets, layers, losses, metrics, models, backend, regularizers
import tensorview as tv
import tensorflow as tf

In [None]:
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(df_feat.values, 
                                                                np.c_[ground_truth_lat.values, ground_truth_lng.values], 
                                                                test_size=0.1)

Normalize data to get proper network optimization

In [None]:
scaleX = preprocessing.StandardScaler()
scaleX.fit(xtrain)
xtrain = scaleX.transform(xtrain)
xtest = scaleX.transform(xtest)

scaleY = preprocessing.StandardScaler()
scaleY.fit(ytrain)
ytrain = scaleY.transform(ytrain)
# NO ytest = scaleY.transform(ytest)

In [None]:
metricNames = ['Loss']

In [None]:
model1 = models.Sequential([
    layers.Dense(64, name='dense_1', activation=activations.relu, input_shape=[df_feat.shape[1]]),
    layers.Dropout(0.01),
    layers.Dense(32, name='dense_2', activation=activations.relu),
    layers.Dropout(0.01),
    layers.Dense(2, name='dense_3', activation=activations.linear),
])

model1.compile(optimizer='adam',
          loss=losses.MeanSquaredError())
    
model1.summary()

In [None]:
nEpochs = 128
batchSize = 64

tvPlot = tv.train.PlotMetricsOnEpoch(metrics_name=metricNames,
                                      cell_size=(6,4), columns=2, iter_num=nEpochs, wait_num=1)

history1 = model1.fit(xtrain, ytrain,
            epochs=nEpochs, batch_size=batchSize, 
            validation_split=0.1, 
            verbose=0,
            callbacks=[tvPlot]);

In [None]:
weights1 = model1.get_weights()
plt.hist(weights1[0].ravel(), bins=30);

In [None]:
yEst = model1.predict(xtest)
yEst = scaleY.inverse_transform(yEst)

In [None]:
metrics.mean_squared_error(ytest[:,0], yEst[:,0]), \
metrics.mean_squared_error(ytest[:,1], yEst[:,1])

In [None]:
dnnErr_vec = eval_geoloc(ytest[:,0], ytest[:,1], yEst[:,0].reshape(-1), yEst[:,1].reshape(-1))
plotError(dnnErr_vec)

# LightGBM

In [32]:
train_data = df_feat[df_feat.columns[2:]]
train_data

Unnamed: 0_level_0,Unnamed: 1_level_0,bs0_deltay,bs0_deltax,bs0_rssi,bs1_deltay,bs1_deltax,bs1_rssi,bs2_deltay,bs2_deltax,bs2_rssi,bs3_deltay,bs3_deltax,bs3_rssi,bs4_deltay,bs4_deltax,bs4_rssi
messid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
573bf1d9864fce1a9af8c5c9,0,-1.932865e+02,-5.423667e+00,-121.500000,-193.286490,-5.423667,-121.500000,0.000000,0.000000,-1000.000000,0.000000,0.000000,-1000.000000,0.000000,0.000000,-1000.000000
573bf3533e952e19126b256a,0,3.725290e-09,-2.328306e-10,-134.000000,0.000000,0.000000,-1000.000000,0.000000,0.000000,-1000.000000,0.000000,0.000000,-1000.000000,0.000000,0.000000,-1000.000000
573c0cd0f0fe6e735a699b93,0,2.033077e-06,-3.075635e-06,-100.000000,0.000002,-0.000003,-100.000000,0.000002,-0.000003,-100.000000,0.000000,0.000000,-1000.000000,0.000000,0.000000,-1000.000000
573c1272f0fe6e735a6cb8bd,0,2.335382e+03,9.775454e+02,-123.333333,2335.382479,977.545370,-123.333333,2335.382479,977.545370,-123.333333,2335.382479,977.545370,-123.333333,0.000000,0.000000,-1000.000000
573c8ea8864fce1a9a5fbf7a,0,-2.606393e-01,-2.428322e-01,-98.000000,-0.260639,-0.242832,-98.000000,-0.260639,-0.242832,-98.000000,-0.260639,-0.242832,-98.000000,-0.260639,-0.242832,-98.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5848551912f14360d786ede6,0,-1.299501e+00,6.254474e-01,-117.000000,-1.299501,0.625447,-117.000000,-1.299501,0.625447,-117.000000,-1.299501,0.625447,-117.000000,0.000000,0.000000,-1000.000000
58485a25e541cd0e1329b8d6,0,-7.822877e+02,5.129536e+01,-123.000000,-782.287703,51.295363,-123.000000,-782.287703,51.295363,-123.000000,-782.287703,51.295363,-123.000000,-782.287703,51.295363,-123.000000
58485bd412f14360d78bebdb,0,1.623202e-05,-5.188223e-06,-112.333333,0.000016,-0.000005,-112.333333,0.000016,-0.000005,-112.333333,0.000016,-0.000005,-112.333333,0.000016,-0.000005,-112.333333
5848672e12f14360d7942374,0,2.148508e+00,-1.597670e+01,-121.000000,2.148508,-15.976701,-121.000000,2.148508,-15.976701,-121.000000,2.148508,-15.976701,-121.000000,0.000000,0.000000,-1000.000000


In [33]:
import scipy
labels = np.c_[ground_truth_y.values - df_feat.pivot_lat.values, ground_truth_x.values - df_feat.pivot_lng.values]
scipy.stats.describe(labels)

DescribeResult(nobs=6068, minmax=(array([-2750103.7403647 ,  -107265.55916399]), array([73271.88635453, 23766.47414421])), mean=array([-95588.59627333,  -1367.06725658]), variance=array([2.42788132e+11, 1.14252448e+08]), skewness=array([-5.13038439, -7.34499613]), kurtosis=array([24.45131835, 65.05185149]))

In [22]:
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(train_data, 
                                                               labels, 
                                                                test_size=0.1)

In [26]:
import lightgbm

lat_train = lightgbm.Dataset(xtrain, ytrain[:,0])
lat_valid = lightgbm.Dataset(xtest, ytest[:,0])
    
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 250,
    'learning_rate': 0.5,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

model_lat = lightgbm.train(params,
                       lat_train,
                       valid_sets=lat_valid,
                       num_boost_round=10000,
                       early_stopping_rounds=100,verbose_eval=True) 

[1]	valid_0's l2: 1.63994e+11	valid_0's l1: 176070
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 9.50914e+10	valid_0's l1: 116057
[3]	valid_0's l2: 7.9342e+10	valid_0's l1: 84384.4
[4]	valid_0's l2: 5.18476e+10	valid_0's l1: 59085.4
[5]	valid_0's l2: 4.20991e+10	valid_0's l1: 48322.4
[6]	valid_0's l2: 3.39816e+10	valid_0's l1: 41771.9
[7]	valid_0's l2: 3.3084e+10	valid_0's l1: 42312.8
[8]	valid_0's l2: 3.10009e+10	valid_0's l1: 41537.6
[9]	valid_0's l2: 3.08155e+10	valid_0's l1: 42552.3
[10]	valid_0's l2: 2.92184e+10	valid_0's l1: 41761.2
[11]	valid_0's l2: 2.97866e+10	valid_0's l1: 42335.5
[12]	valid_0's l2: 2.95339e+10	valid_0's l1: 43415.1
[13]	valid_0's l2: 2.78979e+10	valid_0's l1: 42080
[14]	valid_0's l2: 2.67047e+10	valid_0's l1: 42328.1
[15]	valid_0's l2: 2.64796e+10	valid_0's l1: 41751.3
[16]	valid_0's l2: 2.6473e+10	valid_0's l1: 42667.8
[17]	valid_0's l2: 2.57926e+10	valid_0's l1: 42402
[18]	valid_0's l2: 2.56084e+10	valid_0's l1: 42471.6
[1

[170]	valid_0's l2: 1.7999e+10	valid_0's l1: 40163.5
[171]	valid_0's l2: 1.80006e+10	valid_0's l1: 40124.8
[172]	valid_0's l2: 1.80264e+10	valid_0's l1: 40137.7
[173]	valid_0's l2: 1.79809e+10	valid_0's l1: 40228.1
[174]	valid_0's l2: 1.79904e+10	valid_0's l1: 40118
[175]	valid_0's l2: 1.80309e+10	valid_0's l1: 40252.5
[176]	valid_0's l2: 1.80359e+10	valid_0's l1: 40293.9
[177]	valid_0's l2: 1.80791e+10	valid_0's l1: 40275.6
[178]	valid_0's l2: 1.79862e+10	valid_0's l1: 40293.6
[179]	valid_0's l2: 1.79185e+10	valid_0's l1: 40197.4
[180]	valid_0's l2: 1.79067e+10	valid_0's l1: 40127.9
[181]	valid_0's l2: 1.79702e+10	valid_0's l1: 40167.7
[182]	valid_0's l2: 1.79851e+10	valid_0's l1: 40215
[183]	valid_0's l2: 1.80065e+10	valid_0's l1: 40204.8
[184]	valid_0's l2: 1.79932e+10	valid_0's l1: 40194.6
[185]	valid_0's l2: 1.80439e+10	valid_0's l1: 40248.2
[186]	valid_0's l2: 1.80373e+10	valid_0's l1: 40194.7
[187]	valid_0's l2: 1.80459e+10	valid_0's l1: 40184.9
[188]	valid_0's l2: 1.79858e+10	v

In [None]:
lng_train = lightgbm.Dataset(xtrain, ytrain[:, 1])
lng_valid = lightgbm.Dataset(xtest, ytest[:, 1])

model_lng = lightgbm.train(params,
                           lng_train,
                           valid_sets=lng_valid,
                           num_boost_round=1000,
                           early_stopping_rounds=100, verbose_eval=False)

In [None]:
lat_pred = model_lat.predict(xtest)
lng_pred = model_lng.predict(xtest)

delta_y = lat_pred - ytest[:,0]
delta_x = lng_pred - ytest[:,1]
#dnnErr_vec = eval_geoloc(ytest[:,0], ytest[:,1], lat_pred, lng_pred)*

dnnErr_vec = np.sqrt(delta_x**2 + delta_y**2)

plotError(dnnErr_vec)

## Adding extra values

In [None]:
df_mess_train["datetime"]=pd.to_datetime(df_mess_train.time_ux, unit='ms')
df_mess_test["datetime"]=pd.to_datetime(df_mess_test.time_ux, unit='ms')

In [None]:
humidity = pd.read_csv('humidity.csv')
humidity.datetime = pd.to_datetime(humidity.datetime)
humidity_denver = humidity[['datetime',"Denver"]]
humidity_denver.set_index("datetime", inplace=True)
temperature = pd.read_csv('temperature.csv')
temperature.datetime = pd.to_datetime(temperature.datetime)
temp_denver = temperature[['datetime',"Denver"]]
temp_denver.set_index("datetime", inplace=True)
datetime = df_mess_train.datetime
datetime.drop_duplicates(inplace=True)
city_attributes = pd.read_csv("city_attributes.csv")

In [None]:
# Let's begin with Denver only!
def interpolate_at(df, new_idxs):
    new_idxs = pd.Index(new_idxs)
    df = df.reindex(df.index.append(new_idxs).unique())    
    df = df.sort_index()
    df = df.interpolate()
    return df.loc[new_idxs]



temp_datetime = interpolate_at(temp_denver, pd.concat([df_mess_train.datetime,df_mess_test.datetime]).unique()).reset_index().rename(columns={"index":"datetime"})
humidity_datetime = interpolate_at(humidity_denver, pd.concat([df_mess_train.datetime,df_mess_test.datetime]).unique()).reset_index().rename(columns={"index":"datetime"})

In [None]:
df_feat_2 = df_feat.reset_index().set_index("messid").join(df_mess_train[["messid", "datetime"]].set_index("messid")).drop_duplicates().reset_index().\
merge(temp_datetime, on='datetime').rename(columns={'Denver':"Temperature"}).\
merge(humidity_datetime, on='datetime').rename(columns={'Denver':"Humidity"}).drop(columns=["datetime"]).set_index("messid")


In [None]:
df_feat_2[df_feat_2.pivot_lat>60]["Temperature"] = 273.15
df_feat_2[df_feat_2.pivot_lat>60]["Humidity"] = 0

In [None]:
df_feat_2

In [None]:
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(df_feat_2.values, 
                                                                np.c_[ground_truth_lat.values, ground_truth_lng.values], 
                                                                test_size=0.1)

In [None]:
import lightgbm

lat_train = lightgbm.Dataset(xtrain, ytrain[:,0])
lat_valid = lightgbm.Dataset(xtest, ytest[:,0])
    
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

model_lat = lightgbm.train(params,
                       lat_train,
                       valid_sets=lat_valid,
                       num_boost_round=1000,
                       early_stopping_rounds=100,verbose_eval=False) 

In [None]:

lng_train = lightgbm.Dataset(xtrain, ytrain[:,1])
lng_valid = lightgbm.Dataset(xtest, ytest[:,1])
    
model_lng = lightgbm.train(params,
                       lng_train,
                       valid_sets=lng_valid,
                       num_boost_round=1000,
                       early_stopping_rounds=100,verbose_eval=False) 

In [None]:
lat_pred = model_lat.predict(xtest)
lng_pred = model_lng.predict(xtest)

dnnErr_vec = eval_geoloc(ytest[:,0], ytest[:,1], lat_pred, lng_pred)
plotError(dnnErr_vec)

## Construct test prediction

In [None]:
df_mess_test.head()

In [None]:
df_feat_test = feat_mat_const2(df_mess_test, listOfBs)
df_feat.shape, df_feat_test.shape

In [None]:
y_pred_test_lat, y_pred_test_lng, reg = regressor_and_predict(df_feat, ground_truth_lat, 
                                                    ground_truth_lng, df_feat_test, False)

In [None]:
test_res = pd.DataFrame(np.array([y_pred_test_lat, y_pred_test_lng]).T, columns = ['lat', 'lng'])
test_res = pd.concat([df_mess_test['messid'], test_res], axis=1)

In [None]:
test_res.head()

In [None]:
test_res.to_csv('pred_pos_test_list.csv', index=False)