In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from geopy.distance import geodesic

from sklearn import linear_model, model_selection, metrics

## Load train and test data

In [None]:
# Training messages and the positions associated with them
df_mess_train = pd.read_csv('mess_train_list.csv')
pos_train = pd.read_csv('pos_train_list.csv')

# Test messages
df_mess_test = pd.read_csv('mess_test_list.csv')

In [None]:
# Preview the first rows of the training messages
df_mess_train.head()

In [None]:
# Size of the training message table, followed by its summary statistics
print(df_mess_train.shape)
df_mess_train.describe()

In [None]:
# Preview the first rows of the training positions
pos_train.head()

In [None]:
# Summary statistics of the training positions
pos_train.describe()

## Prepare data

In [None]:
# Determine all base stations that received at least one message
trainBs = np.unique(df_mess_train['bsid'])
testBs = np.unique(df_mess_test['bsid'])
listOfBs = np.union1d(trainBs, testBs)
# Stations that appear in the test set but not in the train set.
# Use the public np.setdiff1d: the np.lib.arraysetops namespace is private
# and no longer accessible in NumPy 2.0.
testOnlyBs = np.setdiff1d(testBs, trainBs)

print(f"Number of stations: {len(listOfBs)}, test only {len(testOnlyBs)}")

In [None]:
# Number of distinct values in the 'did' column (presumably device ids — TODO confirm)
df_mess_train['did'].unique().shape

In [None]:
def tan_degree(x):
    """Return the tangent of an angle expressed in degrees (used for latitudes)."""
    radians = x * np.pi / 180
    return np.tan(radians)

def arctan_degree(x):
    """Return the arc tangent of ``x``, converted from radians to degrees."""
    radians = np.arctan(x)
    return radians * 180 / np.pi

In [None]:
def feat_mat_const(df, listOfBs):
    """Build the per-message feature matrix, one column group per base station.

    Each row of ``df`` is one reception of a message by one base station. The
    result has one row per ``messid`` and, for every station in ``listOfBs``,
    the features ``active``, ``bs_lng``, ``bs_tanlat``, ``nseq`` and ``rssi``
    (0 where the station has no row for that message). Several rows for the
    same (messid, bsid) pair are averaged (``pivot_table`` default).

    Parameters
    ----------
    df : pandas.DataFrame
        Message receptions; must contain ``messid``, ``bsid``, ``nseq``,
        ``rssi``, ``bs_lat`` and ``bs_lng``. It is not modified.
    listOfBs : array-like
        Every base-station id that must appear in the output columns, so that
        matrices built from different datasets share the same layout.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``messid`` with two-level columns ``(bsid, feature)``,
        sorted on the column index.
    """
    # Derive the extra features on a copy: the caller's frame stays untouched,
    # so re-running the cell (or reusing the raw frame elsewhere) is safe.
    df = df.assign(active=1, bs_tanlat=tan_degree(df['bs_lat']))

    # Stations of listOfBs that never appear here would be absent from the
    # pivoted columns; give each one an all-zero placeholder row attached to a
    # dummy message id ('-1') that is removed again before returning.
    usedBs = np.unique(df['bsid'])
    missingBs = np.setdiff1d(listOfBs, usedBs)
    if len(missingBs) > 0:
        # Built from df.columns so it does not depend on a fixed column count.
        placeholders = pd.DataFrame(0, index=range(len(missingBs)), columns=df.columns)
        placeholders['messid'] = '-1'
        placeholders['bsid'] = missingBs
        df = pd.concat([df, placeholders], ignore_index=True)

    # NOTE(review): a latitude filter (df['bs_lat'] < 50) was previously tried
    # at this point and left disabled.

    # Pivot BS to columns
    pivoted = df.pivot_table(index='messid',
                             values=['active', 'nseq', 'rssi', 'bs_tanlat', 'bs_lng'],
                             columns=['bsid'],
                             fill_value=0)
    # Put bsid on the outer column level so each station's features are contiguous
    resDf = pivoted.reorder_levels([1, 0], axis=1).sort_index(level=0, axis=1)
    # The dummy row only exists when placeholders were added
    return resDf.drop(index='-1', errors='ignore')

In [None]:
def ground_truth_const(df_mess_train, pos_train):
    """Build the per-message ground-truth position.

    ``pos_train`` is joined column-wise (on the row index) to the ``messid``
    column of ``df_mess_train``; the positions are then averaged over all rows
    sharing the same ``messid``.

    Returns the pair (latitude Series, longitude Series), both indexed by messid.
    """
    message_ids = df_mess_train[['messid']]
    positions_by_row = pd.concat([message_ids, pos_train], axis=1)
    positions_by_message = positions_by_row.groupby('messid').mean()

    latitude = positions_by_message['lat']
    longitude = positions_by_message['lng']
    return latitude, longitude

In [None]:
# Build the training feature matrix (one row per message) and preview it
df_feat = feat_mat_const(df_mess_train, listOfBs)
df_feat.head()

In [None]:
# (number of messages, number of feature columns)
df_feat.shape

In [None]:
# Per-message reference latitude / longitude; the shape gives the number of distinct messages
ground_truth_lat, ground_truth_lng = ground_truth_const(df_mess_train, pos_train)
ground_truth_lat.shape

## Make regressor and prediction using the train set

In [None]:
def regressor_and_predict(df_feat, ground_truth_lat, ground_truth_lng, df_test):
    """Fit a linear regressor on the training features and predict positions.

    The model is fitted jointly on two targets: the tangent of the latitude
    (via ``tan_degree``) and the raw longitude. The predicted tangent is
    mapped back to degrees with ``arctan_degree``.

      Input: df_feat: feature matrix used to train the regressor
             ground_truth_lat: latitude associated with each row of df_feat
             ground_truth_lng: longitude associated with each row of df_feat
             df_test: feature matrix to predict on
      Output: (predicted latitude in degrees, predicted longitude)
    """
    targets = np.c_[tan_degree(ground_truth_lat), ground_truth_lng]

    model = linear_model.LinearRegression()
    model.fit(np.array(df_feat), targets)

    predictions = model.predict(df_test)
    pred_tanlat, pred_lng = predictions[:, 0], predictions[:, 1]
    return arctan_degree(pred_tanlat), pred_lng

In [None]:
#y_pred_lat, y_pred_lng = regressor_and_predict(df_feat, ground_truth_lat, ground_truth_lng, df_feat)

## Cross validation

In [None]:
# 5-fold cross-validated predictions; the targets are (tan(latitude), longitude)
reg = linear_model.LinearRegression()

y_pred = model_selection.cross_val_predict(
    reg,
    df_feat,
    np.c_[tan_degree(ground_truth_lat), ground_truth_lng],
    cv=5,
)
# First column is the predicted tangent of the latitude: map it back to degrees
y_pred_lat, y_pred_lng = arctan_degree(y_pred[:, 0]), y_pred[:, 1]

## Evaluate result

In [None]:
def vincenty_vec(vec_coord):
    """Geodesic distance, in meters, between two points for every row.

    The name is kept for backward compatibility: the distance is computed
    with ``geodesic`` rather than Vincenty.

    Parameters
    ----------
    vec_coord : array-like of shape (n, 4)
        One row per pair of points; columns 0-1 are passed to ``geodesic`` as
        the first point and columns 2-3 as the second point.

    Returns
    -------
    numpy.ndarray of shape (n,)
        Distance in meters for each row.

    Raises
    ------
    ValueError
        If ``vec_coord`` is not two-dimensional with exactly 4 columns.
        (Previously an error was printed and zeros were returned, which
        downstream code would read as perfect predictions.)
    """
    coords = np.asarray(vec_coord)
    if coords.ndim != 2 or coords.shape[1] != 4:
        raise ValueError(
            'Bad number of columns (shall be = 4), got array of shape %s' % (coords.shape,)
        )

    return np.array([geodesic(row[0:2], row[2:]).meters for row in coords], dtype=float)

In [None]:
def eval_geoloc(y_train_lat, y_train_lng, y_pred_lat, y_pred_lng):
    """Evaluate the distance error for each predicted point.

    Stacks the four coordinate vectors into one row per point
    (reference lat, reference lng, predicted lat, predicted lng) and hands
    the result to ``vincenty_vec``.
    """
    coords_by_point = np.array([y_train_lat, y_train_lng, y_pred_lat, y_pred_lng]).T
    return vincenty_vec(coords_by_point)

Remove absurd values of latitude and longitude

Clip latitudes to [-90, 90]

In [None]:
# Distance error of every cross-validated prediction against its reference position
err_vec = eval_geoloc(ground_truth_lat, ground_truth_lng, y_pred_lat, y_pred_lng)

## Plot error distribution

In [None]:
def plotError(err_vec, bins=50000, label="Opt LLR"):
    """Print the 80th-percentile error and plot the cumulative error distribution.

    Parameters
    ----------
    err_vec : array-like
        Distance errors in meters, one per prediction.
    bins : int, optional
        Number of histogram bins used to build the cumulative curve.
    label : str, optional
        Legend label of the curve.
    """
    # Computed once and reused for both the printout and the marker line
    err_p80 = np.percentile(err_vec, 80)
    print(f"error @ 80% = {err_p80:.1f} m")

    values, base = np.histogram(err_vec, bins=bins)
    cumulative = np.cumsum(values)

    plt.figure()
    # Builtin float: the `np.float` alias no longer exists in recent NumPy releases
    plt.plot(base[:-1] / 1000, cumulative / float(np.sum(values)) * 100.0,
             label=label, c='blue')

    # plot error @ 80%; ymin/ymax are axes fractions (0..1), not data values
    plt.axvline(x=err_p80 / 1000., ymin=0, ymax=1,
                linestyle='dashed', color='red')

    plt.xlabel('Distance Error (km)')
    plt.ylabel('Cum proba (%)')
    plt.axis([0, 30, 0, 100])

    plt.title('Error Cumulative Probability')
    plt.legend()

    plt.grid()

In [None]:
# Cumulative error curve for all cross-validated predictions
plotError(err_vec)

#### Analysis

In [None]:
# Mean squared error of the predictions, in squared degrees: (latitude, longitude)
metrics.mean_squared_error(ground_truth_lat, y_pred_lat), \
metrics.mean_squared_error(ground_truth_lng, y_pred_lng)

In [None]:
# Reference vs predicted distributions, one-degree bins, normalised to densities
fig, axes = plt.subplots(2, 1, figsize=(15, 9))
axes[0].hist(ground_truth_lat, label='ref', bins=range(-91, 92), density=True)
axes[0].hist(y_pred_lat, label='pred', alpha=0.7, bins=range(-91, 92), density=True)
axes[0].set_title('Latitude histo')
axes[0].legend()  # the 'ref' / 'pred' labels are only shown once a legend is drawn
axes[0].grid()
axes[1].hist(ground_truth_lng, label='ref', bins=range(-181, 182), density=True)
axes[1].hist(y_pred_lng, label='pred', alpha=0.7, bins=range(-181, 182), density=True)
axes[1].set_title('Longitude histo')
axes[1].legend()
axes[1].grid()

In [None]:
# Mask of predictions whose latitude lies strictly inside (-80, 80)
plausible_lat = (y_pred_lat > -80) & (y_pred_lat < 80)

# (latitude MSE restricted to the mask, fraction of predictions kept by the mask)
metrics.mean_squared_error(ground_truth_lat[plausible_lat], y_pred_lat[plausible_lat]), plausible_lat.mean()

In [None]:
# Mask of predictions whose longitude lies strictly inside (-180, 180)
plausible_lng = (y_pred_lng > -180) & (y_pred_lng < 180)

# (longitude MSE restricted to the mask, fraction of predictions kept by the mask)
metrics.mean_squared_error(ground_truth_lng[plausible_lng], y_pred_lng[plausible_lng]), plausible_lng.mean()

In [None]:
# Counts of out-of-range predictions: (lat < -85, lng < -180, lat > 85, lng > 180)
# NOTE(review): latitude uses +/-85 here but +/-80 in plausible_lat — confirm which threshold is intended
(y_pred_lat < -85).sum(), (y_pred_lng < -180).sum(), (y_pred_lat > 85).sum(), (y_pred_lng > 180).sum()

In [None]:
# First value: predictions where exactly one of the two masks holds (XOR);
# second value: predictions where both masks hold (AND)
(plausible_lat ^ plausible_lng).sum(), (plausible_lat & plausible_lng).sum() # XOR

Outliers are the same on latitude and longitude

Let's compute the error without the outliers:

In [None]:
# Distance error restricted to the predictions kept by plausible_lat
# (the same latitude mask is applied to all four vectors)
err_vec_plausible = eval_geoloc(
    ground_truth_lat[plausible_lat],
    ground_truth_lng[plausible_lat],
    y_pred_lat[plausible_lat],
    y_pred_lng[plausible_lat],
)
plotError(err_vec_plausible)

## Construct test prediction

In [None]:
# Preview the first rows of the test messages
df_mess_test.head()

In [None]:
# Same feature construction for the test messages (same listOfBs as for the train matrix);
# the two shapes are displayed side by side
df_feat_test = feat_mat_const(df_mess_test, listOfBs)
df_feat.shape, df_feat_test.shape

In [None]:
# Fit on the full training feature matrix and predict positions for the test features
y_pred_test_lat, y_pred_test_lng = regressor_and_predict(df_feat, ground_truth_lat, ground_truth_lng, df_feat_test)

In [None]:
# Predictions come out one per row of df_feat_test, i.e. one per unique messid
# (its index). Take the ids from that index: df_mess_test['messid'] has one row
# per reception, so a positional concat would misalign ids and predictions and
# pad the extra rows with NaN.
test_res = pd.DataFrame({
    'messid': df_feat_test.index,
    'lat': y_pred_test_lat,
    'lng': y_pred_test_lng,
})

In [None]:
# Preview the first rows of the test predictions
test_res.head()

In [None]:
# Write the test predictions to CSV, without the DataFrame index
test_res.to_csv('pred_pos_test_list.csv', index=False)