In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from geopy.distance import vincenty
from sklearn import linear_model, model_selection

In [None]:
# Load the three input CSVs (paths are relative to the notebook's working directory):
# the train message list, the test message list, and the positions associated with the train set.
df_mess_train, df_mess_test, pos_train = (
    pd.read_csv(csv_path)
    for csv_path in ('mess_train_list.csv', 'mess_test_list.csv', 'pos_train_list.csv')
)

In [None]:
# Preview the first rows of the training message list.
df_mess_train.head()

In [None]:
# Summary statistics for the training message list.
df_mess_train.describe()

In [None]:
# Preview the first rows of the positions associated with the training messages.
pos_train.head()

In [None]:
# Summary statistics for the training positions.
pos_train.describe()

In [None]:
# All base stations that received at least one message, in either the train or the test set.
# np.union1d returns the sorted, de-duplicated union of its two inputs.
listOfBs = np.union1d(df_mess_train['bsid'], df_mess_test['bsid'])

In [None]:
# Number of distinct base stations across the train and test sets.
len(listOfBs)

In [None]:
# Count of distinct 'did' values (presumably device ids — confirm) in the training messages;
# .shape reports it as a 1-tuple.
df_mess_train['did'].unique().shape

In [None]:
# Feature Matrix construction 
def feat_mat_const(df_mess_train, listOfBs):
    """Build a per-message feature matrix from a per-reception message list.

    Each input row is one (message, base station) reception. The result has one
    row per ``messid`` and one column per (feature, base station) pair, where the
    features are ``active`` (1 if the station received the message), ``bs_lat``,
    ``bs_lng``, ``nseq`` and ``rssi``. Pairs with no reception are filled with 0.

    Parameters
    ----------
    df_mess_train : pandas.DataFrame
        Must contain the columns ``messid``, ``bsid``, ``nseq``, ``rssi``,
        ``bs_lat`` and ``bs_lng``. It is not modified.
    listOfBs : sequence of base-station ids, or None
        Base stations that define the column layout. Every (feature, station)
        pair for these ids is present in the output, in this order, so matrices
        built from the train and the test set share identical columns. Stations
        not listed here are dropped. ``None`` keeps only the stations found in
        ``df_mess_train``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``messid`` (sorted), with two-level columns (feature, bsid).
    """
    # Listed alphabetically, which is the order pivot_table emits them in.
    value_cols = ['active', 'bs_lat', 'bs_lng', 'nseq', 'rssi']

    # assign() works on a copy, so the caller's frame does not silently gain
    # an 'active' column as a side effect of building features.
    receptions = df_mess_train.assign(active=1)

    feat = receptions.pivot_table(index='messid',
                                  values=value_cols,
                                  columns=['bsid'],
                                  fill_value=0)

    if listOfBs is not None:
        # Force a fixed column layout; without this the columns depend on which
        # stations happen to appear in this particular frame.
        full_columns = pd.MultiIndex.from_product([value_cols, list(listOfBs)],
                                                  names=[None, 'bsid'])
        feat = feat.reindex(columns=full_columns, fill_value=0)

    return feat

In [None]:
# ground truth construction
def ground_truth_const(df_mess_train, pos_train):
    """Return the mean ground-truth latitude and longitude for each message.

    Parameters
    ----------
    df_mess_train : pandas.DataFrame
        Message list; only its ``messid`` column is used.
    pos_train : pandas.DataFrame
        Positions with ``lat`` and ``lng`` columns. It is joined to
        ``df_mess_train`` on the row index, so both frames are assumed to
        describe the same receptions row for row (TODO confirm against the
        data files).

    Returns
    -------
    (pandas.Series, pandas.Series)
        Latitude and longitude, each indexed by ``messid`` in sorted order.
    """
    # Keep only the columns that are needed: averaging other (possibly
    # non-numeric) columns is wasted work and can raise in recent pandas.
    positions = pd.concat([df_mess_train[['messid']], pos_train[['lat', 'lng']]], axis=1)
    per_message = positions.groupby('messid').mean()
    return per_message['lat'], per_message['lng']

In [None]:
# Build the training feature matrix: one row per message, columns pivoted per base station.
df_feat = feat_mat_const(df_mess_train, listOfBs)
df_feat.head()

In [None]:
#df_feat.reset_index(1).head()

In [None]:
# (number of messages, number of feature columns)
df_feat.shape

In [None]:
# Per-message regression targets: mean latitude / longitude grouped by messid.
ground_truth_lat, ground_truth_lng = ground_truth_const(df_mess_train, pos_train)
# Expected to match the number of rows of df_feat (one entry per message).
ground_truth_lat.shape

In [None]:
# Validation

In [None]:
def regressor_and_predict(df_feat, ground_truth_lat, ground_truth_lng, df_test):
    """Fit one linear regression per coordinate and predict both on ``df_test``.

    Parameters
    ----------
    df_feat : feature matrix used to train the regressors
    ground_truth_lat : latitude associated with each row of ``df_feat``
    ground_truth_lng : longitude associated with each row of ``df_feat``
    df_test : feature matrix to predict on (passed to ``predict`` as given)

    Returns
    -------
    (y_pred_lat, y_pred_lng)
    """
    train_features = np.array(df_feat)

    def fit_then_predict(target):
        # Latitude and longitude are modelled independently of each other.
        model = linear_model.LinearRegression()
        model.fit(train_features, target)
        return model.predict(df_test)

    return fit_then_predict(ground_truth_lat), fit_then_predict(ground_truth_lng)

## Fit the regressor and predict on the training set

In [None]:
# In-sample predictions: df_feat is used both to fit and to predict, so the resulting
# errors are optimistic. The cross-validation cell below reassigns y_pred_lat / y_pred_lng.
y_pred_lat, y_pred_lng = regressor_and_predict(df_feat, ground_truth_lat, ground_truth_lng, df_feat)

## Cross validation

In [None]:
# Out-of-fold predictions from 5-fold cross-validation, one run per coordinate.
reg = linear_model.LinearRegression()
y_pred_lat, y_pred_lng = (
    model_selection.cross_val_predict(reg, df_feat, target, cv=5)
    for target in (ground_truth_lat, ground_truth_lng)
)

## Evaluate result

In [None]:
def vincenty_vec(vec_coord):
    """Compute the distance in metres between two points for every row.

    Parameters
    ----------
    vec_coord : array-like of shape (n, 4)
        Each row holds two coordinate pairs: columns 0-1 are the first point,
        columns 2-3 the second. Eval_geoloc passes (lat, lng, lat, lng).

    Returns
    -------
    numpy.ndarray of shape (n,)
        Distance in metres for each row.

    Raises
    ------
    ValueError
        If ``vec_coord`` is not two-dimensional with exactly 4 columns.
        Returning zeros here would be read downstream as a perfect prediction.
    """
    # NOTE(review): geopy 2.0 removed `vincenty` in favour of `geodesic`; this
    # notebook needs geopy < 2 until the top-level import is switched.
    vec_coord = np.asarray(vec_coord)
    if vec_coord.ndim != 2 or vec_coord.shape[1] != 4:
        raise ValueError(
            'Bad number of columns (shall be = 4), got shape %s' % (vec_coord.shape,)
        )
    return np.array(
        [vincenty(row[:2], row[2:]).meters for row in vec_coord],
        dtype=float,
    )

In [None]:
# evaluate distance error for each predicted point
def Eval_geoloc(y_train_lat , y_train_lng, y_pred_lat, y_pred_lng):
    """Return the distance error between each true point and its prediction.

    The four inputs are stacked and transposed so that each row reads
    (true lat, true lng, predicted lat, predicted lng) before being handed
    to vincenty_vec.
    """
    stacked = np.array([y_train_lat, y_train_lng, y_pred_lat, y_pred_lng])
    return vincenty_vec(stacked.T)

In [None]:
# Keep only predictions that fall inside the valid latitude / longitude ranges.
# NOTE(review): rows rejected here are excluded from the error statistics computed
# from the *_clean arrays, so the reported error ignores the most extreme predictions.
lat_in_range = (-90 < y_pred_lat) & (y_pred_lat < 90)
lng_in_range = (-180 < y_pred_lng) & (y_pred_lng <= 180)
correct = lat_in_range & lng_in_range
y_pred_lat_clean, y_pred_lng_clean = y_pred_lat[correct], y_pred_lng[correct]
ground_truth_lat_clean, ground_truth_lng_clean = ground_truth_lat[correct], ground_truth_lng[correct]

In [None]:
# Per-point distance error (metres) between the ground truth and the retained predictions.
err_vec = Eval_geoloc(ground_truth_lat_clean, ground_truth_lng_clean, y_pred_lat_clean, y_pred_lng_clean)

## Plot error distribution

In [None]:
# Empirical cumulative distribution of the distance error, built from a fine histogram.
values, base = np.histogram(err_vec, bins=50000)
cumulative = np.cumsum(values)
plt.figure();
# cumulative[i] counts every error up to the RIGHT edge of bin i, hence base[1:].
# np.float was removed in NumPy 1.24; the builtin float does the same job.
plt.plot(base[1:] / 1000, cumulative / float(np.sum(values)) * 100.0, c='blue')
plt.xlabel('Distance Error (km)'); plt.ylabel('Cum proba (%)'); plt.axis([0, 30, 0, 100]);
# Only one curve is drawn, so only one legend label is supplied.
plt.title('Error Cumulative Probability'); plt.legend(["Opt LLR"])
plt.grid();

## Error criterion

In [None]:
# 80th percentile of the distance error, in the same unit as err_vec (metres).
np.percentile(err_vec, 80)

In [None]:
# Construct test prediction