# Geolocalisation Challenge

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns

import os
import sys

In [3]:
# Run the notebook from the project root so that the relative `data/...` paths
# and the local `utils` package (imported further down) resolve.
# The guard makes the cell idempotent: re-running it does not climb one more
# directory each time, and the root is added to `sys.path` only once.
if not os.path.isdir('utils'):
    os.chdir('..')

project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

print(project_root)

/home/infres/vcharvet/workspace/pythonProjects/geoloc-challenge


In [4]:
train_features_path = 'data/train_data_featurized_v0.csv'
train_path = 'data/Train/Train_dataset-002.csv'

# Featurized messages (one row per message, ';'-separated).
df_features = pd.read_csv(train_features_path, sep=';')

# Parse the raw training file a single time and only load the columns that
# are needed: the label columns and the base-station identifier.
train_raw = pd.read_csv(
    train_path,
    sep=',',
    usecols=['messageid', 'latitude', 'longitude', 'bsid'],
)

train_labels = train_raw[['messageid', 'latitude', 'longitude']]
bsids = train_raw['bsid'].unique()

FileNotFoundError: File b'data/train_data_featurized_v0.csv' does not exist

In [4]:
# Preview the first five rows of the featurized messages.
df_features.head(n=5)

Unnamed: 0,messageid,dtid,time_ux_client,motion,speed,data_type,radius,seqnumber
0,5a74b2a8e541cd137cef7232,5491a93b9336908c3b1b4c70,1517597000000.0,t,0.0,gps,,3197
1,5b216b5ae541cd3845b8c2c5,5491a93b9336908c3b1b4c70,1528917000000.0,t,14.0,gps,,835
2,5addfec9e541cd123f0bd3f5,5491a93b9336908c3b1b4c70,1524498000000.0,f,0.0,gps,,103
3,5a74bdcf12f14352d21e92a9,5491a93b9336908c3b1b4c70,1517600000000.0,f,2.0,gps,,570
4,5b00b43acf554f398fe235dd,5491a93b9336908c3b1b4c70,1526773000000.0,f,1.0,gps,,3079


In [5]:
# Collapse the labels to one row per message: for every `messageid`, keep the
# first value found in each label column. `messageid` becomes the index.
train_labels = train_labels.groupby('messageid').first()

train_labels.head()

Unnamed: 0_level_0,latitude,longitude
messageid,Unnamed: 1_level_1,Unnamed: 2_level_1
5a497a12cf554f30e846964d,48.788166,2.504432
5a497a24cf554f30e846c069,48.909265,2.310304
5a497a2fcf554f30e846f332,48.752471,2.034534
5a497a4ce541cd714d45b1c5,49.009068,2.351928
5a497a4dcf554f30e8472041,48.804501,2.427322


In [6]:
# Sanity check: features and labels are expected to have the same number of rows.
(df_features.shape, train_labels.shape)

((2298059, 8), (2298059, 2))

In [None]:
# `df_features` carries the message identifier as a regular column named
# 'messageid', while `train_labels` is indexed by it (result of the groupby),
# so the join is column-on-index.
Xy = df_features.merge(train_labels, how='inner',
                       left_on='messageid', right_index=True)

# An inner join silently drops unmatched rows: fail loudly if a message has no label.
assert len(Xy) == len(df_features), 'some messages have no matching label'

Xy.head()

## Feature engineering

This step consists of turning the features into numeric vectors, i.e. handling NaN values, transforming columns, etc.

The features are described further in the `feature_exploration.ipynb` notebook, along with their distributions and types.

### By column

In [None]:
# client specific features

# dtid -> one-hot encoding, 0% missing values
# Prefixed so the indicator columns cannot collide with the data_type ones.
dtid_ohe = pd.get_dummies(Xy['dtid'], prefix='dtid')

# motion: bool, 't' -> 1, anything else -> 0. The 21% NaN compare unequal to
#         't', so they are imputed with the majority value (0) by the same step.
# speed: numeric, 21% NaN, imputed with 0 (corresponds to missing `motion`,
#        which is set to 0 as well)
Xy['motion'] = (Xy['motion'] == 't').astype(int)
Xy['speed'] = Xy['speed'].fillna(0)

# data_type -> one-hot encoding, 0% missing values
# (the column is named `data_type`, with an underscore, in the featurized file)
dtype_ohe = pd.get_dummies(Xy['data_type'])

# Replace the two raw categorical columns with their indicator columns.
Xy = pd.concat(
    [Xy.drop(columns=['dtid', 'data_type']), dtid_ohe, dtype_ohe],
    axis=1,
)

# radius: numeric, 82% missing values, imputed with the median
radius_median = Xy['radius'].median()
Xy['radius'] = Xy['radius'].fillna(radius_median)

# seqnumber: numeric, 0% NaN, nothing to do


In [None]:
# Replace each per-base-station timestamp column by a `delay` feature:
# (time_ux_bsid - time_ux_client), both floor-divided by 1000 beforehand
# (presumably milliseconds -> seconds; TODO confirm the unit).
# NOTE(review): the column names are built as 'time_ux' + bsid with no
# separator, whereas the client column is 'time_ux_client' -- confirm this
# matches the featurized file.
time_columns = ['time_ux{}'.format(bsid) for bsid in bsids]

# The client timestamp is the same for every base-station column: compute it once.
client_seconds = Xy['time_ux_client'] // 1000

for time_column in time_columns:
    Xy[time_column] = Xy[time_column] // 1000 - client_seconds

# The raw client timestamp is no longer needed (open question: keep an hour-of-day feature?).
Xy.drop(columns=['time_ux_client'], inplace=True)

There are no missing values in the bs-specific features, so our feature matrix is ready.

## Training a model

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from utils.evaluation import vincenty_df, vincenty_vec, criterion, plot_error


In [None]:
label_columns = ['latitude', 'longitude']

# `messageid` identifies a row, it is not a feature: move it to the index so
# that every split (and later every prediction) stays traceable to its message.
labelled = Xy.set_index('messageid')

# Select the targets by name. The labels are not guaranteed to be the last two
# columns (indicator columns get appended after them), so a positional slice
# would leak the targets into X.
X, y = labelled.drop(columns=label_columns), labelled[label_columns]

x_train, x_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.1, random_state=0)

In [None]:
def predict_coordinates(model, features):
    """Predict with `model` on `features` and return the result as a DataFrame.

    The models are fitted on several target columns at once, so `predict`
    returns one column per target. These are stored under a top-level
    'prediction' column group, i.e. `frame['prediction']` yields one column
    per target, named like the columns of `y_train`. The 'messageid' column
    holds the index of `features`.
    """
    columns = pd.MultiIndex.from_product([['prediction'], y_train.columns])
    frame = pd.DataFrame(model.predict(features), columns=columns)
    frame['messageid'] = features.index
    return frame


# The targets are continuous coordinates, i.e. a regression problem: a
# classifier such as LogisticRegression cannot be fitted on them, so the
# linear baseline is an ordinary least-squares regression.
lr = LinearRegression()
lr.fit(x_train, y_train)

# Seeded so that a re-run reproduces the same forest.
rf = RandomForestRegressor(n_estimators=100, n_jobs=16, random_state=0).fit(x_train, y_train)

pred_lr = predict_coordinates(lr, x_test)
pred_rf = predict_coordinates(rf, x_test)

In [None]:
# Report the evaluation criterion of each fitted model. The label is derived
# from the estimator class so it cannot drift from the model actually used.
# NOTE(review): `criterion` comes from utils.evaluation and its signature is
# not visible here; the arguments are kept as written. Confirm that the second
# argument should be `x_test` rather than the true coordinates `y_test`.
for model, predictions in ((lr, pred_lr), (rf, pred_rf)):
    score = criterion(predictions['prediction'], x_test)
    print('criterion for {}: {:.3f}'.format(type(model).__name__, score))
      

In [None]:
# Error plot for the predictions of `lr` (`plot_error` is imported from utils.evaluation).
# NOTE(review): the meaning of the positional `True` flag is not visible here -- confirm.
plot_error(x_test, pred_lr['prediction'], True)

In [None]:
# Error plot for the predictions of `rf` (`plot_error` is imported from utils.evaluation).
# NOTE(review): the meaning of the positional `True` flag is not visible here -- confirm.
plot_error(x_test, pred_rf['prediction'], True)