In [24]:
import datetime, warnings, pickle, gc, os, math, pprint, hashlib, functools, random, timeit

import numpy           as np
import pandas          as pd
import multiprocessing as mp

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn           as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import normalize, StandardScaler
from sklearn.metrics         import classification_report, roc_curve, auc,accuracy_score, confusion_matrix, f1_score, fbeta_score
from sklearn.calibration     import CalibratedClassifierCV, calibration_curve
from boruta                  import BorutaPy
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble     import RandomForestClassifier
import lightgbm as lgb    

from IPython.display import display

random.seed(4321)

In [25]:
mp.cpu_count()

16

In [38]:
from math          import sin, cos, sqrt, atan2, radians
from scipy.spatial import distance

In [33]:
def lat_lon_converter(lat1, lon1, lat2, lon2, unit):
    """
    ref: https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude
    """
    try:
        R = 6373.0
        dlon = radians(lon2) - radians(lon1)
        dlat = radians(lat2) - radians(lat1)
        a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        distance = R * c

        if unit == 'm':
            return distance
        elif unit == 'km':
            return distance
    except ValueError:
        return np.nan

In [35]:
path = "~/Documents/data/nyc-taxi-trip-duration/"

In [36]:
df_train = pd.read_csv(rf'{path}/train.csv')

In [105]:
train = df_train.sample(1000000)

serial

In [106]:
%%time
train['dist_manhattan_meter1'] = train.apply( lambda x: lat_lon_converter(x['pickup_latitude'], 
                                                                         x['pickup_longitude'],
                                                                         x['dropoff_latitude'], 
                                                                         x['dropoff_longitude'],
                                                                         'm'), axis=1 )



CPU times: user 10.6 s, sys: 71.5 ms, total: 10.7 s
Wall time: 10.7 s


paralelo

In [107]:
def lat_lon_converter2(_df):
    """
    ref: https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude
    """
    lat1 = _df['pickup_latitude']
    lon1 = _df['pickup_longitude']
    lat2 = _df['dropoff_latitude']
    lon2 = _df['dropoff_longitude']
    unit = "m"
    try:
        R = 6373.0
        dlon = radians(lon2) - radians(lon1)
        dlat = radians(lat2) - radians(lat1)
        a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        distance = R * c

        if unit == 'm':
            return distance
        elif unit == 'km':
            return distance
    except ValueError:
        return np.nan

In [108]:
def parallelize(_df, _func, _num_of_processes=2):
    data_split = np.array_split(_df, _num_of_processes)
    pool       = mp.Pool(_num_of_processes)
    data       = pd.concat(pool.map(_func, data_split))
    pool.close()
    pool.join()
    return data

In [109]:
def run_on_subset(_func, data_subset):
    return data_subset.apply(_func, axis=1)

In [110]:
def parallelize_on_rows(_df, _func, _num_of_processes=2):
    return parallelize(_df, functools.partial(run_on_subset, _func), _num_of_processes)

In [111]:
%%time
train['dist_manhattan_meter2'] = parallelize_on_rows(train[['pickup_latitude', 'pickup_longitude','dropoff_latitude', 'dropoff_longitude']], lat_lon_converter2) 

CPU times: user 93.3 ms, sys: 96.3 ms, total: 190 ms
Wall time: 5.57 s


usando `timeit`

In [112]:
%timeit train.apply( lambda x: lat_lon_converter(x['pickup_latitude'], x['pickup_longitude'],x['dropoff_latitude'], x['dropoff_longitude'],'m'), axis=1 )

10.6 s ± 23.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [113]:
%timeit parallelize_on_rows(train[['pickup_latitude', 'pickup_longitude','dropoff_latitude', 'dropoff_longitude']], lat_lon_converter2, 2) 

5.68 s ± 111 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [114]:
%timeit parallelize_on_rows(train[['pickup_latitude', 'pickup_longitude','dropoff_latitude', 'dropoff_longitude']], lat_lon_converter2, 4) 

2.95 s ± 20.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [115]:
%timeit parallelize_on_rows(train[['pickup_latitude', 'pickup_longitude','dropoff_latitude', 'dropoff_longitude']], lat_lon_converter2, 6) 

2.18 s ± 118 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [116]:
%timeit parallelize_on_rows(train[['pickup_latitude', 'pickup_longitude','dropoff_latitude', 'dropoff_longitude']], lat_lon_converter2, 8) 

1.81 s ± 81.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [117]:
%timeit parallelize_on_rows(train[['pickup_latitude', 'pickup_longitude','dropoff_latitude', 'dropoff_longitude']], lat_lon_converter2, 10) 

1.88 s ± 47.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [118]:
%timeit parallelize_on_rows(train[['pickup_latitude', 'pickup_longitude','dropoff_latitude', 'dropoff_longitude']], lat_lon_converter2, 12) 

1.76 s ± 38.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


compara resultados

In [119]:
train[['dist_manhattan_meter1', 'dist_manhattan_meter1']]

Unnamed: 0,dist_manhattan_meter1,dist_manhattan_meter1.1
519220,1.862959,1.862959
938653,2.282844,2.282844
1395531,11.739124,11.739124
1108433,2.434507,2.434507
1130566,1.759047,1.759047
...,...,...
1267416,0.719226,0.719226
852552,0.196220,0.196220
204282,7.494901,7.494901
19618,1.276926,1.276926


.