In [1]:
import numpy as np
import pandas as pd
import dask as dd
import ray as rd

# Single Threaded Benchmarks
This code comes from the following blog post: https://engineering.upside.com/a-beginners-guide-to-optimizing-pandas-code-for-speed-c09ef2c6a4d6

We use this as a starting point for our implementations because the post provides benchmarked results of the two approaches we are comparing.

In [4]:
# Define a basic Haversine distance formula
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in miles between two points given in degrees.

    Implements the haversine formula on a sphere of radius 3959 miles.
    Only NumPy element-wise functions are used, so every argument may be a
    scalar or an array-like and the usual broadcasting rules apply.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float or array-like
        Distance in miles, with the broadcast shape of the inputs.
    """
    MILES = 3959
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically 0 <= a <= 1, but rounding can push it a few ulps past 1
    # for (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    total_miles = MILES * c
    return total_miles

In [6]:
# Baseline approach: visit every row by position and collect the distances
def haversine_looping(df):
    """Return a list holding one haversine distance per row of ``df``.

    Each row's 'latitude' and 'longitude' values are measured against the
    fixed point (40.671, -73.985). Rows are fetched one at a time with
    ``df.iloc``; this is the deliberately naive looping variant.
    """
    distances = []
    for row_pos in range(len(df)):
        # Two separate positional lookups per row, as in the naive baseline.
        row_lat = df.iloc[row_pos]['latitude']
        row_lon = df.iloc[row_pos]['longitude']
        distances.append(haversine(40.671, -73.985, row_lat, row_lon))
    return distances

In [8]:
## Load Data
# Read new_york_hotels.csv into the DataFrame `df` used by the experiments below.
# The path is relative, so it resolves against the kernel's working directory;
# the file is decoded as Windows-1252 (cp1252).
df = pd.read_csv('../blog_code/new_york_hotels.csv', encoding='cp1252')

## Experiment 1: Looping

In [9]:
%%timeit

# Run the haversine looping function
# The returned list is stored as the 'distance' column, so `df` itself is
# modified (the column is created or overwritten on every timed run).
df['distance'] = haversine_looping(df)

650 ms ± 28.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Experiment 2: Iterrows

In [10]:
%%timeit
# Haversine applied on rows via iteration
# Collect one distance per row, measured from the fixed point (40.671, -73.985).
haversine_series = []
# iterrows() yields (index, row) pairs; only `row` is used here.
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985,\
                                      row['latitude'], row['longitude']))
# Store the collected list as the 'distance' column of df (overwrites any existing one).
df['distance'] = haversine_series

173 ms ± 7.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Experiment 3: Apply 

In [11]:
%%timeit

# Timing apply on the Haversine function
# axis=1 makes apply call the lambda once per row; each call measures that row's
# 'latitude'/'longitude' against the fixed point (40.671, -73.985).
# The result is stored as the 'distance' column of df.
df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

71.8 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
