In [None]:
# import necessary libraries
import time
import pandas as pd
import numpy as np
from math import *
import cProfile

## Overview of the Data

In [None]:
# load the clinic listings from the Excel workbook and preview the first rows
CLINICS_PATH = '/content/clinics.xls'
df = pd.read_excel(CLINICS_PATH)
df.head()

Unnamed: 0,bizID,bizCat,bizCatSub,bizName,bizAddr,bizCity,bizState,bizZip,bizPhone,bizFax,...,bizURL,locAreaCode,locFIPS,locTimeZone,locDST,locLat,locLong,locMSA,locPMSA,locCounty
0,1,Clinics,Clinics,Hino Ronald H MD,98-151 Pali Momi Street Suite 142,Aiea,HI,96701,(808)487-2477,,...,,808,15003,PST-2,N,21.398,-157.8981,3320.0,,Honolulu
1,2,Clinics,Clinics,Farmer Joesph F Md,1225 Breckenridge Drive,Little Rock,AR,72205,(501)225-2594,,...,,501,5119,CST,Y,34.7495,-92.3533,4400.0,,Pulaski
2,3,Clinics,Clinics & Medical Centers,Najjar Fadi Md,1155 West Linda Avenue Suite B,Hermiston,OR,97838,(541)289-1122,,...,,541,41059,PST,Y,45.8456,-119.2817,,,Umatilla
3,4,Clinics,Clinics & Medical Centers,Kittson Memorial Upper Level Nursing Home,1010 South Birch Avenue,Hallock,MN,56728,(218)843-2525,,...,,218,27069,CST,Y,48.7954,-97.009,,,Kittson
4,5,Clinics,Clinics & Medical Centers,Thompson Robert B Md,100 North Eagle Creek Drive,Lexington,KY,40509,(859)258-4000,,...,www.lexingtonclinic.com,859,21067,EST,Y,37.9935,-84.3712,4280.0,,Fayette


In [None]:
# dimensions of the raw frame as (rows, columns)
df.shape

(30, 21)

In [None]:
# per-column dtypes and non-null counts, to see which fields are usable
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   bizID        30 non-null     int64  
 1   bizCat       30 non-null     object 
 2   bizCatSub    30 non-null     object 
 3   bizName      30 non-null     object 
 4   bizAddr      30 non-null     object 
 5   bizCity      30 non-null     object 
 6   bizState     30 non-null     object 
 7   bizZip       30 non-null     int64  
 8   bizPhone     30 non-null     object 
 9   bizFax       0 non-null      float64
 10  bizEmail     0 non-null      float64
 11  bizURL       5 non-null      object 
 12  locAreaCode  30 non-null     int64  
 13  locFIPS      30 non-null     int64  
 14  locTimeZone  30 non-null     object 
 15  locDST       30 non-null     object 
 16  locLat       30 non-null     float64
 17  locLong      30 non-null     float64
 18  locMSA       13 non-null     float64
 19  locPMSA   

* There are over 20 variables, but we only need the latitude and longitude.
* The location column (state) may also be helpful.

In [None]:
# keep only the columns needed for the distance computation, under shorter names
column_names = {'bizState': 'state', 'locLat': 'latitude', 'locLong': 'longitude'}
df = df[list(column_names)].rename(columns=column_names)
df.sample(8, random_state=1)

Unnamed: 0,state,latitude,longitude
17,GA,34.5591,-83.4634
21,IA,43.3826,-91.3585
10,CA,38.5503,-121.4572
19,CA,34.0649,-118.3829
14,MD,39.4025,-76.6344
20,MS,32.3121,-89.2019
26,AL,33.5089,-86.8014
3,MN,48.7954,-97.009


* The data is successfully subsetted to the necessary variables.

## Haversine Function

In [None]:
# define the distance computation function
def haversine(lat1, lon1, lat2, lon2, radius=3959):
    """Great-circle distance between two points using the haversine formula.

    All coordinates are given in decimal degrees. Each argument may be a
    scalar, a NumPy array, or a pandas Series; the usual NumPy broadcasting
    rules apply, so a single reference point can be compared against whole
    columns of coordinates in one call.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : latitude and longitude of the second point(s), in degrees.
    radius : sphere radius; the result is expressed in the same unit.
        The default of 3959 gives distances in miles.

    Returns
    -------
    The distance(s) along the sphere surface, with the broadcast shape of
    the inputs.
    """
    lat1 = np.deg2rad(lat1)
    lon1 = np.deg2rad(lon1)
    lat2 = np.deg2rad(lat2)
    lon2 = np.deg2rad(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; clamp it to the valid domain.
    a = np.minimum(a, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

* The haversine function is defined; next it is applied using several computation methods.

## Applying the Haversine Function

In [None]:
# basic for-loop
%%timeit
for i in range(len(df)):
    haversine(40.671, -73.985, df.iloc[i]['latitude'], df.iloc[i]['longitude'])

5.51 ms ± 730 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# vectorize code by using iterrows
%%timeit
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['latitude'], row['longitude']))
df['distance'] = haversine_series

5.15 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# optimize further using apply()
%%timeit
df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

# profile execution with cProfile
"""
cProfile.run("df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)")
"""

1.19 ms ± 221 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# vectorized implementation of haversine using pandas
%%timeit
df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

# profile execution time
"""
cProfile.run("df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])")
"""

1.37 ms ± 257 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# vectorized implementation of Haversine using NumPy
%%timeit
df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)

# profile execution time
"""
cProfile.run("df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)")
"""

226 µs ± 92.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


* The results for the 5 methods have been computed and now need to be compared.

## Comparing the Results

In [None]:
# Raw %%timeit summaries for each method, copied by hand from the timing cells.
# Re-running those cells produces new numbers, so these strings must be updated to match.
execute_times = {
    'for loop': '5.51 ms ± 730 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)',
    'iterrows': '5.15 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)',
    'apply': '1.19 ms ± 221 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)',
    'vec_implement_pandas': '1.37 ms ± 257 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)',
    'vec_implement_numpy': '226 µs ± 92.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)'
}

# one "method: timing" line per entry
for k, v in execute_times.items():
    print(f"{k}: {v}")

for loop: 3.03 ms ± 79.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
iterrows: 4.78 ms ± 553 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
apply: 2.03 ms ± 237 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
vec_implement_pandas: 2.82 ms ± 1.33 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
vec_implement_numpy: 163 µs ± 3.17 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


* A DataFrame is a better choice for displaying these results.

In [None]:
# mean execution time per method in milliseconds, laid out column-wise
execute_times = dict([
    ('Method', ['For Loop', 'iterrows', 'apply', 'Vectorized Pandas', 'Vectorized NumPy']),
    ('Execution Time (ms)', [5.51, 5.15, 1.19, 1.37, 0.226]),
])

# tabulate the timings so they are easier to compare side by side
df_exec_times = pd.DataFrame.from_dict(execute_times)
df_exec_times

Unnamed: 0,Method,Execution Time (ms)
0,For Loop,5.51
1,iterrows,5.15
2,apply,1.19
3,Vectorized Pandas,1.37
4,Vectorized NumPy,0.226


* The computed times are tabulated.

## Conclusion

* For-Loop: Commonly used for ordinary tasks on small data, but not recommended for large datasets.
* Iterrows: Slightly faster than the for-loop, but also not recommended, because row-by-row application causes high overhead.
* Apply: Faster than iterrows and for loop with big improvement.
* Vectorized Pandas: Slightly slower than the apply function on this 30-row dataset, where fixed per-call overhead dominates, but still shows great improvement compared to iterrows and for-loop.
* Vectorized NumPy: The fastest method, with the most efficient computation.