##### Week3 Assignment
##### Author:Wenshan, Liu
##### Date: 2025-02-09



In [47]:
import numpy as np
import time
from scipy.spatial.distance import cdist
import pandas as pd
import cProfile


In [48]:
# load data into dataframe
df_cleaned = pd.read_csv("clinics.csv", sep="|")
print(df_cleaned.head())

   bizID   bizCat                  bizCatSub  \
0      1  Clinics                    Clinics   
1      2  Clinics                    Clinics   
2      3  Clinics  Clinics & Medical Centers   
3      4  Clinics  Clinics & Medical Centers   
4      5  Clinics  Clinics & Medical Centers   

                                     bizName  \
0                           Hino Ronald H MD   
1                         Farmer Joesph F Md   
2                             Najjar Fadi Md   
3  Kittson Memorial Upper Level Nursing Home   
4                       Thompson Robert B Md   

                             bizAddr      bizCity bizState  bizZip  \
0  98-151 Pali Momi Street Suite 142         Aiea       HI   96701   
1            1225 Breckenridge Drive  Little Rock       AR   72205   
2     1155 West Linda Avenue Suite B    Hermiston       OR   97838   
3            1010 South Birch Avenue      Hallock       MN   56728   
4        100 North Eagle Creek Drive    Lexington       KY   40509   

 

In [49]:
# get latitudes and longitudes and drop na values
latitudes = df_cleaned["locLat"].dropna().values
longitudes = df_cleaned["locLong"].dropna().values
coordinates = np.column_stack((latitudes, longitudes))



In [50]:
# to create a distance function

def haversine(lat1, lon1, lat2, lon2):
    MILES = 3959  # to set the radius of the earth in miles
    
    # convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])

    # calculate the difference in latitude and longitude
    dlat = lat2 - lat1 
    dlon = lon2 - lon1  

    # apply the haversine formula
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    # calculate the distance in miles
    total_miles = MILES * c
    return total_miles

In [51]:

# Function to compute pairwise Euclidean distance using a for-loop
def haversine_looping_all(df):
    n = len(df)
    distance_matrix = np.zeros((n, n))  
    
    for i in range(n): # to iterate through the rows of the dataframe
        for j in range(i + 1, n):  
            d = haversine(coordinates[i][0], coordinates[i][1],
                         coordinates[j][0], coordinates[j][1])
            distance_matrix[i][j] = d
            distance_matrix[j][i] = d  # since the distance matrix is symmetric, copy to (j, i)

    
    return distance_matrix


# 1. Measure execution time for for-loop approach
start_time = time.time()
distances_loop = haversine_looping_all(coordinates)
time_loop = time.time() - start_time

print(time_loop)



"""
# Measure execution time for Scipy's cdist function
start_time = time.time()
distances_scipy = cdist(coordinates, coordinates, metric='euclidean')
time_scipy = time.time() - start_time

print(time_scipy)
"""






0.0030770301818847656


"\n# Measure execution time for Scipy's cdist function\nstart_time = time.time()\ndistances_scipy = cdist(coordinates, coordinates, metric='euclidean')\ntime_scipy = time.time() - start_time\n\nprint(time_scipy)\n"

In [52]:
# 2. Measure execution time for NumPy vectorized approach
# Measure execution time for NumPy vectorized approach
def haversine_numpy(coordinates):
    start_time = time.time()
    # to convert the coordinates to column vectors
    lat1 = np.column_stack((coordinates[:, 0]))[:, np.newaxis]
    lon1 = np.column_stack((coordinates[:, 1]))[:, np.newaxis]
    lat2 = np.column_stack((coordinates[:, 0]))[np.newaxis, :]
    lon2 = np.column_stack((coordinates[:, 1]))[np.newaxis, :]

    distances_numpy = haversine(lat1, lon1, lat2, lon2)


    time_numpy = time.time() - start_time
    return  distances_numpy, time_numpy

distances_numpy, time_numpy = haversine_numpy(coordinates)
print(time_numpy)

0.00018835067749023438


In [53]:
#3. Measure execution time for Iterrows
def haversine_iterrows(df_cleaned):
    start_time = time.time()
    distances_iterrows = []
    
    # to iterate through the rows of the dataframe
    for index, row in df_cleaned.iterrows():
        distances_iterrows.append(
            haversine(
                row['locLat'], 
                row['locLong'], 
                df_cleaned.loc[index, 'locLat'], 
                df_cleaned.loc[index, 'locLong']
            )
        )
    
    time_iterrows = time.time() - start_time
    return distances_iterrows, time_iterrows

# 調用函數
distances_iterrows, time_iterrows = haversine_iterrows(df_cleaned)
print(time_iterrows)


0.0011050701141357422


In [54]:
#4.# Measure execution time for Apply


def haversine_apply(df_cleaned):
    start_time = time.time()
    
    # to use the apply method to calculate the distance
    distances_apply = df_cleaned.apply(
        lambda row: haversine(
            row['locLat'], 
            row['locLong'], 
            df_cleaned.loc[row.name, 'locLat'], 
            df_cleaned.loc[row.name, 'locLong']
        ), 
        axis=1
    )
    
    time_apply = time.time() - start_time
    return distances_apply, time_apply

# to call the function
distances_apply, time_apply = haversine_apply(df_cleaned)
print(time_apply)



0.0007009506225585938


In [55]:
#5. Measure execution time for Vectorized functions in Pandas


def haversine_pandas_vectorized(df_cleaned):
    start_time = time.time()
    
    # to convert the locLat and locLong to column vectors
    lat1 = df_cleaned['locLat'].values[:, None]  
    lon1 = df_cleaned['locLong'].values[:, None] 
    lat2 = df_cleaned['locLat'].values[None, :] 
    lon2 = df_cleaned['locLong'].values[None, :]  
    
    # to calculate the distance matrix
    distances_vectorized = haversine(lat1, lon1, lat2, lon2)
    
    time_vectorized = time.time() - start_time
    return distances_vectorized, time_vectorized

# to call the function
distances_vectorized, time_vectorized = haversine_pandas_vectorized(df_cleaned)
print(time_vectorized)


0.0003142356872558594


In [56]:

# cProfile to analyze the performance of the for-loop approach
print("\n=== For-loop Approach Performance Analysis ===")
cProfile.run("haversine_looping_all(coordinates)")
print(f"For-loop execution time: {time_loop:.6f} seconds")





=== For-loop Approach Performance Analysis ===
         441 function calls in 0.002 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.002    0.002 2830128831.py:2(haversine_looping_all)
      435    0.002    0.000    0.002    0.000 556031297.py:3(haversine)
        1    0.000    0.000    0.002    0.002 <string>:1(<module>)
        1    0.000    0.000    0.002    0.002 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 {built-in method builtins.len}
        1    0.000    0.000    0.000    0.000 {built-in method numpy.zeros}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


For-loop execution time: 0.003077 seconds


In [57]:
# 2. NumPy vectorized approach analysis
print("\n=== NumPy Vectorized Approach Performance Analysis ===")
cProfile.run("haversine_numpy(coordinates)")
print(f"NumPy vectorized execution time: {time_numpy:.6f} seconds")




=== NumPy Vectorized Approach Performance Analysis ===
         387 function calls in 0.001 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.001    0.001 4027235859.py:3(haversine_numpy)
        1    0.001    0.001    0.001    0.001 556031297.py:3(haversine)
        1    0.000    0.000    0.001    0.001 <string>:1(<module>)
        4    0.000    0.000    0.000    0.000 _shape_base_impl.py:629(_column_stack_dispatcher)
        4    0.000    0.000    0.000    0.000 _shape_base_impl.py:633(column_stack)
        4    0.000    0.000    0.000    0.000 multiarray.py:161(concatenate)
        4    0.000    0.000    0.000    0.000 shape_base.py:209(_arrays_for_stack_dispatcher)
        1    0.000    0.000    0.001    0.001 {built-in method builtins.exec}
        4    0.000    0.000    0.000    0.000 {built-in method builtins.hasattr}
      120    0.000    0.000    0.000    0.000 {built-in method numpy

In [58]:
# 3. Iterrows approach analysis
print("\n=== Iterrows Approach Performance Analysis ===")
cProfile.run("haversine_iterrows(df_cleaned)")
print(f"Iterrows execution time: {time_iterrows:.6f} seconds")




=== Iterrows Approach Performance Analysis ===
         7457 function calls (7333 primitive calls) in 0.002 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.002    0.002 1192267385.py:2(haversine_iterrows)
       30    0.000    0.000    0.000    0.000 556031297.py:3(haversine)
        1    0.000    0.000    0.002    0.002 <string>:1(<module>)
       91    0.000    0.000    0.000    0.000 __init__.py:34(using_copy_on_write)
       60    0.000    0.000    0.000    0.000 __init__.py:42(warn_copy_on_write)
       60    0.000    0.000    0.000    0.000 base.py:3777(get_loc)
        1    0.000    0.000    0.000    0.000 base.py:378(interleaved_dtype)
        1    0.000    0.000    0.000    0.000 base.py:397(ensure_np_dtype)
      120    0.000    0.000    0.000    0.000 base.py:6312(_index_as_unique)
       60    0.000    0.000    0.000    0.000 base.py:6672(_maybe_cast_indexer)
       60    0.000 

In [59]:

# 4. Apply approach analysis
print("\n=== Apply Approach Performance Analysis ===")
cProfile.run("haversine_apply(df_cleaned)")
print(f"Apply execution time: {time_apply:.6f} seconds")



=== Apply Approach Performance Analysis ===
         4843 function calls (4825 primitive calls) in 0.005 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.005    0.005 3676287989.py:4(haversine_apply)
       30    0.000    0.000    0.003    0.000 3676287989.py:9(<lambda>)
       30    0.000    0.000    0.000    0.000 556031297.py:3(haversine)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1053(_handle_fromlist)
        1    0.000    0.000    0.005    0.005 <string>:1(<module>)
       60    0.000    0.000    0.000    0.000 __init__.py:34(using_copy_on_write)
       60    0.000    0.000    0.000    0.000 __init__.py:42(warn_copy_on_write)
        4    0.000    0.000    0.000    0.000 abc.py:117(__instancecheck__)
        1    0.000    0.000    0.005    0.005 apply.py:1061(apply_standard)
        1    0.000    0.000    0.004    0.004 apply.py:1070(apply_series_genera

In [60]:

# 5. Pandas vectorized approach analysis
print("\n=== Pandas Vectorized Approach Performance Analysis ===")
cProfile.run("haversine_pandas_vectorized(df_cleaned)")
print(f"Pandas vectorized execution time: {time_vectorized:.6f} seconds")


=== Pandas Vectorized Approach Performance Analysis ===
         107 function calls in 0.000 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 3918841675.py:4(haversine_pandas_vectorized)
        1    0.000    0.000    0.000    0.000 556031297.py:3(haversine)
        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
        8    0.000    0.000    0.000    0.000 __init__.py:34(using_copy_on_write)
        4    0.000    0.000    0.000    0.000 __init__.py:42(warn_copy_on_write)
        4    0.000    0.000    0.000    0.000 base.py:5323(__contains__)
        4    0.000    0.000    0.000    0.000 blocks.py:249(external_values)
        4    0.000    0.000    0.000    0.000 blocks.py:2827(external_values)
        4    0.000    0.000    0.000    0.000 common.py:372(apply_if_callable)
        4    0.000    0.000    0.000    0.000 frame.py:4062(__getitem__)
        4    0.000    

In [66]:
# Tabulate results
execution_times = pd.DataFrame({
    "Method": ["For-loop", "NumPy Vectorized", "Iterrows", "Apply", "Pandas Vectorized"],
    "Execution Time (seconds)": [time_loop, time_numpy, time_iterrows, time_apply, time_vectorized]
})

# Display results
#print(execution_times)

print("----------------------------------------------------------------------------------------")


# sort the execution time
execution_times = execution_times.sort_values("Execution Time (seconds)")

# Display results  after sorting
print(execution_times)

----------------------------------------------------------------------------------------
              Method  Execution Time (seconds)
1   NumPy Vectorized                  0.000188
4  Pandas Vectorized                  0.000314
3              Apply                  0.000701
2           Iterrows                  0.001105
0           For-loop                  0.003077
