In [25]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# Stations_IDF.csv is ';'-separated and read without a header row, so the
# column labels are supplied explicitly.
# NOTE(review): assumes the file's columns are, in order, longitude, latitude,
# station name - confirm against the CSV.
column_names = ['longitude', 'latitude', 'station_name']
idf_stations = pd.read_csv(
    Path("data") / "Stations_IDF.csv",
    sep=';',
    header=None,
    names=column_names,
)
bike_data = pd.read_parquet(Path("data") / "train.parquet")

In [43]:
# Count the distinct station names: .shape of the unique-values array is a 1-tuple.
idf_stations['station_name'].unique().shape

(619,)

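This dataset contains the locations of 619 stations in the Île-de-France region. It is used below to build three features for each bike counter:

- GetDistanceToClosestStation: the distance from the counter to its closest station
- GetDistanceToSecondClosestStation: the distance from the counter to its second-closest station
- GetNumberOfStationsIn.5KMSQ: the number of stations within an area of about 0.5 km² around the counter (a circle of radius ≈ 0.4 km)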

In [44]:
import haversine as hs

def GetDistanceToKClosest(X, idf_stations, k=1):
    """Distance from each counter to its k-th closest station.

    For every distinct ``counter_id`` in ``X`` the great-circle (haversine)
    distance to every row of ``idf_stations`` is computed, and the k-th
    smallest of those distances is reported.

    Parameters
    ----------
    X : pandas.DataFrame
        Must have ``counter_id``, ``latitude`` and ``longitude`` columns
        (coordinates in decimal degrees). When a counter appears on several
        rows, the coordinates of its first row are used.
    idf_stations : pandas.DataFrame
        Must have ``latitude`` and ``longitude`` columns (decimal degrees),
        one row per station.
    k : int, default 1
        Rank of the station to report (1 = closest, 2 = second closest, ...).

    Returns
    -------
    pandas.DataFrame
        One row per counter, in order of first appearance in ``X``, indexed
        0..n-1, with columns ``counter_id`` and ``closest_metro_distance``
        (kilometres). The distance column keeps that name for every ``k``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    IndexError
        If ``k`` exceeds the number of stations while ``X`` holds at least
        one counter.
    """
    if k < 1:
        # Without this guard k=0 (or a negative k) would index from the far
        # end of the sorted distances and silently return a far-away station.
        raise ValueError(f"k must be a positive integer, got {k!r}")

    result_columns = ['counter_id', 'closest_metro_distance']

    # One row per counter (the first occurrence), found in a single pass
    # instead of re-filtering X for every counter.
    counters = X.drop_duplicates(subset='counter_id')
    if counters.empty:
        return pd.DataFrame(columns=result_columns)

    n_stations = len(idf_stations)
    if k > n_stations:
        raise IndexError(
            f"k={k} exceeds the number of available stations ({n_stations})"
        )

    # Mean Earth radius in km; same constant the `haversine` package uses,
    # so results agree with hs.haversine up to floating-point rounding.
    earth_radius_km = 6371.0088

    # Column vectors for counters, row vectors for stations: broadcasting
    # yields an (n_counters, n_stations) distance matrix.
    counter_lat = np.radians(counters['latitude'].to_numpy(dtype=float))[:, np.newaxis]
    counter_lon = np.radians(counters['longitude'].to_numpy(dtype=float))[:, np.newaxis]
    station_lat = np.radians(idf_stations['latitude'].to_numpy(dtype=float))[np.newaxis, :]
    station_lon = np.radians(idf_stations['longitude'].to_numpy(dtype=float))[np.newaxis, :]

    hav = (
        np.sin((station_lat - counter_lat) / 2.0) ** 2
        + np.cos(counter_lat) * np.cos(station_lat)
        * np.sin((station_lon - counter_lon) / 2.0) ** 2
    )
    # Clip guards against rounding pushing the value marginally outside [0, 1].
    distances = 2.0 * earth_radius_km * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))

    # k-th smallest distance per counter without fully sorting each row.
    kth_distance = np.partition(distances, k - 1, axis=1)[:, k - 1]

    return pd.DataFrame({
        'counter_id': counters['counter_id'].to_numpy(),
        'closest_metro_distance': kth_distance,
    })

In [48]:
def CountStationsWithinRadius(X, idf_stations, radius=0.4):
    """Count, for each counter, the stations lying within ``radius`` kilometres.

    For every distinct ``counter_id`` in ``X`` the great-circle (haversine)
    distance to every row of ``idf_stations`` is computed; stations whose
    distance is less than or equal to ``radius`` are counted.

    Parameters
    ----------
    X : pandas.DataFrame
        Must have ``counter_id``, ``latitude`` and ``longitude`` columns
        (coordinates in decimal degrees). When a counter appears on several
        rows, the coordinates of its first row are used.
    idf_stations : pandas.DataFrame
        Must have ``latitude`` and ``longitude`` columns (decimal degrees),
        one row per station.
    radius : float, default 0.4
        Search radius in kilometres; the bound is inclusive.

    Returns
    -------
    pandas.DataFrame
        One row per counter, in order of first appearance in ``X``, indexed
        0..n-1, with columns ``counter_id`` and
        ``num_stations_within_radius`` (integer counts).
    """
    # One row per counter (the first occurrence), found in a single pass
    # instead of re-filtering X for every counter.
    counters = X.drop_duplicates(subset='counter_id')

    # Mean Earth radius in km; same constant the `haversine` package uses,
    # so results agree with hs.haversine up to floating-point rounding.
    earth_radius_km = 6371.0088

    # Column vectors for counters, row vectors for stations: broadcasting
    # yields an (n_counters, n_stations) distance matrix.
    counter_lat = np.radians(counters['latitude'].to_numpy(dtype=float))[:, np.newaxis]
    counter_lon = np.radians(counters['longitude'].to_numpy(dtype=float))[:, np.newaxis]
    station_lat = np.radians(idf_stations['latitude'].to_numpy(dtype=float))[np.newaxis, :]
    station_lon = np.radians(idf_stations['longitude'].to_numpy(dtype=float))[np.newaxis, :]

    hav = (
        np.sin((station_lat - counter_lat) / 2.0) ** 2
        + np.cos(counter_lat) * np.cos(station_lat)
        * np.sin((station_lon - counter_lon) / 2.0) ** 2
    )
    # Clip guards against rounding pushing the value marginally outside [0, 1].
    distances = 2.0 * earth_radius_km * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))

    # Row-wise count of stations inside the radius (NaN distances never match).
    num_within = (distances <= radius).sum(axis=1)

    return pd.DataFrame({
        'counter_id': counters['counter_id'].to_numpy(),
        'num_stations_within_radius': num_within,
    })

In [49]:
# Count, for every counter in bike_data, the stations within the default radius (0.4).
df = CountStationsWithinRadius(bike_data, idf_stations)
df

Unnamed: 0,counter_id,num_stations_within_radius
0,100007049-102007049,1
0,100007049-101007049,1
0,100036718-104036718,1
0,100036718-103036718,1
0,100036719-104036719,3
0,100036719-103036719,3
0,100042374-110042374,4
0,100042374-109042374,4
0,100044493-SC,4
0,100047542-103047542,1
