# **<u>Cleaning the velib emplacement des stations data:</u>**

In [21]:
import pandas as pd

## **<u>Loading the data using Pandas:</u>**

In [22]:
# Load the station export; the file uses ';' as its field separator.
df = pd.read_csv(
    '../external_data/velib-emplacement-des-stations.csv',
    sep=';',
)

In [23]:
# (rows, columns) of the freshly loaded station table.
df.shape

(1468, 4)

In [24]:
# Preview the first rows to see which columns the file provides.
df.head()

Unnamed: 0,Identifiant station,Nom de la station,Capacité de la station,Coordonnées géographiques
0,17026,Jouffroy d'Abbans - Wagram,40,"48.881973298352, 2.301132157445"
1,5016,Thouin - Cardinal Lemoine,17,"48.84504716661511, 2.3494647851273465"
2,15068,Général Martial Valin - Pont du Garigliano,16,"48.83823094269141, 2.270506024360657"
3,10115,Granges aux Belles,27,"48.8761373390584, 2.3680844979417"
4,10029,Dunkerque - Rocroy,24,"48.880726, 2.351464"


## **<u>Checking for null values:</u>**

Overall null values:

In [25]:
# Number of missing values in each column, largest count first.
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

Identifiant station          0
Nom de la station            0
Capacité de la station       0
Coordonnées géographiques    0
dtype: int64

-> The cell above shows that the dataset contains no null values. The next step is to review each column and decide whether it is relevant to our analysis.

## **<u>Checking for the relevance of the columns:</u>**

In [26]:
# Look at the columns once more before deciding which ones to drop.
df.head()

Unnamed: 0,Identifiant station,Nom de la station,Capacité de la station,Coordonnées géographiques
0,17026,Jouffroy d'Abbans - Wagram,40,"48.881973298352, 2.301132157445"
1,5016,Thouin - Cardinal Lemoine,17,"48.84504716661511, 2.3494647851273465"
2,15068,Général Martial Valin - Pont du Garigliano,16,"48.83823094269141, 2.270506024360657"
3,10115,Granges aux Belles,27,"48.8761373390584, 2.3680844979417"
4,10029,Dunkerque - Rocroy,24,"48.880726, 2.351464"


In [27]:
# Drop the station identifier and station name columns.
# Note: this rebinds `df`, so re-running the cell without reloading the CSV
# raises a KeyError because the columns are already gone.
unused_columns = ['Identifiant station', 'Nom de la station']
df = df.drop(columns=unused_columns)

In [28]:
# Confirm that only the capacity and coordinates columns remain.
df.head()

Unnamed: 0,Capacité de la station,Coordonnées géographiques
0,40,"48.881973298352, 2.301132157445"
1,17,"48.84504716661511, 2.3494647851273465"
2,16,"48.83823094269141, 2.270506024360657"
3,27,"48.8761373390584, 2.3680844979417"
4,24,"48.880726, 2.351464"


## **<u>Modifying the remaining columns:</u>**

First we remove the Coordonnées géographiques column and replace it with latitude and longitude columns:

In [29]:
# Each coordinates cell is a "lat, lon" string: split it on the comma into two
# text columns, then cast both to float in one step (the float conversion
# tolerates the space left after the comma).
coordinate_parts = df['Coordonnées géographiques'].str.split(',', expand=True)
df[['latitude', 'longitude']] = coordinate_parts.astype(float)

# The combined text column is no longer needed once it has been split.
df = df.drop(columns=['Coordonnées géographiques'])

In [30]:
# Check the new numeric latitude / longitude columns.
df.head()

Unnamed: 0,Capacité de la station,latitude,longitude
0,40,48.881973,2.301132
1,17,48.845047,2.349465
2,16,48.838231,2.270506
3,27,48.876137,2.368084
4,24,48.880726,2.351464


## **<u>Creating a csv file with the cleaned data:</u>**

In [31]:
# Write the cleaned station table next to the notebook, without the index column.
cleaned_csv_path = "info_velib_v1.csv"
df.to_csv(cleaned_csv_path, index=False)

## **<u>Merging the two datasets:</u>**

In [11]:
# Load the dataset that the station features will be attached to.
# NOTE(review): this path climbs three directories above the notebook, unlike
# the station CSV loaded from ../external_data — confirm it resolves on a
# fresh checkout.
df_original = pd.read_csv('../../../df_original.csv')

In [38]:
from sklearn.neighbors import BallTree
import numpy as np


# Search radius around each location (in meters) and the Earth radius used to
# turn that distance into an angle.
SEARCH_RADIUS_M = 200
EARTH_RADIUS_M = 6_371_000  # approximately 6371 km

# Convert lat/lon from degrees to radians, as required by the haversine metric
lat_lon_stations = np.deg2rad(df[['latitude', 'longitude']].values)
lat_lon_original = np.deg2rad(df_original[['latitude', 'longitude']].values)

# Create a BallTree with station coordinates
tree = BallTree(lat_lon_stations, metric='haversine')

# Haversine distances are angles on the unit sphere, so the radius is
# expressed in radians: distance / Earth radius.
radius = SEARCH_RADIUS_M / EARTH_RADIUS_M

# df_original repeats the same coordinates on a very large number of rows, so
# query the tree once per distinct location and broadcast the results back to
# the rows, instead of running one query and one pandas sum per row.
unique_lat_lon, location_of_row = np.unique(
    lat_lon_original, axis=0, return_inverse=True
)
# Flatten defensively: the shape of the inverse index differs between NumPy releases.
location_of_row = location_of_row.reshape(-1)
indices_per_location = tree.query_radius(unique_lat_lon, r=radius)

# Sum of capacities and number of stations within the radius, per distinct location
station_capacity = df['Capacité de la station']
capacity_per_location = np.array(
    [station_capacity.iloc[idx].sum() for idx in indices_per_location]
)
count_per_location = np.array(
    [len(idx) for idx in indices_per_location], dtype=np.int64
)

# Per-row neighbour index arrays, kept under the original name for later cells
indices = indices_per_location[location_of_row]

# Broadcast the per-location aggregates back to every row of df_original
df_original['total_nearby_station_capacity'] = capacity_per_location[location_of_row]
df_original['number_of_nearby_stations'] = count_per_location[location_of_row]


In [39]:
# Display the enriched frame; pandas shows only the first and last rows.
df_original

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,total_nearby_station_capacity,number_of_nearby_stations
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,99,2
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,99,2
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,99,2
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,99,2
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,99,2
...,...,...,...,...,...,...,...,...,...,...,...,...
496822,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-09-09 06:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,102,2
496823,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-09-09 10:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,102,2
496824,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-09-09 15:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,102,2
496825,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-09-09 22:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,102,2


In [40]:
# Distinct values per column, including the two new nearby-station features.
df_original.nunique()

counter_id                         56
counter_name                       56
site_id                            30
site_name                          30
date                             8974
counter_installation_date          22
coordinates                        30
counter_technical_id               30
latitude                           30
longitude                          30
total_nearby_station_capacity      21
number_of_nearby_stations           6
dtype: int64