# Setting up workspace

In [51]:
import pandas as pd
import glob
import numpy as np
import datetime

We'll be using 12 different datasets, one per month, with data about the `Bicing stations ACTIVITY` in Barcelona from March 2019 to March 2020.
We'll also be using one dataset which contains additional information about the `Bicing Station INFORMATION`.

In [52]:
# Merge every CSV found in the data folder into a single frame; all files share the same structure.

path = 'Data'
# glob returns matches in arbitrary order, so sort them to make the row order of the merged frame reproducible between runs.
files = sorted(glob.glob(path + "/*.csv"))

if not files:
    # Fail early with a readable message instead of pandas' generic "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in '{path}'")

# low_memory=False disables chunked parsing, which could otherwise infer mixed dtypes within a column.
csvs = [pd.read_csv(csv_file, low_memory=False) for csv_file in files]

# No ignore_index here: every file keeps its own row labels, so index values repeat across files.
df = pd.concat(csvs, axis=0, sort=False)

In [53]:
# Inspect the row count, column dtypes and memory footprint of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40947118 entries, 0 to 3518214
Data columns (total 13 columns):
station_id                              int64
num_bikes_available                     int64
num_bikes_available_types.mechanical    int64
num_bikes_available_types.ebike         int64
num_docks_available                     int64
is_installed                            int64
is_renting                              int64
is_returning                            int64
last_reported                           float64
is_charging_station                     bool
status                                  object
last_updated                            int64
ttl                                     int64
dtypes: bool(1), float64(1), int64(10), object(1)
memory usage: 4.0+ GB


# Analyzing and manipulating the dataset

We're analyzing all the information the columns of our datasets are giving us:

### Bicing Stations INFORMATION:

 - 1. `station_id`: Unique id of the Bicing station
 - 2. `name`: Address where the station is located
 - 3. `physical_configuration`: String which defines the kind of station. All entries contain the same string, so this column is not valuable for us.
 - 4. `lat`: Latitude of the station
 - 5. `lon`: Longitude of the station
 - 6. `alt`: Altitude of the station in meters above sea level. 
 - 7. `address`: Address of the station. Since the name of the station is its address, both columns are the same.
 - 8. `post_code`: Postal code of the district in which the station is located. Contains some NA's and wrong values we'll fix. We'll also replace this number with the actual name of the district.
 - 9. `capacity`: Capacity of the station for storing and charging bikes
 - 10. `cross_street`: Name of the street crossing the Station Name street. Not useful for us and it's mostly NA values.
 - 11. `last_updated`: Time of the reading. Currently a Unix timestamp; we'll transform it into a more usable format.
 - 12. `ttl`: Some communication protocol metric, not useful for us
 - 13. `nearby_distance`: Contains no useful data. Mostly NA values
 - 14. `planned_date`: Contains five values. This value is the date in which a station is planned to begin working. Not useful for us. Mostly NA values.

### Counting NA's

In [54]:
def df_total_na(df):
    '''
    Prints the percentage of missing (NULL) cells in the whole dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The result is only printed, so call this function directly instead
        of wrapping it in print().
    '''
    # df.size counts every cell (rows x columns). df.count() only counts the
    # non-null cells, so using it as the denominator overstates the percentage.
    total_cells = df.size
    total_nas = int(df.isna().sum().sum())
    # An empty frame has no cells, hence nothing missing; avoid dividing by zero.
    nas_percentage = total_nas * 100 / total_cells if total_cells else 0.0

    print(f'Our dataset has {round(nas_percentage, 2)}% missing values overall' )
    
# df_total_na prints its own message and returns None, so call it directly (wrapping it in print() emits a stray "None").
df_total_na(df)

Our dataset has 0.0% missing values overall
None


In [55]:
def column_nulls_percentage(df):
    '''
    Returns a Series with the percentage of NULL values in each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; values are percentages (0-100) rounded to
        two decimals.
    '''
    # The mean of the boolean isna() mask is the fraction of NULLs per column.
    null_fraction = df.isna().mean()

    # Scale to a percentage first and round last: rounding before multiplying
    # by 100 lets float noise back in (e.g. 0.07 * 100 -> 7.000000000000001).
    return (null_fraction * 100).round(2)


# Leave the Series as the cell's last expression so the notebook displays it as the cell output.
column_nulls_percentage(df)

station_id                              0.0
num_bikes_available                     0.0
num_bikes_available_types.mechanical    0.0
num_bikes_available_types.ebike         0.0
num_docks_available                     0.0
is_installed                            0.0
is_renting                              0.0
is_returning                            0.0
last_reported                           0.0
is_charging_station                     0.0
status                                  0.0
last_updated                            0.0
ttl                                     0.0
dtype: float64


## Cleaning

In [9]:
# Rearrange the dataset so it's easier to manipulate, starting by dropping the columns we don't need:

col_drops = ['physical_configuration', 'address', 'cross_street', 'nearby_distance', 'planned_date']

# Rebind instead of using inplace=True, and ignore labels that are already gone, so re-running this cell does not raise a KeyError.
# NOTE(review): these labels belong to the station INFORMATION dataset; confirm `df` holds that dataset when this cell runs.
df = df.drop(columns=col_drops, errors='ignore')

In [12]:
# Keep a reduced version of the dataframe (only the columns we need) for faster manipulation; its columns are renamed afterwards.

# .copy() makes df_original an independent frame, so assigning columns to it later does not trigger SettingWithCopyWarning.
df_original = df[['last_updated', 'station_id', 'capacity', 'ttl', 'name', 'post_code', 'lat', 'lon', 'altitude']].copy()

In [13]:
# Positional rename: the labels below are applied in the same order as the columns selected when df_original was built.
# NOTE(review): the fourth label, 'bikes', lands on the column that was selected as 'ttl' — confirm this is the intended source column.
df_original.columns = ['time', 'id', 'capacity', 'bikes', 'address', 'postal_code', 'lat', 'lon', 'alt']

In [62]:
# List the column labels currently present in df.
df.columns

Index(['station_id', 'num_bikes_available',
       'num_bikes_available_types.mechanical',
       'num_bikes_available_types.ebike', 'num_docks_available',
       'is_installed', 'is_renting', 'is_returning', 'last_reported',
       'is_charging_station', 'status', 'last_updated', 'ttl'],
      dtype='object')

In [63]:
# Preview the first rows of df.
df.head()

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,is_installed,is_renting,is_returning,last_reported,is_charging_station,status,last_updated,ttl
0,1,16,16,0,14,1,1,1,1553796000.0,True,IN_SERVICE,1553795923,23
1,2,27,27,0,0,1,1,1,1553796000.0,True,IN_SERVICE,1553795923,23
2,3,20,20,0,7,1,1,1,1553796000.0,True,IN_SERVICE,1553795923,23
3,4,12,12,0,7,1,1,1,1553796000.0,True,IN_SERVICE,1553795923,23
4,5,39,39,0,0,1,1,1,1553796000.0,True,IN_SERVICE,1553795923,23


In [67]:
# Inspect the rows recorded for a single station (station_id 3): row count, dtypes and non-null counts.
df[df['station_id'] == 3].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99787 entries, 2 to 3517807
Data columns (total 13 columns):
station_id                              99787 non-null int64
num_bikes_available                     99787 non-null int64
num_bikes_available_types.mechanical    99787 non-null int64
num_bikes_available_types.ebike         99787 non-null int64
num_docks_available                     99787 non-null int64
is_installed                            99787 non-null int64
is_renting                              99787 non-null int64
is_returning                            99787 non-null int64
last_reported                           99787 non-null float64
is_charging_station                     99787 non-null bool
status                                  99787 non-null object
last_updated                            99787 non-null int64
ttl                                     99787 non-null int64
dtypes: bool(1), float64(1), int64(10), object(1)
memory usage: 10.0+ MB


In [33]:
# Convert the epoch-seconds 'time' column to datetimes. assign() builds a new frame rather than writing into
# df_original in place, which avoids SettingWithCopyWarning because df_original was created as a column subset of df.
# NOTE(review): the resulting datetimes are timezone-naive — presumably UTC; confirm before doing hour-of-day analysis.
df_original = df_original.assign(time=pd.to_datetime(df_original['time'], unit='s'))

In [41]:
# Count the non-null values per column for each timestamp and show the first 50 timestamps.
df_original.groupby('time').count().head(50)

Unnamed: 0_level_0,id,capacity,bikes,address,postal_code,lat,lon,alt
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-03-28 17:58:37,399,399,399,399,399,399,399,399
2019-03-28 18:14:57,399,399,399,399,399,399,399,399
2019-03-28 18:19:50,399,399,399,399,399,399,399,399
2019-03-28 18:24:32,399,399,399,399,399,399,399,399
2019-03-28 18:29:29,399,399,399,399,399,399,399,399
2019-03-28 18:34:50,399,399,399,399,399,399,399,399
2019-03-28 18:39:43,399,399,399,399,399,399,399,399
2019-03-28 18:44:49,399,399,399,399,399,399,399,399
2019-03-28 18:49:58,399,399,399,399,399,399,399,399
2019-03-28 18:54:42,399,399,399,399,399,399,399,399


In [5]:
# NOTE(review): `ratings` is not defined anywhere in the visible notebook, so this cell raises NameError;
# it looks like a leftover snippet and is a candidate for removal.
ratings['timestamp'] = pd.to_datetime(ratings.timestamp, unit='s')

NameError: name 'ratings' is not defined

La idea es hacer una función a la que le des un punto A, un punto B y una hora, y te diga lo siguiente:
- Probabilidad de poder hacer el ride
- Tiempo aproximado que vas a tardar
- Desnivel de la estación origen a estación destino.
