## Citibikes data processing v2

**Notes:**

2013 data until now

Dataset 1 -- is bikeshare more popular from 2013 until now
- year
- month
- rideable type
- member_casual
- binned trip duration in mins (calculate in python)
- binned trip distance in meters (calculate in python)
- count(ride_id)
  


In [7]:
import pandas as pd
from pathlib import Path
import datetime

In [46]:
def get_tripduration(started_at, ended_at):
        # started_at = pd.to_datetime(started_at, yearfirst=True)
        # ended_at = pd.to_datetime(ended_at, yearfirst=True)
        tripduration = (ended_at - started_at) / pd.Timedelta(seconds=1)
        return tripduration.astype('int32')


In [81]:
def agg_month(year, month):
    # define path to csv
    file_path = Path('Resources', f'{year}{month:02}-citibike-tripdata.csv')
    new_structure = (year >= 2021 and month >= 2)
    if new_structure:
        use_columns = [1, 2, 3, 4, 6, 12]
        start_col = 'started_at'
        end_col = 'ended_at'
    else:
        use_columns = [0, 1, 2, 12]
        start_col = 'starttime'
        end_col = 'stoptime'

    df = pd.read_csv(
        file_path, 
        usecols=use_columns
    ).dropna()
    df[start_col] = pd.to_datetime(df[start_col], yearfirst=True)
    df[end_col] = pd.to_datetime(df[end_col], yearfirst=True)
    
    if new_structure:
        df['trip_duration'] = get_tripduration(df[start_col], df[end_col])
        df.rename(columns={'member_casual': 'user_type'}, inplace=True)
    else:
        df['rideable_type'] = 'classic_bike'
        df.rename(
            columns={
                'usertype': 'user_type',
                'tripduration': 'trip_duration'
            },
            inplace=True
        )
    df = df.loc[df['trip_duration'] > 0]
    df['trip_duration'] = df['trip_duration'] / 60
    df['date'] = df[start_col].dt.date

    # df = df[['date', 'trip_duration', 'user_type', 'rideable_type']]

    # df['duration_bin'] = pd.cut(
    #     df['trip_duration'],
    #     bins= 5
    # )
    
    # df = (df
    #     .groupby([
    #         'usertype',
    #         'rideable_type'
    #     ], as_index=False, sort=False)
    #     .value_counts()
    # )
    return df

In [131]:
test_df = agg_month(2018, 11)

In [132]:
test_df.loc[test_df['trip_duration'] > 1440]

Unnamed: 0,trip_duration,starttime,stoptime,user_type,rideable_type,date
11294,5648.216667,2018-11-01 08:32:54.001,2018-11-05 05:41:07.922,Subscriber,classic_bike,2018-11-01
23180,1904.866667,2018-11-01 11:11:07.263,2018-11-02 18:55:59.659,Customer,classic_bike,2018-11-01
24960,5589.633333,2018-11-01 11:47:32.794,2018-11-05 07:57:10.993,Subscriber,classic_bike,2018-11-01
26790,4192.366667,2018-11-01 12:20:42.564,2018-11-04 09:13:04.618,Customer,classic_bike,2018-11-01
29712,18864.050000,2018-11-01 13:11:32.966,2018-11-14 14:35:36.603,Subscriber,classic_bike,2018-11-01
...,...,...,...,...,...,...
1246447,1945.833333,2018-11-30 17:11:12.082,2018-12-02 01:37:02.511,Subscriber,classic_bike,2018-11-30
1246808,1449.333333,2018-11-30 17:16:15.195,2018-12-01 17:25:35.288,Customer,classic_bike,2018-11-30
1251977,2888.816667,2018-11-30 18:40:25.989,2018-12-02 18:49:15.545,Subscriber,classic_bike,2018-11-30
1254450,3991.950000,2018-11-30 19:31:34.951,2018-12-03 14:03:32.830,Subscriber,classic_bike,2018-11-30


In [144]:
pd.cut(
        test_df['trip_duration'],
        bins= (
            list(range(0, 60, 10)) +
            list(range(60, 1440, 60)) +
            list(range(1440, (1440*8), 1440))
        ),
        right=False, include_lowest=True
    ).value_counts(sort=False, dropna=False)

[0.0, 10.0)          675391
[10.0, 20.0)         371936
[20.0, 30.0)         132037
[30.0, 40.0)          50429
[40.0, 50.0)          16977
[50.0, 60.0)           4634
[60.0, 120.0)          6038
[120.0, 180.0)         1040
[180.0, 240.0)          481
[240.0, 300.0)          243
[300.0, 360.0)          148
[360.0, 420.0)          112
[420.0, 480.0)           73
[480.0, 540.0)           68
[540.0, 600.0)           56
[600.0, 660.0)           47
[660.0, 720.0)           33
[720.0, 780.0)           36
[780.0, 840.0)           40
[840.0, 900.0)           37
[900.0, 960.0)           29
[960.0, 1020.0)          21
[1020.0, 1080.0)         18
[1080.0, 1140.0)         26
[1140.0, 1200.0)         17
[1200.0, 1260.0)         28
[1260.0, 1320.0)         22
[1320.0, 1380.0)         17
[1380.0, 1440.0)         22
[1440.0, 2880.0)        150
[2880.0, 4320.0)         54
[4320.0, 5760.0)         28
[5760.0, 7200.0)         19
[7200.0, 8640.0)          9
[8640.0, 10080.0)        14
NaN                 

In [None]:
def agg_year(year):
    # 2013 starts in june (6)
    if year == 2013:
        first_month = 6
    else:
        first_month = 1
    # loop through all months
    for month in range(first_month, 8):
        # run agg_month for each month
        month_df = agg_month(year, month)


In [4]:
# loop through all years
for year in range(2013, 2014):
    # run agg_year for each year's data
    year_df = agg_year(year)


tripduration
tripduration


In [None]:
# 202101: last month with 'tripduration' data structure

In [5]:
working_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 843416 entries, 0 to 843415
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   tripduration             843416 non-null  int64  
 1   starttime                843416 non-null  object 
 2   stoptime                 843416 non-null  object 
 3   start station id         843416 non-null  int64  
 4   start station name       843416 non-null  object 
 5   start station latitude   843416 non-null  float64
 6   start station longitude  843416 non-null  float64
 7   end station id           843416 non-null  int64  
 8   end station name         843416 non-null  object 
 9   end station latitude     843416 non-null  float64
 10  end station longitude    843416 non-null  float64
 11  bikeid                   843416 non-null  int64  
 12  usertype                 843416 non-null  object 
 13  birth year               843416 non-null  object 
 14  gend