## Citibikes data processing v2

**Notes:**

2013 data until now

Dataset 1 -- is bikeshare more popular from 2013 until now
- year
- month
- rideable type
- member_casual
- binned trip duration in mins (calculate in python)
- binned trip distance in meters (calculate in python)
- count(ride_id)
  


In [2]:
import pandas as pd
from pathlib import Path
import datetime

In [3]:
def get_tripduration(started_at, ended_at):
        # started_at = pd.to_datetime(started_at, yearfirst=True)
        # ended_at = pd.to_datetime(ended_at, yearfirst=True)
        tripduration = (ended_at - started_at) / pd.Timedelta(seconds=1)
        return tripduration.astype('int32')


In [29]:
def bin_tripduration(trip_duration_col):
    trip_duration_bins = (pd
        .cut(
            trip_duration_col,
            bins=(
                # first hour divided by 10 min
                list(range(0, 60, 10)) +
                # first day divided by hour
                list(range(60, 1440, 60)) +
                # days division up to days 7 days
                list(range(1440, (1440*7)+1, 1440)) +
                # over 7 days, up to a year, are the last bin
                [(1440*366)+1]
            ),
            # labels are the bottom limit of each bin
            labels=(
                # first hour divided by 10 min
                list(range(0, 60, 10)) +
                # first day divided by hour
                list(range(60, 1440, 60)) +
                # days division up to days 7 days
                list(range(1440, (1440*7)+1, 1440))
            ),
            right=False,
            include_lowest=True
        )
        # fill category for values over a year
        .fillna(1440*366)
    )
    return trip_duration_bins

In [54]:
def agg_month(year, month):
    # define path to csv
    file_path = Path('Resources', f'{year}{month:02}-citibike-tripdata.csv')
    new_structure = (year >= 2021 and month >= 2)
    if new_structure:
        # what cols to use
        # note: start and end station names are 4 and 6
        use_columns = [1, 2, 3, 12]
        start_col = 'started_at'
        end_col = 'ended_at'
    else:
        use_columns = [0, 1, 2, 12]
        start_col = 'starttime'
        end_col = 'stoptime'

    df = pd.read_csv(
        file_path, 
        usecols=use_columns
    ).dropna()
    df[start_col] = pd.to_datetime(df[start_col], yearfirst=True)
    df[end_col] = pd.to_datetime(df[end_col], yearfirst=True)
    
    if new_structure:
        df['trip_duration'] = get_tripduration(df[start_col], df[end_col])
        df.rename(columns={'member_casual': 'user_type'}, inplace=True)
    else:
        df['rideable_type'] = 'classic_bike'
        df.rename(
            columns={
                'usertype': 'user_type',
                'tripduration': 'trip_duration'
            },
            inplace=True
        )
    # only keep trip_durations that are over 0 (neg values are errors)
    df = df.loc[df['trip_duration'] > 0]
    # convert seconds to minutes
    df['trip_duration'] = df['trip_duration'] / 60
    # get date from start column
    df['date'] = df[start_col].dt.date
    # drop start and end columns
    df = df.drop(columns=[start_col, end_col])
    # convert trip_duration to binned categories
    df['trip_duration'] = bin_tripduration(df['trip_duration'])
    # get counts of each combination of col values
    df = (df
        .value_counts()
        .reset_index(drop=False, name='trip_count')
    )
    return df

In [80]:
def agg_year(year):
    # 2013 starts in june (6)
    if year == 2013:
        first_month = 6
    else:
        first_month = 1
    # create empty df to concat months
    df = pd.DataFrame()
    # loop through all months
    for month in range(first_month, 3):
        # run agg_month for each month
        month_df = agg_month(year, month)
        # print(month_df.head())
        # concatenate month_df with total df
        df = pd.concat([df, month_df], names=month_df.index.names)
        print(df.head())
        # print(month_df.index.names)
    # return df


In [81]:
# loop through all years
for year in range(2021, 2022):
    # run agg_year for each year's data
    year_df = agg_year(year)


                                                 0
(0, Subscriber, classic_bike, 2021-01-14)  22306.0
(0, Subscriber, classic_bike, 2021-01-22)  20746.0
(0, Subscriber, classic_bike, 2021-01-13)  20460.0
(0, Subscriber, classic_bike, 2021-01-12)  20239.0
(0, Subscriber, classic_bike, 2021-01-19)  19975.0
                                                 0
(0, Subscriber, classic_bike, 2021-01-14)  22306.0
(0, Subscriber, classic_bike, 2021-01-22)  20746.0
(0, Subscriber, classic_bike, 2021-01-13)  20460.0
(0, Subscriber, classic_bike, 2021-01-12)  20239.0
(0, Subscriber, classic_bike, 2021-01-19)  19975.0


In [55]:
test_df = agg_month(2021, 11)

In [None]:
# 202101: last month with 'tripduration' data structure

In [5]:
working_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 843416 entries, 0 to 843415
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   tripduration             843416 non-null  int64  
 1   starttime                843416 non-null  object 
 2   stoptime                 843416 non-null  object 
 3   start station id         843416 non-null  int64  
 4   start station name       843416 non-null  object 
 5   start station latitude   843416 non-null  float64
 6   start station longitude  843416 non-null  float64
 7   end station id           843416 non-null  int64  
 8   end station name         843416 non-null  object 
 9   end station latitude     843416 non-null  float64
 10  end station longitude    843416 non-null  float64
 11  bikeid                   843416 non-null  int64  
 12  usertype                 843416 non-null  object 
 13  birth year               843416 non-null  object 
 14  gend