## Citibikes data processing v2

**Notes:**

The data covers 2013 until now.

Dataset 1 -- has bikeshare become more popular from 2013 until now? Columns:
- year
- month
- rideable type
- member_casual
- binned trip duration in mins (calculate in python)
- binned trip distance in meters (calculate in python)
- count(ride_id)
  


In [20]:
import pandas as pd
from pathlib import Path
# import datetime

In [2]:
def get_tripduration(started_at, ended_at):
    """Return the elapsed time between start and end as ``int32`` seconds.

    Both arguments are expected to be datetime-typed already: the
    subtraction below relies on it. The difference is divided by a
    one-second ``Timedelta`` to get seconds and then cast to ``int32``.
    """
    one_second = pd.Timedelta(seconds=1)
    elapsed = ended_at - started_at
    seconds = elapsed / one_second
    return seconds.astype('int32')


In [3]:
def bin_tripduration(trip_duration_col):
    """Bin trip durations, given in minutes, into labelled categories.

    Bins are left-closed, ``[lower, upper)``, and every bin is labelled with
    its lower limit:

    - 10-minute bins for the first hour (0, 10, ..., 50)
    - 1-hour bins for the rest of the first day (60, 120, ..., 1380)
    - 1-day bins for the first week (1440, 2880, ..., 8640)
    - a single bin from 7 days up to a 366-day year (10080)

    Every value ``pd.cut`` cannot place in one of those bins is given the
    overflow label ``1440 * 366``. That is meant for durations longer than a
    year, but it also catches negative or missing durations, so filter those
    out beforehand if they should not be counted as "over a year".

    Parameters
    ----------
    trip_duration_col : pandas.Series
        Trip durations in minutes.

    Returns
    -------
    pandas.Series
        Categorical series of bin labels, with no missing values.
    """
    minutes_per_day = 1440
    # label used for everything that falls outside the regular bins
    overflow_label = minutes_per_day * 366
    # lower limit of every bin; doubles as the bin's label
    lower_limits = (
        # first hour divided by 10 min
        list(range(0, 60, 10)) +
        # first day divided by hour
        list(range(60, minutes_per_day, 60)) +
        # days division up to 7 days
        list(range(minutes_per_day, (minutes_per_day * 7) + 1, minutes_per_day))
    )
    trip_duration_bins = pd.cut(
        trip_duration_col,
        # over 7 days, up to a year, is the last bin
        bins=lower_limits + [overflow_label + 1],
        labels=lower_limits,
        right=False,
        include_lowest=True,
    )
    # pd.cut returns a categorical, and a categorical can only be filled with
    # one of its own categories: register the overflow label before filling
    return (trip_duration_bins
        .cat.add_categories([overflow_label])
        .fillna(overflow_label)
    )

In [36]:
def agg_month(year, month):
    """Aggregate one month of trip data into counts per column combination.

    Reads ``Resources/<YYYYMM>-citibike-tripdata.csv``, harmonises the two
    file layouts (a new structure is used from 202102 onwards), keeps trips
    with a positive duration, bins the duration (in minutes) and counts the
    trips for every combination of the remaining column values.

    Parameters
    ----------
    year : int
        Four-digit year of the file to read.
    month : int
        Month number of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per combination of ``rideable_type``, ``user_type``,
        ``trip_duration`` (binned), ``date`` and ``week``, with the number
        of trips in ``trip_count``.
    """
    # number with year and month together
    yearmonth = (year*100) + month
    # define path to csv
    file_path = Path('Resources', f'{yearmonth}-citibike-tripdata.csv')
    # new structure started on 202102
    new_structure = (yearmonth >= 202102)
    if new_structure:
        # what cols to use
        # note: start and end station names are 4 and 6
        use_columns = [1, 2, 3, 12]
        start_col = 'started_at'
        end_col = 'ended_at'
    else:
        use_columns = [0, 1, 2, 12]
        start_col = 'starttime'
        end_col = 'stoptime'
    # read csv with desired columns and drop NAs
    df = pd.read_csv(
        file_path,
        usecols=use_columns
    ).dropna()
    # debug output: raw rows as read from the csv
    print('\n---')
    print(df.head())
    # date format is different between 201409 and 201609
    if 201409 <= yearmonth <= 201609:
        date_format = '%m/%d/%Y %H:%M:%S'
    else:
        date_format = '%Y-%m-%d %H:%M:%S'
    # convert to date format
    for col in [start_col, end_col]:
        df[col] = pd.to_datetime(
            df[col],
            format=date_format
        )
    # debug output: rows after datetime conversion
    print('\n---')
    print(df.head())
    # calc missing columns for each structure type
    # and homogenize names
    if new_structure:
        df['trip_duration'] = get_tripduration(df[start_col], df[end_col])
        df = df.rename(columns={'member_casual': 'user_type'})
    else:
        df['rideable_type'] = 'classic_bike'
        df = df.rename(
            columns={
                'usertype': 'user_type',
                'tripduration': 'trip_duration'
            }
        )
    # only keep trip_durations that are over 0; take an explicit copy so the
    # column assignments below write to an independent frame instead of a
    # slice of the unfiltered one (avoids SettingWithCopyWarning)
    df = df.loc[df['trip_duration'] > 0].copy()
    # convert from seconds to minutes
    df['trip_duration'] = df['trip_duration'] / 60
    # get date and week number from start column
    df['date'] = df[start_col].dt.date
    df['week'] = df[start_col].dt.isocalendar().week
    # debug output: rows with the derived columns
    print('\n---')
    print(df.head())
    # drop start and end columns
    df = df.drop(columns=[start_col, end_col])
    # convert trip_duration to binned categories
    df['trip_duration'] = bin_tripduration(df['trip_duration'])
    # debug output: rows right before aggregation
    print('\n---')
    print(df.head())
    # get counts of each combination of col values
    df = (df
        .value_counts()
        .reset_index(drop=False, name='trip_count')
    )
    return df

In [5]:
def agg_year(year):
    """Aggregate every available month of ``year`` into one DataFrame.

    Calls ``agg_month`` for each month from the first available one (June
    for 2013, January otherwise) through December, printing a progress line
    per month, and stacks the monthly results under a fresh 0..n-1 index.

    Parameters
    ----------
    year : int
        Four-digit year to aggregate.

    Returns
    -------
    pandas.DataFrame
        The monthly aggregates of ``agg_month``, concatenated row-wise.
    """
    # 2013 starts in june (6)
    first_month = 6 if year == 2013 else 1
    # collect the monthly frames and concatenate once at the end: growing a
    # DataFrame inside the loop re-copies every previous row on each pass,
    # and seeding it with an empty DataFrame is deprecated in recent pandas
    monthly_frames = []
    # loop through all months
    for month in range(first_month, 13):
        print(f'  Aggregating month: {month}')
        # run agg_month for each month
        monthly_frames.append(agg_month(year, month))
    return pd.concat(monthly_frames, ignore_index=True)


In [27]:
# Aggregate every year and stack the results into a single DataFrame `df`.
# The yearly frames are collected and concatenated once at the end instead
# of growing `df` inside the loop, which re-copies all previous rows on
# every pass and starts from an empty DataFrame.
yearly_frames = []
# running row count, so progress can be reported without concatenating
total_rows = 0
# loop through all years
for year in range(2013, 2015):
    print(f'Aggregating year: {year}')
    # run agg_year for each year's data
    year_df = agg_year(year)
    print(f'  --\n  {len(year_df)} rows in {year}')
    yearly_frames.append(year_df)
    total_rows += len(year_df)
    print(f'  {total_rows} total rows so far\n  --')
# concatenate all years under a fresh 0..n-1 index
df = pd.concat(yearly_frames, ignore_index=True)
print(f'Done! {len(df)} final rows')


Aggregating year: 2013
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  --
  3638 rows in 2013
  3638 total rows so far
Aggregating year: 2014
  Aggregating month: 1
  Aggregating month: 2
  Aggregating month: 3
  Aggregating month: 4
  Aggregating month: 5
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  --
  5428 rows in 2014
  9066 total rows so far
Done! 9066 final rows
