# Citibikes data processing

## Part 2: Monthly station use

Objectives:

- Count daily station use:
  - date
  - station id
  - start count
  - end count




In [1]:
import pandas as pd
from pathlib import Path

In [6]:
def agg_month(year, month):
    # number with year and month together
    yearmonth = (year*100) + month
    # define path to csv
    file_path = Path('Resources', f'{yearmonth}-citibike-tripdata.csv')
    # column names
    col_names = ['station_name_start', 'station_name_end', 'user_type']
    # new structure started on 202102
    new_structure = (yearmonth >= 202102)
    if new_structure:
        # what cols to use
        # 1:rideable_type,
        # 5:start_station_name,
        # 7:end_station_name,
        # 12:member_casual
        use_columns = [1, 4, 6, 12]
        col_names = ['rideable_type'] + col_names
    else:
        # 3"start station name",
        # 7"end station name",
        # 12"usertype"
        use_columns = [4, 8, 12]
    # read csv with desired columns and drop NAs
    df = pd.read_csv(
        file_path,
        header=0,
        usecols=use_columns,
        names=col_names,
        dtype='object'
    ).dropna()
    # add missing columns in old structure
    if not new_structure:
        df['rideable_type'] = 'classic_bike'
    # melt df station id column
    df = (df
        .melt(
            id_vars=['user_type', 'rideable_type'],
            var_name='start_or_end',
            value_name='station_name'
        )
    )
    df['start_or_end'] = df['start_or_end'].str[11:]

    # get counts of each combination of col values
    df = (df
        .value_counts()
        .reset_index(drop=False, name='trip_count')
    )
    # add date column as first day of month
    df['date'] = f'01-{month:02}-{year}'
    return df

In [7]:
def agg_year(year):
    # 2013 starts in june (6)
    if year == 2013:
        first_month = 6
    else:
        first_month = 1
    # create empty df to concat months
    df = pd.DataFrame()
    # loop through all months
    for month in range(first_month, 3):
        print(f'  Aggregating month: {month}')
        # run agg_month for each month
        month_df = agg_month(year, month)
        # concatenate with all months
        df = pd.concat(
            objs=[df, month_df],
            ignore_index=True,
            names=month_df.index.names
        )
    return df


In [8]:
# create empty df to concat months
df = pd.DataFrame()
# loop through all years
for year in range(2021, 2022):
    print(f'Aggregating year: {year}')
    # run agg_year for each year's data
    year_df = agg_year(year)
    print(f'  --\n  {len(year_df)} rows in {year}')
    # concatenate with all years
    df = pd.concat(
        objs=[df, year_df],
        ignore_index=True,
        names=year_df.index.names
    )
    print(f'  {len(df)} total rows so far\n  --')
print(f'Done! {len(df)} final rows')


Aggregating year: 2021
  Aggregating month: 1
  Aggregating month: 2
  --
  9984 rows in 2021
  9984 total rows so far
  --
Done! 9984 final rows


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9984 entries, 0 to 9983
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   user_type      9984 non-null   object 
 1   rideable_type  9984 non-null   object 
 2   start_or_end   9984 non-null   object 
 3   station_id     9984 non-null   object 
 4   trip_count     9984 non-null   float64
 5   date           9984 non-null   object 
dtypes: float64(1), object(5)
memory usage: 468.1+ KB


In [10]:
df.nunique()

user_type           4
rideable_type       2
start_or_end        2
station_id       2514
trip_count       1678
date                2
dtype: int64

In [17]:
df.loc[df['station_id'].str.contains('822')]

Unnamed: 0,user_type,rideable_type,start_or_end,station_id,trip_count,date
2702,Subscriber,classic_bike,start,3822.0,130.0,01-01-2021
2738,Subscriber,classic_bike,end,3822.0,124.0,01-01-2021
3223,Customer,classic_bike,end,3822.0,83.0,01-01-2021
3348,Customer,classic_bike,start,3822.0,74.0,01-01-2021
4881,member,docked_bike,start,6822.09,4179.0,01-02-2021
4882,member,docked_bike,end,6822.09,4105.0,01-02-2021
6210,casual,docked_bike,end,6822.09,241.0,01-02-2021
6423,casual,docked_bike,start,6822.09,195.0,01-02-2021
6760,member,docked_bike,end,8226.05,146.0,01-02-2021
6818,member,docked_bike,start,8226.05,137.0,01-02-2021


In [8]:
# df.to_csv('201306-202312-citibike-tripdata-station_counts.csv', index=False, header=True)

In [19]:
station_df = pd.read_csv('201306-202312-citibike-tripdata-station_info.csv', index_col='id')
station_df.head()

Unnamed: 0_level_0,name,lat,lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3020,NYCBS Test,40.646607,-74.01597
3000,MLSWKiosk,40.755467,-73.986536
404,9 Ave & W 14 St,40.740583,-74.005509
271,Ashland Pl & Hanson Pl,40.685227,-73.978001
3133,E 67 St & Lexington Ave,40.767343,-73.964859


In [21]:
joined_df = df.join(station_df, on='station_id', how='left')
joined_df.head()

Unnamed: 0,user_type,rideable_type,start_or_end,station_id,trip_count,date,name,lat,lon
0,Subscriber,classic_bike,start,3141,7028.0,01-01-2021,1 Ave & E 68 St,40.765005,-73.958185
1,Subscriber,classic_bike,end,3141,6944.0,01-01-2021,1 Ave & E 68 St,40.765005,-73.958185
2,Subscriber,classic_bike,start,435,5757.0,01-01-2021,W 21 St & 6 Ave,40.74174,-73.994156
3,Subscriber,classic_bike,end,435,5739.0,01-01-2021,W 21 St & 6 Ave,40.74174,-73.994156
4,Subscriber,classic_bike,end,497,5259.0,01-01-2021,E 17 St & Broadway,40.73705,-73.990093


In [25]:
joined_df.sort_values(['name', 'date']).head(20)

Unnamed: 0,user_type,rideable_type,start_or_end,station_id,trip_count,date,name,lat,lon
826,Subscriber,classic_bike,start,3496.0,807.0,01-01-2021,1 Ave & E 110 St,40.792327,-73.9383
834,Subscriber,classic_bike,end,3496.0,787.0,01-01-2021,1 Ave & E 110 St,40.792327,-73.9383
2523,Customer,classic_bike,end,3496.0,151.0,01-01-2021,1 Ave & E 110 St,40.792327,-73.9383
2690,Customer,classic_bike,start,3496.0,131.0,01-01-2021,1 Ave & E 110 St,40.792327,-73.9383
5613,member,docked_bike,end,7522.02,534.0,01-02-2021,1 Ave & E 110 St,40.792282,-73.938097
5672,member,docked_bike,start,7522.02,485.0,01-02-2021,1 Ave & E 110 St,40.792282,-73.938097
7366,casual,docked_bike,end,7522.02,91.0,01-02-2021,1 Ave & E 110 St,40.792282,-73.938097
7378,casual,docked_bike,start,7522.02,91.0,01-02-2021,1 Ave & E 110 St,40.792282,-73.938097
28,Subscriber,classic_bike,end,504.0,3552.0,01-01-2021,1 Ave & E 16 St,40.732219,-73.981656
29,Subscriber,classic_bike,start,504.0,3496.0,01-01-2021,1 Ave & E 16 St,40.732219,-73.981656
