# Citibikes data processing

## Part 2: Monthly station use

Objectives:

- Count daily station use:
  - date
  - station id
  - start count
  - end count




In [1]:
import pandas as pd
from pathlib import Path

In [2]:
def agg_month(year, month):
    # number with year and month together
    yearmonth = (year*100) + month
    # define path to csv
    file_path = Path('Resources', f'{yearmonth}-citibike-tripdata.csv')
    # new structure started on 202102
    new_structure = (yearmonth >= 202102)
    if new_structure:
        # what cols to use
        # 1:rideable_type,
        # 4:start_station_name,
        # 12:member_casual
        use_columns = [1] + list(range(4, 13))
        # column names
        col_names = [
            'rideable_type',
            'name_start',
            'id_start',
            'name_end',
            'id_end',
            'lat_start',
            'lon_start',
            'lat_end',
            'lon_end',
            'user_type'
        ]
    else:
        # 3"start station id",
        # 10"end station lon",
        # 12"usertype"
        use_columns = list(range(3, 11)) + [12]
        # column names
        col_names = [
            'id_start',
            'name_start',
            'lat_start',
            'lon_start',
            'id_end',
            'name_end',
            'lat_end',
            'lon_end',
            'user_type'
        ]
    # read csv with desired columns and drop NAs
    df = pd.read_csv(
        file_path,
        header=0,
        usecols=use_columns,
        names=col_names,
        dtype='object'
    ).dropna()
    # add missing columns in old structure
    if not new_structure:
        df['rideable_type'] = 'classic_bike'
    # set temp index for pivot long
    df['temp'] = df.index
    df = (pd
        .wide_to_long(
            df=df,
            stubnames=['id', 'name', 'lat', 'lon'],
            i=['temp', 'user_type', 'rideable_type'],
            j='start_or_end',
            sep='_',
            suffix='\D+'
        )
        .reset_index(drop=False)
        .drop(columns=['temp'])
    )
    # count station use
    df = (df
        .value_counts()
        .reset_index(drop=False, name='trip_count')
    )
    # make coordinates floats
    for col in ['lat', 'lon']:
        df[col] = df[col].astype('float64')
    # average coordinates to deal with diff precisions
    df = (df
        .groupby(
            ['user_type','rideable_type','start_or_end','id','name'],
            as_index=False
        )
        .agg({'lat': 'mean', 'lon': 'mean', 'trip_count': 'sum'})
    )
    # add date column as first day of month
    df['date'] = f'01-{month:02}-{year}'
    return df

In [3]:
def agg_year(year):
    # 2013 starts in june (6)
    if year == 2013:
        first_month = 6
    else:
        first_month = 1
    # create empty df to concat months
    df = pd.DataFrame()
    # loop through all months
    for month in range(first_month, 13):
        print(f'  Aggregating month: {month}')
        # run agg_month for each month
        month_df = agg_month(year, month)
        # concatenate with all months
        df = pd.concat(
            objs=[df, month_df],
            ignore_index=True,
            names=month_df.index.names
        )
    return df


In [4]:
# create empty df to concat months
df = pd.DataFrame()
# loop through all years
for year in range(2013, 2024):
    print(f'Aggregating year: {year}')
    # run agg_year for each year's data
    year_df = agg_year(year)
    print(f'  --\n  {len(year_df)} rows in {year}')
    # concatenate with all years
    df = pd.concat(
        objs=[df, year_df],
        ignore_index=True,
        names=year_df.index.names
    )
    print(f'  {len(df)} total rows so far\n  --')
# replace values for old structure of user type
df['user_type'] = df['user_type'].replace({
    'Subscriber': 'member',
    'Customer': 'casual'
})
print(f'Done! {len(df)} final rows')


Aggregating year: 2013
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  Aggregating month: 9
  Aggregating month: 10
  Aggregating month: 11
  Aggregating month: 12
  --
  9234 rows in 2013
  9234 total rows so far
  --
Aggregating year: 2014
  Aggregating month: 1
  Aggregating month: 2
  Aggregating month: 3
  Aggregating month: 4
  Aggregating month: 5
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  Aggregating month: 9
  Aggregating month: 10
  Aggregating month: 11
  Aggregating month: 12
  --
  15681 rows in 2014
  24915 total rows so far
  --
Aggregating year: 2015
  Aggregating month: 1
  Aggregating month: 2
  Aggregating month: 3
  Aggregating month: 4
  Aggregating month: 5
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  Aggregating month: 9
  Aggregating month: 10
  Aggregating month: 11
  Aggregating month: 12
  --
  18201 rows in 2015
  43116 total rows so far
  --
Aggregating year: 2016
  Aggregating mon

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 727900 entries, 0 to 727899
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   user_type      727900 non-null  object 
 1   rideable_type  727900 non-null  object 
 2   start_or_end   727900 non-null  object 
 3   id             727900 non-null  object 
 4   name           727900 non-null  object 
 5   lat            727900 non-null  float64
 6   lon            727900 non-null  float64
 7   trip_count     727900 non-null  float64
 8   date           727900 non-null  object 
dtypes: float64(3), object(6)
memory usage: 50.0+ MB


In [6]:
df.nunique()

user_type            2
rideable_type        3
start_or_end         2
id                3853
name              2681
lat              76557
lon              76571
trip_count        8329
date               127
dtype: int64

In [7]:
df.to_csv('201306-202312-citibike-tripdata-station_counts_monthly.csv', index=False, header=True)


---

In [10]:
# To clean vacc site names
df = pd.read_csv(
    Path('..', '201306-202312-citibike-tripdata-station_counts_monthly.csv'),
    dtype='object'
)
for col in ['trip_count', 'lat', 'lon']:
    df[col] = df[col].astype('float64')
df.head()

Unnamed: 0,user_type,rideable_type,start_or_end,id,name,lat,lon,trip_count,date
0,casual,classic_bike,end,116,W 17 St & 8 Ave,40.741776,-74.001497,648.0,01-06-2013
1,casual,classic_bike,end,119,Park Ave & St Edwards St,40.696089,-73.978034,33.0,01-06-2013
2,casual,classic_bike,end,120,Lexington Ave & Classon Ave,40.686768,-73.959282,154.0,01-06-2013
3,casual,classic_bike,end,127,Barrow St & Hudson St,40.731724,-74.006744,999.0,01-06-2013
4,casual,classic_bike,end,128,MacDougal St & Prince St,40.727103,-74.002971,778.0,01-06-2013


In [11]:
# got vaccination sites names from previous eda
vacc_sites = {
    '3 Ave & E 174 St - Bathgate Vaccination Site': '3 Ave & E 174 St',
    '57 St & 1 Ave - Brooklyn Army Terminal Vaccination Site': '57 St & 1 Ave',
    '58 St & 2 Ave - Brooklyn Army Terminal Vaccination Site': '58 St & 2 Ave'
}

df['active_vaccination_site'] = df['name'].isin(vacc_sites.keys())
df['vaccination_site'] = df['name'].isin(vacc_sites.values())

# clean vaccination site station names
df['name'] = df['name'].replace(vacc_sites)

In [12]:
df.nunique()

user_type                      2
rideable_type                  3
start_or_end                   2
id                          3853
name                        2678
lat                        76557
lon                        76571
trip_count                  8329
date                         127
active_vaccination_site        2
vaccination_site               2
dtype: int64

In [13]:
df.to_csv(Path('..', '201306-202312-citibike-tripdata-station_counts_monthly.csv'), index=False, header=True)