## Citibikes data processing: Part 3

Objectives:

- Count daily station use:
  - date
  - station id
  - start count
  - end count

- Get station data in separate table:
  - name
  - id
  - longitude
  - latitude




In [1]:
import pandas as pd
from pathlib import Path

In [2]:
def get_dateformat(yearmonth, a_date):
    # date format is different btw 201409 and 201609
    if (yearmonth >= 201409 and yearmonth <= 201609):
        date_format = '%m/%d/%Y'
    else:
        date_format = '%Y-%m-%d'
    # check if missing seconds, add time format
    if len(a_date) < 16:
        date_format = date_format + ' %H:%M'
    else:
        date_format = date_format + ' %H:%M:%S'
    return date_format

In [3]:
# def get_station_data(year, month):
#     # number with year and month together
#     yearmonth = (year*100) + month
#     # define path to csv
#     file_path = Path('Resources', f'{yearmonth}-citibike-tripdata.csv')
#     # new structure started on 202102
#     new_structure = (yearmonth >= 202102)
#     if new_structure:
#         # what cols to use
#         # 1: rideable_type,
#         # 2:started_at,3:ended_at,4:start_station_name,5:start_station_id,6:end_station_name,7:end_station_id,8:start_lat,9:start_lng,10:end_lat,11:end_lng,12:member_casual
#         use_columns = [1, 2, 3, 12]
#         start_col = 'started_at'
#         end_col = 'ended_at'
#     else:
#         # 1"starttime",2"stoptime",3"start station id",4"start station name",5"start station latitude",6"start station longitude",7"end station id",8"end station name",9"end station latitude",10"end station longitude",12"usertype"
#         use_columns = [0, 1, 2, 12]
#         start_col = 'starttime'
#         end_col = 'stoptime'


In [4]:
def agg_month(year, month):
    # number with year and month together
    yearmonth = (year*100) + month
    # define path to csv
    file_path = Path('Resources', f'{yearmonth}-citibike-tripdata.csv')
    # column names
    col_names = ['date', 'station_id_start', 'station_id_end', 'user_type']
    # new structure started on 202102
    new_structure = (yearmonth >= 202102)
    if new_structure:
        # what cols to use
        # 1: rideable_type,
        # 2:started_at,
        # 5:start_station_id,
        # 7:end_station_id,
        # 12:member_casual
        use_columns = [1, 2, 5, 7, 12]
        col_names = ['rideable_type'] + col_names
    else:
        # 1"starttime",
        # 3"start station id",
        # 7"end station id",
        # 12"usertype"
        use_columns = [1, 3, 7, 12]
    # read csv with desired columns and drop NAs
    df = pd.read_csv(
        file_path,
        header=0,
        usecols=use_columns,
        names=col_names,
        dtype='object'
    ).dropna()
    # get date format from the first record of start time column
    date_format = get_dateformat(yearmonth, df['date'][0])
    # convert to date type and get date part only
    df['date'] = pd.to_datetime(
        df['date'], format=date_format
    ).dt.date
    # add missing columns in old structure
    if not new_structure:
        df['rideable_type'] = 'classic_bike'
    # melt df station id column
    df = (df
        .melt(
            id_vars=['date', 'user_type', 'rideable_type'],
            var_name='start_or_end',
            value_name='station_id'
        )
    )
    df['start_or_end'] = df['start_or_end'].str[11:]

    # get counts of each combination of col values
    df = (df
        .value_counts()
        .reset_index(drop=False, name='trip_count')
    )
    return df

In [5]:
def agg_year(year):
    # 2013 starts in june (6)
    if year == 2013:
        first_month = 6
    else:
        first_month = 1
    # create empty df to concat months
    df = pd.DataFrame()
    # loop through all months
    for month in range(first_month, 13):
        print(f'  Aggregating month: {month}')
        # run agg_month for each month
        month_df = agg_month(year, month)
        # concatenate with all months
        df = pd.concat(
            objs=[df, month_df],
            ignore_index=True,
            names=month_df.index.names
        )
    return df


In [6]:
# create empty df to concat months
df = pd.DataFrame()
# loop through all years
for year in range(2013, 2024):
    print(f'Aggregating year: {year}')
    # run agg_year for each year's data
    year_df = agg_year(year)
    print(f'  --\n  {len(year_df)} rows in {year}')
    # concatenate with all years
    df = pd.concat(
        objs=[df, year_df],
        ignore_index=True,
        names=year_df.index.names
    )
    print(f'  {len(df)} total rows so far\n  --')
print(f'Done! {len(df)} final rows')


Aggregating year: 2013
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  Aggregating month: 9
  Aggregating month: 10
  Aggregating month: 11
  Aggregating month: 12
  --
  259561 rows in 2013
  259561 total rows so far
  --
Aggregating year: 2014
  Aggregating month: 1
  Aggregating month: 2
  Aggregating month: 3
  Aggregating month: 4
  Aggregating month: 5
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  Aggregating month: 9
  Aggregating month: 10
  Aggregating month: 11
  Aggregating month: 12
  --
  402762 rows in 2014
  662323 total rows so far
  --
Aggregating year: 2015
  Aggregating month: 1
  Aggregating month: 2
  Aggregating month: 3
  Aggregating month: 4
  Aggregating month: 5
  Aggregating month: 6
  Aggregating month: 7
  Aggregating month: 8
  Aggregating month: 9
  Aggregating month: 10
  Aggregating month: 11
  Aggregating month: 12
  --
  470800 rows in 2015
  1133123 total rows so far
  --
Aggregating year: 2016
  Aggreg

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16868694 entries, 0 to 16868693
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   date           object 
 1   user_type      object 
 2   rideable_type  object 
 3   start_or_end   object 
 4   station_id     object 
 5   trip_count     float64
dtypes: float64(1), object(5)
memory usage: 772.2+ MB


In [8]:
df.to_csv('201306-202312-citibike-tripdata-station_counts.csv', index=False, header=True)

In [9]:
df.nunique()

date             3857
user_type           4
rideable_type       3
start_or_end        2
station_id       3853
trip_count        807
dtype: int64