In [19]:
import pandas as pd
import numpy as np

In [20]:
# Location of the monthly ridership CSV files: "<base><YYYY-MM>.csv".
# NOTE(review): this is an absolute path into one user's home directory, so the
# notebook only runs on that machine as written - consider a configurable data dir.
base = "/Users/ziadharmanani/Documents/GitHub/INF1340-Ridership/Ridership-Data/Bike share ridership "
extension = [
    "2023-01", "2023-02", "2023-03", "2023-04", "2023-05", "2023-06",
    "2023-07", "2023-08", "2023-09", "2023-10", "2023-11", "2023-12",
    ]

# Canonical column order that every monthly frame is aligned to
columns = [
    "Trip Id", "Trip Duration", "Start Station Id", "Start Time",
    "Start Station Name", "End Station Id", "End Time",
    "End Station Name", "Bike Id", "User Type"
    ]

# One file path per month
paths = [base + month + ".csv" for month in extension]

# Used to store the individual data frames
all_dfs = []

# Headers are not consistent between files, and mismatched headers turn into
# all-NaN columns once the frames are aligned, so normalise them first:
#   * the files are decoded as latin1, so a UTF-8 byte-order mark shows up as the
#     characters "ï»¿" glued to a header name; strip it (this already covers
#     "ï»¿Trip Id", so no separate fix-up for that column is needed);
#   * surrounding whitespace is stripped;
#   * "Trip  Duration" (double space) is renamed to "Trip Duration".
for file in paths:
    df_temp = (
        pd.read_csv(file, encoding = "latin1")
        .rename(columns = lambda name: name.replace("ï»¿", "").strip())
        .rename(columns = {"Trip  Duration": "Trip Duration"})
        # Keep exactly the canonical columns, in order. Columns not listed
        # (e.g. "Model") are discarded here; listed columns missing from a file
        # are created filled with NaN.
        .reindex(columns = columns)
    )
    all_dfs.append(df_temp)

# Concatenate the monthly frames into one
df = pd.concat(all_dfs, ignore_index = True)

# "Bike Id" is not used in the analysis. ("Model" never reaches this point: the
# reindex above already removed it.)
df = df.drop(columns = ["Bike Id"])

# Index trips by their id
df = df.set_index("Trip Id")

# NEXT STEPS: Look into Start Station Name and End Station Name.


In [21]:
# Column names of the combined frame, as a plain list
list(df.columns)

['Trip Duration',
 'Start Station Id',
 'Start Time',
 'Start Station Name',
 'End Station Id',
 'End Time',
 'End Station Name',
 'User Type']

In [22]:
# Number of distinct (non-missing) station names at each end of a trip
for station_column in ('Start Station Name', 'End Station Name'):
    print(f"{station_column}: {df[station_column].nunique()}")

Start Station Name: 593
End Station Name: 592


In [23]:
# Count of missing values in each column
df.isnull().sum()

Trip Duration              0
Start Station Id           0
Start Time                 0
Start Station Name    595075
End Station Id          2944
End Time                   0
End Station Name      598563
User Type                  0
dtype: int64

In [24]:
### START TIME
# 'Start Time' holds "<date> <time>" in one string; separate the two parts.
# Naming exactly two columns fails loudly if the split yields any other count.
start_parts = df['Start Time'].str.split(' ', expand=True)
start_parts.columns = ['Start Date', 'Start Time']

# 'Start Time' keeps only the clock part
df['Start Time'] = start_parts['Start Time']

# Parse the date part (month/day/year) and place it at column position 3
start_date = pd.to_datetime(start_parts['Start Date'], format = '%m/%d/%Y')
df.insert(3, 'Start Date', start_date)

In [25]:
### END TIME
# 'End Time' holds "<date> <time>" in one string; separate the two parts.
# Naming exactly two columns fails loudly if the split yields any other count.
end_parts = df['End Time'].str.split(' ', expand=True)
end_parts.columns = ['End Date', 'End Time']

# 'End Time' keeps only the clock part
df['End Time'] = end_parts['End Time']

# Parse the date part (month/day/year) and place it at column position 3
end_date = pd.to_datetime(end_parts['End Date'], format = '%m/%d/%Y')
df.insert(3, 'End Date', end_date)

In [26]:
### START TIME
# Derive day-of-month and month number from 'Start Date' (the date column is kept)
start_date = df['Start Date']
df['Start Day'] = start_date.dt.day
df['Start Month'] = start_date.dt.month

# Move the two new columns to positions 4 and 5
df.insert(4, 'Start Day', df.pop('Start Day'))
df.insert(5, 'Start Month', df.pop('Start Month'))


#change trip duration from seconds to hours:minutes:seconds
def convert(seconds):
    """Format a duration given in seconds as an "H:MM:SS" string.

    Hours are not wrapped at 24: this formats a duration, not a time of day,
    so a trip lasting a day or more keeps its full hour count
    (90000 seconds -> "25:00:00").

    Args:
        seconds: duration in seconds. A fractional part is truncated by the
            "%d" formatting.

    Returns:
        The formatted string, with minutes and seconds zero-padded to two
        digits. A negative duration is rendered as its magnitude with a
        leading "-".
    """
    sign = "-" if seconds < 0 else ""
    minutes, seconds = divmod(abs(seconds), 60)
    hour, minutes = divmod(minutes, 60)

    return "%s%d:%02d:%02d" % (sign, hour, minutes, seconds)


# The convert() call below is left disabled, so 'Trip Duration' stays in raw seconds.
#df['Trip Duration'] = df['Trip Duration'].apply(convert)

# Preview the last five rows
df.tail(5)

Unnamed: 0_level_0,Trip Duration,Start Station Id,Start Time,End Date,Start Day,Start Month,Start Date,Start Station Name,End Station Id,End Time,End Station Name,User Type
Trip Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
26682732,185,7391,23:56,2023-12-31,31,12,2023-12-31,Yonge St / Dundas Sq,7041.0,23:59,Edward St / Yonge St,Casual Member
26682733,802,7366,23:56,2024-01-01,31,12,2023-12-31,Fort York Blvd / Bathurst St SMART,7048.0,00:10,Front St W / Yonge St (Hockey Hall of Fame),Casual Member
26682735,1314,7203,23:57,2024-01-01,31,12,2023-12-31,Bathurst St/Queens Quay(Billy Bishop Airport),7719.0,00:19,,Casual Member
26682737,371,7788,23:58,2024-01-01,31,12,2023-12-31,,7788.0,00:05,,Casual Member
26682738,1271,7298,23:59,2024-01-01,31,12,2023-12-31,Bathurst St / Adelaide St W,7075.0,00:21,Queens Quay W / Dan Leckie Way,Casual Member


In [27]:
### END TIME
# Derive day-of-month and month number from 'End Date' (the date column is kept)
end_date = df['End Date']
df['End Day'] = end_date.dt.day
df['End Month'] = end_date.dt.month

# Move the two new columns to positions 4 and 5
df.insert(4, 'End Day', df.pop('End Day'))
df.insert(5, 'End Month', df.pop('End Month'))


#change trip duration from seconds to hours:minutes:seconds
# NOTE: `convert` is also defined in the previous cell; this later definition is
# the one in effect once this cell has run. TODO: keep a single definition.
def convert(seconds):
    """Format a duration given in seconds as an "H:MM:SS" string.

    Hours are not wrapped at 24: this formats a duration, not a time of day,
    so a trip lasting a day or more keeps its full hour count
    (90000 seconds -> "25:00:00").

    Args:
        seconds: duration in seconds. A fractional part is truncated by the
            "%d" formatting.

    Returns:
        The formatted string, with minutes and seconds zero-padded to two
        digits. A negative duration is rendered as its magnitude with a
        leading "-".
    """
    sign = "-" if seconds < 0 else ""
    minutes, seconds = divmod(abs(seconds), 60)
    hour, minutes = divmod(minutes, 60)

    return "%s%d:%02d:%02d" % (sign, hour, minutes, seconds)


# The convert() call below is left disabled, so 'Trip Duration' stays in raw seconds.
#df['Trip Duration'] = df['Trip Duration'].apply(convert)

# Preview the last five rows
df.tail(5)

Unnamed: 0_level_0,Trip Duration,Start Station Id,Start Time,End Date,End Day,End Month,Start Day,Start Month,Start Date,Start Station Name,End Station Id,End Time,End Station Name,User Type
Trip Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
26682732,185,7391,23:56,2023-12-31,31,12,31,12,2023-12-31,Yonge St / Dundas Sq,7041.0,23:59,Edward St / Yonge St,Casual Member
26682733,802,7366,23:56,2024-01-01,1,1,31,12,2023-12-31,Fort York Blvd / Bathurst St SMART,7048.0,00:10,Front St W / Yonge St (Hockey Hall of Fame),Casual Member
26682735,1314,7203,23:57,2024-01-01,1,1,31,12,2023-12-31,Bathurst St/Queens Quay(Billy Bishop Airport),7719.0,00:19,,Casual Member
26682737,371,7788,23:58,2024-01-01,1,1,31,12,2023-12-31,,7788.0,00:05,,Casual Member
26682738,1271,7298,23:59,2024-01-01,1,1,31,12,2023-12-31,Bathurst St / Adelaide St W,7075.0,00:21,Queens Quay W / Dan Leckie Way,Casual Member


In [28]:
#create weekday/weekend column
# .dt.weekday numbers Monday as 0 through Sunday as 6, so values below 5 are
# Monday-Friday. The comparison is done on the whole column at once instead of
# calling a Python lambda per row. A missing date compares False and is
# therefore labelled 'Weekend', exactly as the per-row version did.
df['Weekday/Weekend'] = np.where(df['Start Date'].dt.weekday < 5, 'Weekday', 'Weekend')

df.head()

Unnamed: 0_level_0,Trip Duration,Start Station Id,Start Time,End Date,End Day,End Month,Start Day,Start Month,Start Date,Start Station Name,End Station Id,End Time,End Station Name,User Type,Weekday/Weekend
Trip Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20148784,840,7022,00:00,2023-01-01,1,1,1,1,2023-01-01,Simcoe St / Queen St W,7703.0,00:14,,Casual Member,Weekend
20148785,722,7399,00:01,2023-01-01,1,1,1,1,2023-01-01,Lower Jarvis / Queens Quay E,7533.0,00:13,Housey St / Dan Leckie Way,Casual Member,Weekend
20148786,1054,7269,00:02,2023-01-01,1,1,1,1,2023-01-01,Toronto Eaton Centre (Yonge St),7076.0,00:20,York St / Queens Quay W,Annual Member,Weekend
20148790,1329,7721,00:04,2023-01-01,1,1,1,1,2023-01-01,,7685.0,00:26,,Casual Member,Weekend
20148791,1291,7721,00:04,2023-01-01,1,1,1,1,2023-01-01,,7685.0,00:26,,Casual Member,Weekend


In [29]:
# Spot-check the Weekday/Weekend labels: draw 10 random trips that are flagged
# 'Weekend' and started in July 2023
is_weekend = df['Weekday/Weekend'] == 'Weekend'
started_in_2023 = df['Start Date'].dt.year == 2023
started_in_july = df['Start Date'].dt.month == 7

df[is_weekend & started_in_2023 & started_in_july].sample(10)

Unnamed: 0_level_0,Trip Duration,Start Station Id,Start Time,End Date,End Day,End Month,Start Day,Start Month,Start Date,Start Station Name,End Station Id,End Time,End Station Name,User Type,Weekday/Weekend
Trip Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
22733847,1463,7785,21:49,2023-07-02,2,7,2,7,2023-07-02,,7785.0,22:13,,Casual Member,Weekend
23078018,861,7253,16:49,2023-07-15,15,7,15,7,2023-07-15,John St / Mercer St - SMART,7074.0,17:03,King St E / Church St,Casual Member,Weekend
23273233,176,7140,16:25,2023-07-22,22,7,22,7,2023-07-22,Macpherson Ave / Spadina Rd,7142.0,16:28,Bridgeman Ave / Bathurst St,Casual Member,Weekend
23274955,196,7543,17:13,2023-07-22,22,7,22,7,2023-07-22,Nassau St / Bellevue Ave,7199.0,17:16,College St / Markham St,Casual Member,Weekend
23452535,3540,7260,02:16,2023-07-29,29,7,29,7,2023-07-29,Spadina Ave / Adelaide St W,7660.0,03:15,285 Victoria St,Casual Member,Weekend
23285523,1477,7286,23:32,2023-07-22,22,7,22,7,2023-07-22,Gerrard St E / Broadview - SMART,7181.0,23:56,Lansdowne Ave / Whytock Ave,Casual Member,Weekend
23456872,442,7246,14:34,2023-07-29,29,7,29,7,2023-07-29,Yonge St / Bloor St,7038.0,14:41,Dundas St W / Yonge St,Casual Member,Weekend
23483361,482,7548,13:54,2023-07-30,30,7,30,7,2023-07-30,St Joseph St / Bay St - SMART,7038.0,14:02,Dundas St W / Yonge St,Casual Member,Weekend
23276611,486,7150,18:01,2023-07-22,22,7,22,7,2023-07-22,Dufferin St / Sylvan Av (Dufferin Grove Park),7240.0,18:09,Bloor St W / Shaw Ave - SMART,Casual Member,Weekend
22727436,297,7760,15:19,2023-07-02,2,7,2,7,2023-07-02,,7556.0,15:24,Windsor St / Newcastle St,Casual Member,Weekend


In [30]:
# Add a human-readable 'Trip Duration (mm:ss)' column next to the raw seconds
def _to_mm_ss(total_seconds):
    """Format whole seconds as "m:ss"; minutes are not capped at 59."""
    minutes, seconds = divmod(total_seconds, 60)
    return f"{minutes}:{seconds:02d}"


df['Trip Duration (mm:ss)'] = df['Trip Duration'].map(_to_mm_ss)

df.sample(10)

Unnamed: 0_level_0,Trip Duration,Start Station Id,Start Time,End Date,End Day,End Month,Start Day,Start Month,Start Date,Start Station Name,End Station Id,End Time,End Station Name,User Type,Weekday/Weekend,Trip Duration (mm:ss)
Trip Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
25267942,555,7432,10:33,2023-10-01,1,10,1,10,2023-10-01,Frederick St / King St E,7795.0,10:42,,Casual Member,Weekend,9:15
22544168,356,7416,11:13,2023-06-25,25,6,25,6,2023-06-25,Spadina Ave / Blue Jays Way,7070.0,11:19,25 York St  Union Station South,Casual Member,Weekend,5:56
23089581,710,7016,10:19,2023-07-16,16,7,16,7,2023-07-16,Bay St / Queens Quay W (Ferry Terminal),7075.0,10:31,Queens Quay W / Dan Leckie Way,Casual Member,Weekend,11:50
22755836,812,7251,19:47,2023-07-03,3,7,3,7,2023-07-03,The Royal Ontario Museum (Bloor St Entrance),7719.0,20:00,,Casual Member,Weekday,13:32
23913246,664,7802,16:53,2023-08-14,14,8,14,8,2023-08-14,,7666.0,17:05,Dundas St W / St Helen Ave - SMART,Casual Member,Weekday,11:04
23960655,1025,7111,12:27,2023-08-16,16,8,16,8,2023-08-16,King St W / Douro St,7032.0,12:44,Augusta Ave / Dundas St W,Casual Member,Weekday,17:05
21527559,535,7386,21:57,2023-05-14,14,5,14,5,2023-05-14,D'Arcy St. /McCaul St. SMART,7721.0,22:06,,Casual Member,Weekend,8:55
26577506,187,7127,14:39,2023-12-16,16,12,16,12,2023-12-16,Bay St / Scollard St,7129.0,14:42,Davenport Rd / Avenue Rd,Casual Member,Weekend,3:07
23568651,567,7389,10:06,2023-08-02,2,8,2,8,2023-08-02,College Park- Gerrard Entrance,7432.0,10:15,Frederick St / King St E,Casual Member,Weekday,9:27
25743646,519,7021,01:55,2023-10-22,22,10,22,10,2023-10-22,Bay St / Albert St,7006.0,02:04,Bay St / College St (East Side),Casual Member,Weekend,8:39


In [31]:
# Hours of the day (24h clock) that we treat as peak periods
peak_morning = [6, 7, 8, 9]
peak_evening = [15, 16, 17, 18, 19]

# Hour of day at which each trip started, parsed from the "HH:MM" strings
hours = pd.to_datetime(df['Start Time'], format='%H:%M').dt.hour

# Label -> hours that earn that label; anything matching neither is "Off Peak"
peak_labels = {"Morning": peak_morning, "Evening": peak_evening}

df['Peak Hour'] = np.select(
    [hours.isin(peak_hours) for peak_hours in peak_labels.values()],
    list(peak_labels),
    default = "Off Peak",
)

df.sample(10)

Unnamed: 0_level_0,Trip Duration,Start Station Id,Start Time,End Date,End Day,End Month,Start Day,Start Month,Start Date,Start Station Name,End Station Id,End Time,End Station Name,User Type,Weekday/Weekend,Trip Duration (mm:ss),Peak Hour
Trip Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
24056247,304,7032,16:47,2023-08-19,19,8,19,8,2023-08-19,Augusta Ave / Dundas St W,7545.0,16:52,Baldwin St / Henry St,Casual Member,Weekend,5:04,Evening
20919214,1652,7526,14:35,2023-04-10,10,4,10,4,2023-04-10,Bartlett Parkette,7462.0,15:03,Richmond St E / Yonge St,Annual Member,Weekday,27:32,Off Peak
26499771,312,7339,07:43,2023-12-10,10,12,10,12,2023-12-10,Carlaw Ave / Strathcona Ave,7431.0,07:49,Gerrard St E / Leslie St,Casual Member,Weekend,5:12,Morning
20282941,400,7044,14:53,2023-01-20,20,1,20,1,2023-01-20,Church St / Alexander St,7021.0,15:00,Bay St / Albert St,Annual Member,Weekday,6:40,Off Peak
21509368,1299,7440,09:14,2023-05-14,14,5,14,5,2023-05-14,Martin Goodman Trail / Ontario Dr,7253.0,09:36,John St / Mercer St - SMART,Casual Member,Weekend,21:39,Morning
25866778,1776,7257,16:50,2023-10-27,27,10,27,10,2023-10-27,Dundas St W / St. Patrick St,7483.0,17:20,Southwood Dr / Kingston Rd - SMART,Casual Member,Weekday,29:36,Evening
22007990,1256,7576,19:13,2023-06-03,3,6,3,6,2023-06-03,Front St E / Bayview Avenue,7542.0,19:34,Queen St W / John St,Casual Member,Weekend,20:56,Evening
24347805,1336,7702,06:32,2023-08-30,30,8,30,8,2023-08-30,,7000.0,06:54,Fort York Blvd / Capreol Ct,Casual Member,Weekday,22:16,Morning
24522316,951,7155,10:02,2023-09-05,5,9,5,9,2023-09-05,Bathurst St / Lennox St,7059.0,10:17,Front St W / Blue Jays Way,Casual Member,Weekday,15:51,Off Peak
21070213,782,7111,07:41,2023-04-18,18,4,18,4,2023-04-18,King St W / Douro St,7023.0,07:54,College St / Borden St,Annual Member,Weekday,13:02,Morning


In [32]:
# Current column names, as a plain list
list(df.columns)

['Trip Duration',
 'Start Station Id',
 'Start Time',
 'End Date',
 'End Day',
 'End Month',
 'Start Day',
 'Start Month',
 'Start Date',
 'Start Station Name',
 'End Station Id',
 'End Time',
 'End Station Name',
 'User Type',
 'Weekday/Weekend',
 'Trip Duration (mm:ss)',
 'Peak Hour']

In [37]:
# Final column layout: duration first, then start-of-trip fields, end-of-trip
# fields, and the rider / derived labels.
order = ['Trip Duration', 'Trip Duration (mm:ss)', 
         'Start Station Id', 'Start Station Name', 'Start Time', 'Start Day', 'Start Month', 'Start Date', 
         'End Station Id', 'End Station Name', 'End Time', 'End Day', 'End Month', 'End Date',
         'User Type', 'Weekday/Weekend', 'Peak Hour']

# The raw 'Trip Duration' column is not kept in the final frame. Select only the
# kept columns rather than selecting everything in `order` and dropping it
# afterwards: the select-then-drop form raised KeyError when the cell was run a
# second time, because 'Trip Duration' was already gone from df.
df = df[[name for name in order if name != "Trip Duration"]]
df.sample(10)

Unnamed: 0_level_0,Trip Duration (mm:ss),Start Station Id,Start Station Name,Start Time,Start Day,Start Month,Start Date,End Station Id,End Station Name,End Time,End Day,End Month,End Date,User Type,Weekday/Weekend,Peak Hour
Trip Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
22497524,6:14,7543,Nassau St / Bellevue Ave,10:57,23,6,2023-06-23,7207.0,Dundas St W / Crawford St,11:03,23,6,2023-06-23,Casual Member,Weekday,Off Peak
24597048,5:09,7411,Little Norway Park,20:12,7,9,2023-09-07,7175.0,HTO Park (Queens Quay W),20:17,7,9,2023-09-07,Casual Member,Weekday,Off Peak
21471378,44:22,7542,Queen St W / John St,19:00,12,5,2023-05-12,7098.0,Riverdale Park South (Broadview Ave),19:45,12,5,2023-05-12,Casual Member,Weekday,Evening
24027546,32:42,7521,Emerson Ave / Bloor St W,17:21,18,8,2023-08-18,7347.0,Regal Rd / Dufferin St,17:54,18,8,2023-08-18,Casual Member,Weekday,Evening
23678686,19:50,7176,Bathurst St / Fort York Blvd,20:59,5,8,2023-08-05,7193.0,Queen St W / Gladstone Ave,21:18,5,8,2023-08-05,Casual Member,Weekend,Off Peak
23680434,18:59,7701,,22:14,5,8,2023-08-05,7545.0,Baldwin St / Henry St,22:33,5,8,2023-08-05,Casual Member,Weekend,Off Peak
23000842,4:17,7033,Union Station,19:31,12,7,2023-07-12,7076.0,York St / Queens Quay W,19:35,12,7,2023-07-12,Casual Member,Weekday,Evening
24694929,72:52,7285,Spadina Ave / Harbord St - SMART,12:00,11,9,2023-09-11,7285.0,Spadina Ave / Harbord St - SMART,13:13,11,9,2023-09-11,Casual Member,Weekday,Off Peak
24221417,5:35,7385,20 Charles St E,17:33,25,8,2023-08-25,7271.0,Yonge St / Alexander St - SMART,17:39,25,8,2023-08-25,Casual Member,Weekday,Evening
22939193,18:51,7015,King St W / Bay St (West Side),16:41,10,7,2023-07-10,7097.0,Riverdale Park North (Broadview Ave),17:00,10,7,2023-07-10,Casual Member,Weekday,Evening
