# Package Import

In [None]:
from glob import glob
from IPython.display import display, HTML

import numpy as np
import pandas as pd
import geopandas as gpd

import altair as alt
import matplotlib.pyplot as plt
from matplotlib.colors import to_rgba
import plotly.graph_objects as go

# Load Data & Data Cleaning

Since the data files of different months use different column formats, we first rename the columns to a common schema before merging the dataframes.

In [None]:
# Collect the monthly trip files in sorted order. glob() returns paths in an
# arbitrary, filesystem-dependent order, and the seeded .sample() calls further
# down only pick the same rows when the concatenated row order is stable.
dirs = sorted(glob("data/*.csv"))

# Columns that are dropped when present (errors="ignore" skips absent ones).
unused_columns = ["duration_sec", "ride_id", "bike_share_for_all_trip",
                  "bike_id", "rental_access_method"]
# Alternative column spellings mapped onto one common set of names.
column_aliases = {
    "start_at": "start_time", "started_at": "start_time",
    "end_at": "end_time", "ended_at": "end_time",
    "start_lng": "start_station_longitude",
    "start_lat": "start_station_latitude",
    "end_lng": "end_station_longitude",
    "end_lat": "end_station_latitude",
    "member_casual": "user_type",
}

ls = []
for csv_path in dirs:  # named csv_path so the built-in dir() is not shadowed
    df = pd.read_csv(csv_path, low_memory=False)
    df = df.drop(columns=unused_columns, errors="ignore")
    df = df.rename(columns=column_aliases)
    # Files without a bike-type column are labelled as classic bikes.
    if "rideable_type" not in df.columns:
        df["rideable_type"] = "classic_bike"
    # drop()/rename() already returned a fresh frame, so no extra copy is needed.
    ls.append(df)

In [None]:
# Stack the per-file frames into one table with a fresh 0..n-1 row index,
# then show how many trips were loaded
baywheel = pd.concat(ls, ignore_index=True)
baywheel.shape[0]

After some testing and examination of the dataset (code not included in this notebook), we identified the missing values in our dataset. Some `station_id` and `station_name` values are missing because bikes can be rented from or returned to any bike rack within the service area. Some `station_latitude` and `station_longitude` values are missing because those are testing/demo/depot stations. We will remove those rows later.

In [None]:
# Fraction of missing values in each column (mean of the boolean isna() mask)
baywheel.isna().mean()

## Time Data

Calculate the duration of each ride based on `start_time` and `end_time`

In [None]:
# Parse both timestamp columns, then store the ride length in seconds
for time_col in ("start_time", "end_time"):
    baywheel[time_col] = pd.to_datetime(baywheel[time_col])
ride_length = baywheel["end_time"] - baywheel["start_time"]
baywheel["duration"] = ride_length.dt.total_seconds()

## User Type

unify `user_type` categories

In [None]:
# Map the two alternative labels onto "member"/"casual"; every other value
# (including missing ones) is left unchanged
user_type_labels = {"Subscriber": "member", "Customer": "casual"}
baywheel["user_type"] = baywheel["user_type"].replace(user_type_labels)

## Station

1. Drop rows with missing start/end coordinates, since those belong to testing/demo/depot stations and the records don't seem to be generated by users
2. We will use `station_name` instead of `station_id` for the rest of the project, since Lyft changed the ids of the stations in 2020
3. Drop rows starting/ending at testing/demo/depot stations
4. Select trips in San Francisco only (others are filtered out by station coordinates)

In [None]:
# Drop trips whose start or end station has no latitude. dropna() states the
# intent directly; the previous `isna() ^ 1` negated the mask by XOR-ing
# booleans with an integer, which only works through implicit coercion.
baywheel = baywheel.dropna(subset=["end_station_latitude", "start_station_latitude"])

In [None]:
# Station names excluded from the analysis (depot / test / demo stations).
# The short name `s` is kept because the filter in the next cell refers to it.
s = {
    'Minnesota St Depot',
    'SF Depot-2 (Minnesota St Outbound)',
    'SF Test Station',
    'Philly Demo',
    '16th Depot Bike Fleet Station',
    "MTL-ECO5-01",
}

In [None]:
# Keep a trip only when neither endpoint is one of the excluded stations in `s`
starts_at_excluded = baywheel["start_station_name"].isin(s)
ends_at_excluded = baywheel["end_station_name"].isin(s)
baywheel = baywheel[~(starts_at_excluded | ends_at_excluded)]

In [None]:
# Keep trips whose start station longitude is below -122.35 (i.e. west of it)
west_of_cutoff = baywheel["start_station_longitude"].lt(-122.35)
baywheel = baywheel.loc[west_of_cutoff]

## Bike Type

The company renamed "docked bike" to "classic bike" in 2020, so we relabel "docked_bike" as "classic_bike" to keep a single label for this bike type across all recordings

In [None]:
# Relabel "docked_bike" as "classic_bike"; all other values stay as they are
bike_type = baywheel["rideable_type"]
baywheel["rideable_type"] = bike_type.mask(bike_type.eq("docked_bike"), "classic_bike")

## Cleaned Dataset

In [None]:
# Preview the first rows of the cleaned trip table
baywheel.head()

In [None]:
# (rows, columns) of the cleaned trip table
baywheel.shape

In [None]:
# Fraction of missing values per column after the cleaning steps above
baywheel.isna().mean()

# Visualization 1 - Station Usage vs Time vs Location

In [None]:
# Derive calendar fields from each trip's start timestamp
trip_start = baywheel["start_time"].dt
baywheel['year'] = trip_start.year
baywheel['month'] = trip_start.month
# Monthly period (e.g. 2019-01) used for per-month grouping
baywheel['year-month'] = trip_start.to_period('M')

In [None]:
# Number of trips starting at each station. The Series is named "count"
# explicitly so the merged column has a fixed name; the previous version relied
# on a suffix collision that depends on how value_counts() labels its result,
# which differs between pandas versions.
counts = baywheel["start_station_name"].value_counts().rename("count")
# Station location = mean of the coordinates recorded on trips starting there
coor = (
    baywheel.groupby("start_station_name")[
        ["start_station_latitude", "start_station_longitude"]
    ]
    .mean()
    .reset_index()
)
# Attach the trip count to every station (join on the station name)
coor = coor.merge(counts, left_on="start_station_name", right_index=True)

In [None]:
# Number of trips starting from each station in each calendar month
monthly = (
    baywheel.groupby(['start_station_name', 'year-month'])
    .size()
    .reset_index(name="monthly_count")
)
# Attach each station's coordinates (the per-station averages stored in `coor`)
station_location = coor[["start_station_name", "start_station_latitude", "start_station_longitude"]]
monthly = monthly.merge(station_location, how="left", on="start_station_name")
monthly = monthly.rename(columns={
    "start_station_name": "station_name",
    "start_station_latitude": "latitude",
    "start_station_longitude": "longitude",
})
# num_month = whole months elapsed since the earliest month present in the data
monthly["year-month"] = monthly["year-month"].dt.to_timestamp()
st = monthly["year-month"].min()
ed = monthly["year-month"].max()
month_parts = monthly["year-month"].dt
monthly["num_month"] = (month_parts.year - st.year) * 12 + (month_parts.month - st.month)
# Show dataframe
monthly.head()

In [None]:
# Interactive scatter plot of the stations, sized by their monthly trip count.
# The slider range follows the data instead of a hard-coded 0..23, so it stays
# correct when the loaded files cover more or fewer months.
last_month = int(monthly["num_month"].max())  # plain int so it serializes into the chart spec
month_slider = alt.binding_range(min=0, max=last_month, step=1, name="Month Since Operation")
# Single-value selection driven by the slider, starting at month 0
select_date = alt.selection_single(
    fields=['num_month'],
    bind=month_slider,
    init={'num_month': 0},
    name='slider')
# Lift altair's default row limit so the whole table can be embedded
alt.data_transformers.disable_max_rows()
# One circle per station for the month currently chosen on the slider
stations = alt.Chart(monthly).mark_circle().encode(
    longitude = "longitude",
    latitude = "latitude",
    # Fixed size scale so circle sizes are comparable from month to month
    size = alt.Size("monthly_count", scale=alt.Scale(domain=[0,4000], range=[1,500]),title="Monthly Trip Count"),
    # Shown on mouse hover
    tooltip = ["station_name","monthly_count"]
).add_selection(select_date).transform_filter(
    # "slider" is the selection name given above
    "datum.num_month == slider.num_month"
)
# The chart is displayed later, layered on top of the neighborhood map

In [None]:
# Load the neighborhood map from the shapefile directory.
# The path variable is named shp_dir so the built-in dir() is not shadowed.
shp_dir = 'SHP/city-of-san-francisco-california-neighborhoods/'
bay = gpd.read_file(shp_dir)

In [None]:
# Background layer: the San Francisco neighborhood shapes with black outlines
neighborhood_shapes = alt.Chart(bay).mark_geoshape(stroke='black')
bay_map = neighborhood_shapes.encode(
    color=alt.value("lightgrey"),
    tooltip=["NEIGHBORHO"],  # shown on mouse hover
).properties(width=650, height=600)

In [None]:
# Combine the two graph
# bay_map+stations

In [None]:
# Bar chart of total trips per month plus a red vertical rule marking the
# start of COVID
# Total trip count per month, summed over all stations
monthly['year'] = monthly['year-month'].dt.year
monthly['month'] = monthly['year-month'].dt.month
grouped = (
    monthly.groupby(["num_month", "month", "year"], as_index=False)["monthly_count"]
    .sum()
    .rename(columns={"monthly_count": "monthly_total"})
)
# Encodings of the bar chart; the bar color follows the slider selection
bar_encoding = dict(
    y=alt.Y('monthly_total', axis=alt.Axis(title='Total Trip Count')),
    x=alt.X('num_month:N', axis=alt.Axis(title='Month Since Operation')),
    color=alt.condition(select_date, alt.ColorValue("steelblue"), alt.ColorValue("lightgrey")),
    tooltip=["num_month", "monthly_total", "month", "year"],
)
monthly_total = (
    alt.Chart(grouped)
    .mark_bar(size=20)
    .encode(**bar_encoding)
    .add_selection(select_date)
    .properties(width=650, height=100)
)
# One-row helper frame that positions the red rule at num_month == 11
overlay = pd.DataFrame({'num_month': [11]})
vline = (
    alt.Chart(overlay)
    .mark_rule(color='red', strokeWidth=3)
    .encode(x='num_month:N')
)
# The bars and the rule are layered in the combined figure below

In [None]:
# CSS that moves the slider form (vega-bindings) to a fixed spot in the output
slider_css = """
<style>
form.vega-bindings {
  position: absolute;
  right: 0px;
  top: 600px;
}
</style>
"""
display(HTML(slider_css))

# Title block shared by the combined figure
dashboard_title = {
    "text": ["Monthly Station Usage"],
    "subtitle": ["The red line indicates the beginning of COVID-19"],
    "color": "black",
    "subtitleColor": "gray",
    "fontSize": 30,
}
# Map with station circles on top, monthly totals with the red rule below
alt.vconcat(bay_map + stations, monthly_total + vline).properties(title=dashboard_title)

# Visualization 2 - Customer Type & Rideable Type

In [None]:
# Lift altair's default row limit (5000 rows)
alt.data_transformers.disable_max_rows()

# Draw a seeded random sample of up to 50,000 trips for plotting.
# Selecting the four needed columns before sampling avoids duplicating the
# whole trip table, and capping n at the table size keeps the cell from
# raising when fewer than 50,000 rows survive cleaning. The sampled rows are
# the same as before because sampling only depends on the row count and seed.
sample_columns = ['start_time', 'duration', 'user_type', 'rideable_type']
duration = baywheel[sample_columns].sample(
    n=min(50000, len(baywheel)), random_state=42
).copy()

# Reshape the variables into the form the charts use
duration['start_time'] = duration['start_time'].dt.strftime('%Y-%m-%d')  # date as text
duration['duration'] = duration['duration'] / 60  # seconds -> minutes
# Readable label for each user/bike combination
type_labels = {
    'member_with_classic_bike': 'Member + Classic Bike',
    'casual_with_classic_bike': 'Casual + Classic Bike',
    'member_with_electric_bike': 'Member + Electric Bike',
    'casual_with_electric_bike': 'Casual + Electric Bike',
}
duration['type'] = (
    duration['user_type'] + '_with_' + duration['rideable_type']
).replace(type_labels)

In [None]:
# Fixed category order and palette so each user/bike type keeps one color
type_order = ['Member + Electric Bike', 'Member + Classic Bike',
              'Casual + Classic Bike', 'Casual + Electric Bike']
type_palette = ['#e76f51', '#f4a261', '#2a9d8f', '#264653']
scale = alt.Scale(domain=type_order, range=type_palette)
color = alt.Color('type:N', scale=scale, legend=alt.Legend(title='Type'))

# The two interval selections listen to different modifier keys so that
# brushing does not conflict with zooming/panning.
# Alt key: brush
alt_key_events = dict(
    on="[mousedown[event.altKey], mouseup] > mousemove",
    translate="[mousedown[event.altKey], mouseup] > mousemove!",
    zoom="wheel![event.altKey]",
)
# Shift key: zoom and pan
shift_key_events = dict(
    on="[mousedown[event.shiftKey], mouseup] > mousemove",
    translate="[mousedown[event.shiftKey], mouseup] > mousemove!",
    zoom="wheel![event.shiftKey]",
)

# Horizontal brush
brush = alt.selection(type="interval", encodings=["x"], **alt_key_events)
# Interval selection bound to the chart scales
interaction = alt.selection(type="interval", bind="scales", **shift_key_events)
# Multi-selection keyed on the color encoding
click = alt.selection_multi(encodings=['color'])

In [None]:
# Data source for the charts of this visualization: the sampled `duration` table
source = duration

# Chart 1: number of sampled trips per day, drawn as a line
trend = alt.Chart(source
# mark type: line
).mark_line(
# visual encodings
).encode(
    # x: start_time truncated to year-month-date
    alt.X('yearmonthdate(start_time):T',
          axis=alt.Axis(title='Date')),
    # y: number of records on that date
    alt.Y('count()', 
          axis=alt.Axis(title='Number of Sharing Records')),
    # line thickness also follows the record count
    size='count()',
    # color by user/bike type where the `click` selection applies, light gray otherwise
    color=alt.condition(click, color, alt.value('lightgray')),
# register the two interval selections on this chart
).add_selection(
    brush,
    interaction,
# keep only the rows matching the `click` selection
).transform_filter(
    click
# chart size and title (the backslashes continue a string literal onto the
# next source line, so those lines must stay unindented)
).properties(
    width=600,
    height=200,
    title={
      "text": ["Number of Sharing Records and Duration by \
Date and User/Bike Types"], 
      "subtitle": ["Note: Electric Bike is a new type of \
bike started by Lyft on Apr 01, 2020,",
                   "therefore, no data about electric bike \
is available before that time"],
      "color": "black",
      "subtitleColor": "gray"
    }
)

In [None]:
# Chart 2: histogram of trip duration, colored by user/bike type
# Bins limited to the 0-60 minute range, at most 60 bins
duration_bins = alt.Bin(maxbins=60, extent=[0, 60])
distribution = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        # x: binned duration
        alt.X("duration:Q", bin=duration_bins,
              axis=alt.Axis(title='Sharing Duration (in Minutes)')),
        # y: number of records per bin
        alt.Y('count():Q', axis=alt.Axis(title='Number of Sharing Records')),
        # color by type where the `click` selection applies, light gray otherwise
        color=alt.condition(click, color, alt.value('lightgray')),
    )
    # this chart hosts the `click` selection
    .add_selection(click)
    # and shows only the rows inside the `brush` selection
    .transform_filter(brush)
    .properties(width=600, height=200)
)

In [None]:
# Chart 3: median trip duration per user/bike type, as horizontal bars
median = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        # x: median of the duration column
        alt.X('median(duration)',
              axis=alt.Axis(title='Median Sharing Duration (in Minutes)')),
        # y: one bar per user/bike type
        alt.Y('type', axis=alt.Axis(title='Category')),
        # color by type where the `click` selection applies, light gray otherwise
        color=alt.condition(click, color, alt.value('lightgray')),
    )
    # this chart also hosts the `click` selection
    .add_selection(click)
    # and shows only the rows inside the `brush` selection
    .transform_filter(brush)
    .properties(width=600, height=100)
)

In [None]:
# Stack the three linked charts vertically with a spacing of 5 and display them
stacked_charts = alt.vconcat(trend, distribution, median)
stacked_charts.properties(spacing=5)

# Visualization 3 - Bike Route & Trip Features

In [None]:
# Lift altair's default row limit (5000 rows)
alt.data_transformers.disable_max_rows()


def _hour_of_day(timestamps):
    """Return the time of day of each timestamp as fractional hours.

    Computed as hour + minute/60 + second/3600 from the ``.dt`` accessor of a
    datetime Series.
    """
    return timestamps.dt.hour + timestamps.dt.minute / 60 + timestamps.dt.second / 3600


# Keep the columns needed for this visualization and drop incomplete rows
trip_columns = ['start_time', 'end_time', 'user_type', 'rideable_type',
                'start_station_name', 'end_station_name']
sharing_time = baywheel[trip_columns].copy().dropna()

# Replace both timestamps by their time of day in fractional hours
for time_col in ('start_time', 'end_time'):
    sharing_time[time_col] = _hour_of_day(sharing_time[time_col])

# Readable label for each user/bike combination
type_labels = {
    'member_with_classic_bike': 'Member + Classic Bike',
    'casual_with_classic_bike': 'Casual + Classic Bike',
    'member_with_electric_bike': 'Member + Electric Bike',
    'casual_with_electric_bike': 'Casual + Electric Bike',
}
sharing_time['type'] = (
    sharing_time['user_type'] + '_with_' + sharing_time['rideable_type']
).replace(type_labels)

# Keep trips starting at one of the 20 most frequent start stations, then,
# among those, trips ending at one of the 20 most frequent end stations
top_start_stations = sharing_time['start_station_name'].value_counts().head(20).index
sharing_time = sharing_time[sharing_time['start_station_name'].isin(top_start_stations)]
top_end_stations = sharing_time['end_station_name'].value_counts().head(20).index
sharing_time = sharing_time[sharing_time['end_station_name'].isin(top_end_stations)]

# Seeded random sample of up to 50,000 trips for plotting. The two station
# filters can leave fewer rows than that, so n is capped at the table size
# instead of letting sample() raise.
sharing_time = sharing_time.sample(n=min(50000, len(sharing_time)), random_state=42)

In [None]:
# Selections for the route/time visualization. These reuse (and overwrite) the
# names click, brush, interaction and scale.
# Multi-selection on heat map cells (x and y encodings)
select = alt.selection_multi(encodings=['x', 'y'])
# Multi-selection keyed on the color encoding
click = alt.selection_multi(encodings=['color'])

# Modifier keys keep brushing apart from zooming/panning.
# Alt key: brush
alt_key_events = dict(
    on="[mousedown[event.altKey], mouseup] > mousemove",
    translate="[mousedown[event.altKey], mouseup] > mousemove!",
    zoom="wheel![event.altKey]",
)
# Shift key: zoom and pan
shift_key_events = dict(
    on="[mousedown[event.shiftKey], mouseup] > mousemove",
    translate="[mousedown[event.shiftKey], mouseup] > mousemove!",
    zoom="wheel![event.shiftKey]",
)

# Two-dimensional brush
brush = alt.selection(type="interval", encodings=['x', 'y'], **alt_key_events)
# Interval selection bound to the chart scales
interaction = alt.selection(type="interval", bind="scales", **shift_key_events)

# Fixed category order and palette for the user/bike types
type_order = ['Member + Electric Bike', 'Member + Classic Bike',
              'Casual + Classic Bike', 'Casual + Electric Bike']
type_palette = ['#e76f51', '#f4a261', '#2a9d8f', '#264653']
scale = alt.Scale(domain=type_order, range=type_palette)

In [None]:
# Data source for the charts of this visualization: the sampled `sharing_time` table
source = sharing_time

# Part 1: heat map of trip counts for each (start station, end station) pair
start_axis = alt.Axis(title='Start Station Name', labelLimit=300, titlePadding=100)
end_axis = alt.Axis(title='End Station Name', labelLimit=300, titlePadding=40)
# Continuous color for the number of trips in a cell
route_count_color = alt.Color(
    'count()',
    scale=alt.Scale(scheme='lightorange'),
    legend=alt.Legend(orient='left', title='Count'),
)
heat_title = {
    "text": ["Most Frequent Bike Route"],
    "subtitle": ["Note: if the color between a start station ",
                 "and an end station is white, it means that no",
                 "route is recorded between these two locations"],
    "color": "black",
    "subtitleColor": "gray",
}
heat = (
    alt.Chart(source)
    .mark_rect()
    .encode(
        alt.X('start_station_name', axis=start_axis),
        alt.Y('end_station_name', axis=end_axis),
        # count color where the `select` selection applies, light gray otherwise
        color=alt.condition(select, route_count_color, alt.value('lightgray')),
    )
    # this chart hosts the cell selection
    .add_selection(select)
    # and is filtered by the `click` and `brush` selections
    .transform_filter(click)
    .transform_filter(brush)
    .properties(height=250, width=250, title=heat_title)
)

In [None]:
# Part 2: scatter plot of start time vs end time (fractional hours of the day)
# Both axes span 0-24: the times are hour + minute/60 + second/3600, so values
# between 23 and 24 occur and would fall outside a 0-23 domain.
hour_ticks = list(range(0, 25))
scatter = alt.Chart(source
).mark_circle(size=30
).encode(
    # x: start time of the trip
    alt.X('start_time',
          scale=alt.Scale(domain=[0, 24]),
          axis=alt.Axis(
              title='Start Sharing Time (in Hours of the Day)',
              values=hour_ticks)),
    # y: end time of the trip
    alt.Y('end_time',
          scale=alt.Scale(domain=[0, 24]),
          axis=alt.Axis(
              title='End Sharing Time (in Hours of the Day)',
              values=hour_ticks)),
    # color by user/bike type where the `click` selection applies, light gray otherwise
    color=alt.condition(click,
                      alt.Color('type:N',
                                scale=scale,
                                legend=alt.Legend(orient='bottom',
                                                  title='Type',
                                                  direction='vertical')),
                      alt.value('lightgray'))
# this chart hosts the color click, the brush and the scale-bound selection
).add_selection(
    click,
    brush,
    interaction
# show only trips on the routes picked with the `select` selection
).transform_filter(
    select
).properties(
    height = 250,
    width = 250,
    title={
      "text": ["Sharing Start & End Time by Types of Users"],
      # An end time *below* the start time is the case that indicates a return
      # on a later day; the earlier wording ("greater") described the normal case.
      "subtitle": ["Note: if the end sharing time is less than",
                   "the start sharing time, it means that the person",
                   "returned the bike on the second day of sharing"],
      "color": "black",
      "subtitleColor": "gray"
    }
)

In [None]:
# Place the heat map and the scatter plot side by side with a spacing of 5
side_by_side = alt.hconcat(heat, scatter)
side_by_side.properties(spacing=5)

# Visualization 4 - Trip Counts Between Stations

In [None]:
# The five stations with the highest trip count in `coor`
station_counts = coor.loc[:, ["start_station_name", "count"]].copy()
top_five = station_counts.sort_values(by="count", ascending=False).iloc[:5]
top_stations = top_five["start_station_name"].tolist()

In [None]:
# Prepare the link table for the sankey diagram.
# Keep sampled trips that both start and end at one of the top stations
between_top_stations = (
    sharing_time["start_station_name"].isin(top_stations)
    & sharing_time["end_station_name"].isin(top_stations)
)
# Count trips for every (start station, end station) pair
sankey_data = (
    sharing_time[between_top_stations]
    .groupby(["start_station_name", "end_station_name"])
    .size()
    .reset_index(name="counts")
)
# Map station names to node ids. Start nodes take 0..n-1 and end nodes take
# n..2n-1, where n is the number of top stations; using len(top_stations)
# instead of a literal 5 keeps the ids valid if that number ever differs.
to_idx = {station: node_id for node_id, station in enumerate(top_stations)}
sankey_data["start_id"] = sankey_data["start_station_name"].map(to_idx)
sankey_data["end_id"] = sankey_data["end_station_name"].map(to_idx) + len(top_stations)

In [None]:
# Configure colors
# One color per start-node id; each link takes the color of its start node
to_color = {
    0: "skyblue",
    1: "violet",
    2: "Khaki",
    3: "lightgreen",
    4: "Coral"
}
sankey_data['color'] = sankey_data["start_id"].map(to_color)
# Node colors are derived from the same mapping so nodes and links always
# agree (the hand-written list used "red" for the last node while its links
# used "Coral"). The list is doubled: first the start nodes, then the end nodes.
labels_color = [to_color[node_id] for node_id in sorted(to_color)] * 2

In [None]:
# Preview the first rows of the link table used for the sankey diagram
sankey_data.head()

In [None]:
def _css_rgba(color_name, alpha):
    """Return a CSS ``rgba(r,g,b,a)`` string for a matplotlib color name.

    matplotlib's to_rgba() yields channels as floats between 0 and 1, whereas
    CSS expects red/green/blue between 0 and 255; only alpha stays in 0-1.
    Formatting the raw floats would make every color come out almost black.
    """
    red, green, blue, opacity = to_rgba(color_name, alpha=alpha)
    return "rgba({},{},{},{})".format(
        round(red * 255), round(green * 255), round(blue * 255), opacity
    )


# Node labels: the start nodes first, then the end nodes, in top_stations order
labels = ["start - " + s for s in top_stations] + ["end - " + s for s in top_stations]

# Plot sankey diagram
fig = go.Figure(data=[go.Sankey(
    # Config nodes
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0),
      label = labels,
      color = labels_color
    ),
    # Config links
    link = dict(
      source = sankey_data["start_id"],
      target = sankey_data["end_id"],
      value = sankey_data["counts"],
      # Same color as the start node, at 30% opacity
      color = [_css_rgba(color, 0.3) for color in sankey_data["color"]]
  ))])
# Add title
fig.update_layout(title_text="Trip Counts Between the Top 5 Most Popular Stations", font_size=10)
# Plot figure
fig.show()