# 1. Data Cleaning

In [1]:
import pandas as pd

# Quarterly ridership exports: the 2016 quarters are Excel workbooks,
# the 2017 quarters are CSV files.
excel_2016 = [
    "bikeshare-ridership-2016-q3.xlsx",
    "bikeshare-ridership-2016-q4.xlsx",
]
q_df_2016 = list(map(pd.read_excel, excel_2016))

csv_2017 = [
    "Bikeshare Ridership (2017 Q1).csv",
    "Bikeshare Ridership (2017 Q2).csv",
    "Bikeshare Ridership (2017 Q3).csv",
    "Bikeshare Ridership (2017 Q4).csv",
]
q_df_2017 = list(map(pd.read_csv, csv_2017))

# Only the first two 2017 frames (Q1, Q2) have station-id columns removed
# (presumably the later files do not carry them -- TODO confirm).
for quarter_frame in q_df_2017[:2]:
    quarter_frame.drop(columns=['from_station_id', 'to_station_id'], inplace=True)

# Chronological list of the six quarterly frames: 2016 Q3-Q4, then 2017 Q1-Q4.
q_df = q_df_2016 + q_df_2017

# The trip id is dropped from each of the six frames, in place.
for quarter_frame in q_df[:6]:
    quarter_frame.drop(columns=['trip_id'], inplace=True)

KeyboardInterrupt: 

In [None]:
# Get the year, month, and day for every trip
def _start_month(stamp, quarter_idx):
    """Return the month of one trip start time.

    String values are split on '/': frames 1-3 take the second field,
    frames 4-5 take the first field. Non-string values are read through
    their datetime-like attributes; for frame 1 the day attribute is used
    as the month (presumably that file was parsed with day and month
    swapped -- TODO confirm).
    """
    if isinstance(stamp, str):
        if quarter_idx in (1, 2, 3):
            return int(stamp.split('/')[1])
        if quarter_idx in (4, 5):
            return int(stamp.split('/')[0])
    if quarter_idx == 1:
        return int(stamp.day)
    return int(stamp.month)


def _start_day(stamp, quarter_idx):
    """Return the day of month of one trip start time (mirror of `_start_month`)."""
    if isinstance(stamp, str):
        if quarter_idx in (1, 2, 3):
            return int(stamp.split('/')[0])
        if quarter_idx in (4, 5):
            return int(stamp.split('/')[1])
    if quarter_idx == 1:
        return int(stamp.month)
    return int(stamp.day)


def _start_hour(stamp):
    """Return the hour of one trip start time, from text or a datetime-like value."""
    if isinstance(stamp, str):
        # The text after the first space is the time; its first ':' field is the hour.
        return int(stamp.split(" ")[1].split(":")[0])
    return int(stamp.hour)


for quarter_idx in range(6):
    frame = q_df[quarter_idx]
    # The first two frames are the 2016 quarters; the remaining four are 2017.
    frame['year'] = 2016 if quarter_idx < 2 else 2017
    frame['month'] = [_start_month(stamp, quarter_idx)
                      for stamp in frame['trip_start_time']]
    frame['day'] = [_start_day(stamp, quarter_idx)
                    for stamp in frame['trip_start_time']]
    frame['hour'] = [_start_hour(stamp) for stamp in frame['trip_start_time']]

# Rows of the first frame whose computed month is 1 are discarded
# (reason not stated here; presumably mis-dated records -- TODO confirm).
first_frame = q_df[0]
q_df[0] = first_frame[first_frame['month'] != 1]

trips = pd.concat(q_df)

In [None]:
# Scrape the weather data

# One (month, year) pair per monthly page: July-December 2016, then all of 2017.
years = [2016] * 6 + [2017] * 12
months = list(range(7, 13)) + list(range(1, 13))
date_tups = list(zip(months, years))

# The query string is fixed apart from the Month and Year parameters, so it is
# kept as three constant pieces that are joined around those two values.
_url_head = (
    "https://climate.weather.gc.ca/climate_data/daily_data_e.html?"
    "hlyRange=2002-06-04%7C2020-02-04&dlyRange=2002-06-04%7C2020-02-03&mlyRange=2003-07"
    "-01%7C2006-12-01&StationID=31688&Prov=ON&urlExtension=_e.html&searchType=stnProx&"
    "optLimit=yearRange&Month="
)
_url_mid = "&Day=1&StartYear=1840&EndYear=2020&Year="
_url_tail = (
    "&selRowPerPage=25&Line=5&txtRadius=25&optProxType=city&selCity=43%7C39%7C7"
    "9%7C23%7CToronto&selPark=&txtCentralLatDeg=&txtCentralLatMin=0&txtCentralLatSec=0"
    "&txtCentralLongDeg=&txtCentralLongMin=0&txtCentralLongSec=0&txtLatDecDeg=&txtLong"
    "DecDeg=&timeframe=2"
)

weather_urls = [_url_head + str(month) + _url_mid + str(year) + _url_tail
                for month, year in date_tups]

# read_html returns a list of tables per page; only the first one is kept,
# giving one frame per entry of date_tups, in the same order.
weather_dfs = [pd.read_html(page_url)[0] for page_url in weather_urls]

In [None]:
# Remove duration outliers
# Trips that start and end at the same station are dropped first, so the
# quartiles below are computed without them.
different_stations = trips.from_station_name != trips.to_station_name
trips = trips[different_stations]

trip_durations = trips['trip_duration_seconds']
first_quartile = trip_durations.quantile(0.25)
third_quartile = trip_durations.quantile(0.75)
iqr = third_quartile - first_quartile

# Upper fence: third quartile plus 1.5 times the interquartile range.
upper_bound = third_quartile + 1.5 * iqr

# Keep only durations in [47, upper_bound] (both ends inclusive). The lower
# limit of 47 seconds is a fixed constant rather than a computed fence.
trips = trips[trips['trip_duration_seconds'].between(47, upper_bound)]

# Renumber the rows 0..n-1 and discard the column holding the old index.
trips = trips.reset_index().drop(columns=['index'])

In [None]:
# Show the trips DataFrame as this cell's output.
trips

# 2. Trip Duration Visualizations

In [None]:
import plotly.express as px

# Histogram of trip durations in 100 bins; the labels mapping supplies the
# axis titles shown in place of the raw column names.
duration_axis_labels = {'trip_duration_seconds': 'Duration (sec)', 'count': 'Trips'}
fig = px.histogram(
    trips,
    x="trip_duration_seconds",
    nbins=100,
    labels=duration_axis_labels,
).update_layout(title_text='Trip Duration Distribution', title_x=0.5)
fig.show()

# 3. User Type Visualizations

In [None]:
# One violin per user type, with an embedded box plot and no individual points.
user_type_labels = {'user_type': "User Type", 'trip_duration_seconds': "Duration (sec)"}
fig = px.violin(
    trips,
    x='user_type',
    y="trip_duration_seconds",
    box=True,
    points=False,
    labels=user_type_labels,
).update_layout(title_text='Bike Share Trips by User Type', title_x=0.5)
fig.show()

# 4. Trip Time Visualizations

In [None]:
# Trip count and mean duration for each start hour.
# The 'Hour' label is taken from the groupby index rather than a hard-coded
# range(24): a fixed range is matched to the groups purely by position, so it
# raises when an hour has no trips and cannot guarantee label/row agreement.
duration_by_hour = trips.groupby('hour')['trip_duration_seconds']
trips_hour_df = pd.DataFrame({'Trips': duration_by_hour.size(),
                              'Duration (sec)': duration_by_hour.mean()})
trips_hour_df.insert(0, 'Hour', trips_hour_df.index)

fig = px.bar(trips_hour_df, x='Hour', y='Trips', color='Duration (sec)')
fig.update_layout(title_text='Bike Share Trips by Hour', title_x=0.5)
fig.show()

In [None]:
# Trip count and mean duration for each calendar month.
# The 'Month' label comes from the groupby index (the actual month numbers).
# A hard-coded range(12) is assigned by position and labels the groups 0-11,
# which shifts every bar one month early relative to the 1-12 values stored
# in the 'month' column.
duration_by_month = trips.groupby('month')['trip_duration_seconds']
trips_month_df = pd.DataFrame({'Trips': duration_by_month.size(),
                               'Duration (sec)': duration_by_month.mean()})
trips_month_df.insert(0, 'Month', trips_month_df.index)

fig = px.bar(trips_month_df, x='Month', y='Trips', color='Duration (sec)')
fig.update_layout(title_text='Bike Share Trips by Month', title_x=0.5)
fig.show()

# 5. Trip Location Visualizations

In [None]:
# Get the longitude and latitude for each station

# `import urllib` on its own does not load the `request` submodule, so the
# submodule is imported explicitly (this still binds the name `urllib`).
import urllib.request
import json

url = "https://tor.publicbikesystem.net/ube/gbfs/v1/en/station_information"

# The context manager closes the HTTP response once the payload is parsed.
with urllib.request.urlopen(url) as response:
    station_dict = json.load(response)

station_df = pd.DataFrame(station_dict['data']['stations'])

# Map each station name to its (longitude, latitude) pair. If a name occurs
# more than once, the last occurrence wins.
station_coords = {
    name: (lon, lat)
    for name, lon, lat in zip(station_df['name'], station_df['lon'], station_df['lat'])
}

In [None]:
# Store each trip's (start station, end station) names as one tuple per row.
endpoint_columns = trips[['from_station_name', 'to_station_name']]
trips['endpoints'] = list(endpoint_columns.itertuples(index=False, name=None))
trips

In [None]:
import numpy as np

# Plain Python list holding one (start, end) tuple per trip.
loc_pairs_np = trips['endpoints'].tolist()
# Treat each pair as a row and keep one copy of every distinct row.
uniq_pairs = np.unique(loc_pairs_np, axis=0)

In [None]:
import random

# Draw a sample of distinct station pairs to plot.
# A dedicated, seeded generator makes the sample identical on every run without
# touching the global random state, and the sample size is capped at the number
# of available pairs so a small data set does not raise ValueError.
pair_pool = uniq_pairs.tolist()
samp_pairs = random.Random(0).sample(pair_pool, min(200, len(pair_pool)))
# Convert each sampled two-element list into a tuple.
samp_tuples = [tuple(samp_pair) for samp_pair in samp_pairs]

In [None]:
# Keep only the sampled pairs whose two stations both have known coordinates.
# Pairs that fail the check are marked with the sentinel 0 in station_pairs_v.
station_pairs_v = [station_pair if (station_pair[0] in station_coords
                                    and station_pair[1] in station_coords)
                   else 0 for station_pair in samp_tuples]
# Drop the sentinels with an explicit comparison. Filtering through
# (0).__ne__ yields NotImplemented for tuples, and relying on its truth value
# is deprecated since Python 3.9 and an error in 3.14.
station_pairs_v2 = [station_pair for station_pair in station_pairs_v
                    if station_pair != 0]

In [None]:
# Look up the (longitude, latitude) of both ends of every retained pair once,
# then spread the four coordinates into separate columns.
start_coords = [station_coords[start_name] for start_name, _ in station_pairs_v2]
end_coords = [station_coords[end_name] for _, end_name in station_pairs_v2]

s_pairs_df = pd.DataFrame({
    's_start_lon': [lon for lon, _ in start_coords],
    's_end_lon': [lon for lon, _ in end_coords],
    's_start_lat': [lat for _, lat in start_coords],
    's_end_lat': [lat for _, lat in end_coords],
})

In [None]:
import plotly.graph_objects as go

# Draw one thin red line per station pair, from its start to its end point.
fig = go.Figure()
for row_label in range(len(s_pairs_df)):
    route = s_pairs_df.loc[row_label]
    route_line = go.Scattergeo(
        lon=[route['s_start_lon'], route['s_end_lon']],
        lat=[route['s_start_lat'], route['s_end_lat']],
        mode='lines',
        line={'width': 1, 'color': 'red'},
    )
    fig.add_trace(route_line)

In [None]:
fig.show()

In [None]:
from collections import defaultdict

# Count trips per starting station, restricted to stations with known coordinates.
# Start stations without coordinates are marked with the sentinel 0.
stations1 = [start_station if start_station in station_coords
             else 0 for start_station in trips['from_station_name']]
# Drop the sentinels with an explicit comparison. Filtering through
# (0).__ne__ yields NotImplemented for strings, and relying on its truth value
# is deprecated since Python 3.9 and an error in 3.14.
stations2 = [station for station in stations1 if station != 0]
stations_dict = defaultdict(int)

for station in stations2:
    stations_dict[station] += 1

# One row per station: name, coordinates, and number of trips starting there.
trips2 = pd.DataFrame({
    'Station': list(stations_dict),
    'Longitude': [station_coords[station][0] for station in stations_dict],
    'Latitude': [station_coords[station][1] for station in stations_dict],
    'Trips': list(stations_dict.values()),
})

In [None]:
# Density map of trip starts: each station is weighted by its trip count and
# drawn on an OpenStreetMap base layer.
fig = px.density_mapbox(
    trips2,
    hover_name='Station',
    lon='Longitude',
    lat='Latitude',
    z='Trips',
    mapbox_style="open-street-map",
    zoom=11.5,
    radius=16,
).update_layout(title_text='Location of Bike Share Trips', title_x=0.5)
fig.show()

# 6. Trip-Weather Relationship Visualizations

In [None]:
# Position of each trip's month in the list of monthly weather tables:
# July 2016 maps to 0 and every later month adds one.
df_choice = 12 * (trips['year'] - 2016) + (trips['month'] - 7)
# Zero-based day of the month, used as a row position within a monthly table.
day_offsets = trips['day'] - 1
# One (table position, row position) pair per trip.
choice_tup = list(zip(df_choice.tolist(), day_offsets.tolist()))
# New trips column name -> column heading in the scraped weather tables.
features_map = {
    'temperature': 'Mean Temp Definition°C',
    'precipitation': 'Total Precip Definitionmm',
}

In [None]:
import numpy as np

# Attach each weather feature to every trip by looking up the trip's
# (monthly table, day row) position.
for feature, heading in features_map.items():
    # Pull the wanted column out of every monthly table once, instead of
    # materialising a whole table row for every single trip.
    monthly_columns = [weather_df[heading].to_numpy() for weather_df in weather_dfs]
    vals = [monthly_columns[table_pos][row_pos] for table_pos, row_pos in choice_tup]
    # The two 'Legend..' strings are treated as missing values; everything
    # else is converted to float.
    trips[feature] = [np.nan if (val == 'LegendMM' or val == 'LegendTT') else
                      float(val) for val in vals]