# Cleaned London Count and Spatial Data - Analysis

## Rationale & Methodology

### Descriptive Analytics
- Import the necessary libraries
- Group / format data as necessary
- Discover which variables impact the total cyclist counts via heatmaps
- Fit to a multiple linear regression model (with caveats, e.g. check for multicollinearity, run the Breusch–Pagan test for heteroscedasticity, etc.)
- For demographic / gender data: utilise t-tests to check whether the difference between the percentages of men and women is statistically significant year on year
- Utilise Tableau and maybe SQL to get top 10 sites etc.
- Utilise faceted plots to find as many sub-trends as possible.

### Diagnostic Analytics
- Match patterns seen in the data with research done by Javier.

### Predictive Analytics
- Fit the data into an MLR model or use ARIMA / time-series decomposition to break down the charts


In [None]:
# Import the necessary libraries and packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

!pip install plotly
import plotly.express as px

In [None]:
# Load the cleaned London count-and-site dataset into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
london_clean = pd.read_csv("london_count_and_site_Saurav_071022.csv")

In [None]:
# Sanity-check the load by previewing the first rows of the cleaned dataset.
london_clean.head()

From the data we can tell the following unique values
- reference: https://www.askpython.com/python/built-in-methods/unique-values-from-a-dataframe

In [None]:
# List every column label in the cleaned dataset.
london_clean.columns.tolist()

In [None]:
# Remove the redundant "Unnamed: 0" column (presumably the index written out by
# an earlier to_csv call - confirm against the cleaning notebook).
# errors="ignore" keeps this cell safe to re-run: the previous pop() raised a
# KeyError once the column had already been removed.
london_clean = london_clean.drop(columns="Unnamed: 0", errors="ignore")

# View the remaining columns.
list(london_clean)

In [None]:
# Print the distinct values of each candidate independent variable, one array
# per column, in the order listed below.
columns_of_interest = [
    "weather",
    "survey_year",
    "period",
    "direction",
    "day_of_week",
    "season",
    "surveydescription",
    "borough",
    "functional_cycling_area",
    "month",
]
for column in columns_of_interest:
    print(pd.unique(london_clean[column]))

In [None]:
# Possible set of independent variables for investigation: 
# Weather, Period, Direction, Day of the week, Season, Functional cycling Area

# Borough Names can be used for labels.

# Utilise Macro-level year-on-year analysis first
# Then drill down month-on-month

In [None]:
# One row per distinct (site_id, location, latitude, longitude) combination,
# used to attach coordinates to aggregated counts.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

# Total cycles per survey year, site and weather condition.
# Only total_cycles is aggregated: summing every column also tried to "sum"
# the text columns, which is wasteful and can fail on pandas >= 2.0.
london_weather = (
    london_clean
    .groupby(["survey_year", "site_id", "weather"])["total_cycles"]
    .sum()
    .reset_index()
)

# Attach location name and coordinates to every aggregated row.
london_weather_YoY_data = pd.merge(london_weather, london_locations, how="left", on="site_id")

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot_table.html
# https://www.geeksforgeeks.org/how-to-fill-nan-values-with-mean-in-pandas/

# Heatmaps of total cyclist count by Weather condition

## Good Weather

In [None]:
# Animated density heatmap (one frame per survey year) of total cyclist counts
# per site for counts taken in good weather.
good_weather = london_weather_YoY_data[london_weather_YoY_data["weather"] == "Good"]

fig = px.density_mapbox(
    good_weather,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "weather": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london in Good Weather",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Damp weather

In [None]:
# Animated density heatmap (one frame per survey year) of total cyclist counts
# per site for counts taken in damp weather.
damp_weather = london_weather_YoY_data[london_weather_YoY_data["weather"] == "Damp"]

fig = px.density_mapbox(
    damp_weather,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "weather": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london in Damp Weather",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Rainy weather

In [None]:
# Animated density heatmap (one frame per survey year) of total cyclist counts
# per site for counts taken in rain.
rainy_weather = london_weather_YoY_data[london_weather_YoY_data["weather"] == "Rain"]

fig = px.density_mapbox(
    rainy_weather,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "weather": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london in Rainy Weather",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Dangerous Weather

In [None]:
# Animated density heatmap (one frame per survey year) of total cyclist counts
# per site for counts taken in dangerous conditions.
dangerous_weather = london_weather_YoY_data[
    london_weather_YoY_data["weather"] == "Dangerous_Conditions"
]

fig = px.density_mapbox(
    dangerous_weather,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "weather": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london in Dangerous Weather",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

# Heatmaps of total cyclist count by time period

## Early Morning (6-7am)

In [None]:
# One row per distinct (site_id, location, latitude, longitude) combination.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

# Total cycles per survey year, site and time period. Only total_cycles is
# aggregated; summing every column also tried to "sum" the text columns.
london_period = (
    london_clean
    .groupby(["survey_year", "site_id", "period"])["total_cycles"]
    .sum()
    .reset_index()
)

london_period_YoY_data = pd.merge(london_period, london_locations, how="left", on="site_id")

# Animated density heatmap (one frame per survey year) for the early-morning period.
early_morning = london_period_YoY_data[
    london_period_YoY_data["period"] == "Early Morning (06:00-07:00)"
]

fig = px.density_mapbox(
    early_morning,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "period": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london in the Early Morning",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Morning Peak Hours

In [None]:
# One row per distinct (site_id, location, latitude, longitude) combination.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

# Total cycles per survey year, site and time period. Only total_cycles is
# aggregated; summing every column also tried to "sum" the text columns.
london_period = (
    london_clean
    .groupby(["survey_year", "site_id", "period"])["total_cycles"]
    .sum()
    .reset_index()
)

london_period_YoY_data = pd.merge(london_period, london_locations, how="left", on="site_id")

# Animated density heatmap (one frame per survey year) for the AM peak period.
am_peak = london_period_YoY_data[london_period_YoY_data["period"] == "AM peak (07:00-10:00)"]

fig = px.density_mapbox(
    am_peak,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "period": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london during Morning Peak hours",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Inter-peak Hours (10am-4pm)

In [None]:
# One row per distinct (site_id, location, latitude, longitude) combination.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

# Total cycles per survey year, site and time period. Only total_cycles is
# aggregated; summing every column also tried to "sum" the text columns.
london_period = (
    london_clean
    .groupby(["survey_year", "site_id", "period"])["total_cycles"]
    .sum()
    .reset_index()
)

london_period_YoY_data = pd.merge(london_period, london_locations, how="left", on="site_id")

# Animated density heatmap (one frame per survey year) for the inter-peak period.
inter_peak = london_period_YoY_data[
    london_period_YoY_data["period"] == "Inter-peak (10:00-16:00)"
]

fig = px.density_mapbox(
    inter_peak,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "period": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london during Inter-Peak hours",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Afternoon Peak hours (4-7pm)

In [None]:
# One row per distinct (site_id, location, latitude, longitude) combination.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

# Total cycles per survey year, site and time period. Only total_cycles is
# aggregated; summing every column also tried to "sum" the text columns.
london_period = (
    london_clean
    .groupby(["survey_year", "site_id", "period"])["total_cycles"]
    .sum()
    .reset_index()
)

london_period_YoY_data = pd.merge(london_period, london_locations, how="left", on="site_id")

# Animated density heatmap (one frame per survey year) for the PM peak period.
pm_peak = london_period_YoY_data[london_period_YoY_data["period"] == "PM peak (16:00-19:00)"]

fig = px.density_mapbox(
    pm_peak,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "period": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london during Afternoon Peak hours",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Evening hours (7pm-10pm)

In [None]:
# One row per distinct (site_id, location, latitude, longitude) combination.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

# Total cycles per survey year, site and time period. Only total_cycles is
# aggregated; summing every column also tried to "sum" the text columns.
london_period = (
    london_clean
    .groupby(["survey_year", "site_id", "period"])["total_cycles"]
    .sum()
    .reset_index()
)

london_period_YoY_data = pd.merge(london_period, london_locations, how="left", on="site_id")

# Animated density heatmap (one frame per survey year) for the evening period.
evening = london_period_YoY_data[london_period_YoY_data["period"] == "Evening (19:00-22:00)"]

fig = px.density_mapbox(
    evening,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "period": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london during Evening hours",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

# Heatmap showing the average number of cyclists throughout the day

In [None]:
# One row per distinct (site_id, location, latitude, longitude) combination.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

# Mean cycles per site, period and time slot. Only total_cycles is averaged:
# taking the mean of every column raises a TypeError on text columns in
# pandas >= 2.0, and those columns were discarded afterwards anyway.
london_period2 = (
    london_clean
    .groupby(["site_id", "period", "time"])["total_cycles"]
    .mean()
    .reset_index()
)
# Sort by time slot so the period animation frames appear in the order the
# slots occur (assumes the "time" values sort chronologically - TODO confirm).
london_period2 = london_period2.sort_values(by="time", ascending=True)
london_period2 = london_period2[["site_id", "period", "total_cycles"]]

london_period2_data = pd.merge(london_period2, london_locations, how="left", on="site_id")

# Animated density heatmap with one frame per time period.
fig = px.density_mapbox(
    london_period2_data,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="period",
    hover_name="location",
    hover_data={
        "site_id": False,
        "period": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Average number of cyclists throughout different times of the day",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

# Heatmap showing the total number of cyclists by day of the week

## Monday

In [None]:
# One row per distinct (site_id, location, latitude, longitude) combination.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

# Total cycles per survey year, site and day of the week. Only total_cycles is
# aggregated; summing every column also tried to "sum" the text columns.
london_day_of_wk = (
    london_clean
    .groupby(["survey_year", "site_id", "day_of_week"])["total_cycles"]
    .sum()
    .reset_index()
)

london_DOW_data = pd.merge(london_day_of_wk, london_locations, how="left", on="site_id")

# Animated density heatmap (one frame per survey year) for Mondays.
monday_counts = london_DOW_data[london_DOW_data["day_of_week"] == "Monday"]

fig = px.density_mapbox(
    monday_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "day_of_week": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london on Monday",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Tuesday

In [None]:
# Animated density heatmap (one frame per survey year) for Tuesdays.
tuesday_counts = london_DOW_data[london_DOW_data["day_of_week"] == "Tuesday"]

fig = px.density_mapbox(
    tuesday_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "day_of_week": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london on Tuesday",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Wednesday

In [None]:
# Animated density heatmap (one frame per survey year) for Wednesdays.
wednesday_counts = london_DOW_data[london_DOW_data["day_of_week"] == "Wednesday"]

fig = px.density_mapbox(
    wednesday_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "day_of_week": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london on Wednesday",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Thursday

In [None]:
# Animated density heatmap (one frame per survey year) for Thursdays.
thursday_counts = london_DOW_data[london_DOW_data["day_of_week"] == "Thursday"]

fig = px.density_mapbox(
    thursday_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "day_of_week": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london on Thursday",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Friday

In [None]:
# Animated density heatmap (one frame per survey year) for Fridays.
friday_counts = london_DOW_data[london_DOW_data["day_of_week"] == "Friday"]

fig = px.density_mapbox(
    friday_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "day_of_week": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london on Friday",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Saturday

In [None]:
# Animated density heatmap (one frame per survey year) for Saturdays.
saturday_counts = london_DOW_data[london_DOW_data["day_of_week"] == "Saturday"]

fig = px.density_mapbox(
    saturday_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "day_of_week": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london on Saturday",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Sunday

In [None]:
# Animated density heatmap (one frame per survey year) for Sundays.
sunday_counts = london_DOW_data[london_DOW_data["day_of_week"] == "Sunday"]

fig = px.density_mapbox(
    sunday_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "day_of_week": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london on Sunday",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

# Heatmaps showing year-on-year total cyclist counts in London by season

## Winter

In [None]:
# One row per distinct (site_id, location, latitude, longitude) combination.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

# Total cycles per survey year, site and season. Only total_cycles is
# aggregated; summing every column also tried to "sum" the text columns.
london_season = (
    london_clean
    .groupby(["survey_year", "site_id", "season"])["total_cycles"]
    .sum()
    .reset_index()
)

london_season_data = pd.merge(london_season, london_locations, how="left", on="site_id")

# Animated density heatmap (one frame per survey year) for winter counts.
winter_counts = london_season_data[london_season_data["season"] == "Winter"]

fig = px.density_mapbox(
    winter_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "season": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london in Winter",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Spring

In [None]:
# Animated density heatmap (one frame per survey year) for spring counts.
spring_counts = london_season_data[london_season_data["season"] == "Spring"]

fig = px.density_mapbox(
    spring_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "season": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london in Spring",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Autumn

In [None]:
# Animated density heatmap (one frame per survey year) for autumn counts.
autumn_counts = london_season_data[london_season_data["season"] == "Autumn"]

fig = px.density_mapbox(
    autumn_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "season": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london in Autumn",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

## Summer

In [None]:
# Animated density heatmap (one frame per survey year) for summer counts.
summer_counts = london_season_data[london_season_data["season"] == "Summer"]

fig = px.density_mapbox(
    summer_counts,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Open on central London at city scale; the earlier centre of (0, 180) at
    # zoom 0 showed the whole world with the London sites squeezed into a dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "season": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in london in Summer",
    height=1000,
    width=1000,
    opacity=1,
    # Token-free base map; the Stamen styles stopped being freely hosted when
    # they moved to Stadia Maps.
    mapbox_style="open-street-map",
)
fig.show()

# Total Cyclist count by Functional Cycling Area

## Inner London

In [None]:
# Lookup of each counting site's name and coordinates.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

# Total cyclist count per survey year, site and functional cycling area.
# Only total_cycles is summed: summing the whole frame would also add up
# coordinates and concatenate text columns.
london_areas = (
    london_clean
    .groupby(["survey_year", "site_id", "functional_cycling_area"], as_index=False)["total_cycles"]
    .sum()
)

# Attach the site name and coordinates needed for mapping.
london_area_data = pd.merge(london_areas, london_locations, how="left", on="site_id")

# Animated density map for Inner London, one frame per survey year.
fig = px.density_mapbox(
    london_area_data[london_area_data["functional_cycling_area"] == "Inner"],
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Centre on London at city-level zoom. The previous centre of (0, 180)
    # with zoom 0 showed the whole world, collapsing every site into one dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "functional_cycling_area": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in Inner london",
    height=1000,
    width=1000,
    opacity=1,
    # "open-street-map" needs no access token; the Stamen tile styles are no
    # longer freely served in current Plotly releases.
    mapbox_style="open-street-map",
)
fig.show()

# Outer London

In [None]:
# Animated density map of total cyclist counts per site in Outer London,
# one animation frame per survey year.
fig = px.density_mapbox(
    london_area_data[london_area_data["functional_cycling_area"] == "Outer"],
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Centre on London at city-level zoom. The previous centre of (0, 180)
    # with zoom 0 showed the whole world, collapsing every site into one dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "functional_cycling_area": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in Outer london",
    height=1000,
    width=1000,
    opacity=1,
    # "open-street-map" needs no access token; the Stamen tile styles are no
    # longer freely served in current Plotly releases.
    mapbox_style="open-street-map",
)
fig.show()

# Central London

In [None]:
# Animated density map of total cyclist counts per site in Central London,
# one animation frame per survey year.
fig = px.density_mapbox(
    london_area_data[london_area_data["functional_cycling_area"] == "Central"],
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Centre on London; Central London is compact, so zoom in a little
    # further. The previous centre of (0, 180) with zoom 0 showed the whole
    # world, collapsing every site into one dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=11,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "functional_cycling_area": True,
        "total_cycles": True,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in Central London",
    height=1000,
    width=1000,
    opacity=1,
    # "open-street-map" needs no access token; the Stamen tile styles are no
    # longer freely served in current Plotly releases.
    mapbox_style="open-street-map",
)
fig.show()

# Facet plots

In [None]:
# Show the column names of the cleaned London dataset.
london_clean.columns.tolist()

In [None]:
# Borough -> functional cycling area lookup. It is de-duplicated so the left
# merge below only attaches the area label; merging the raw, row-level columns
# would repeat every aggregated row once per source row of that borough.
london_clean_cycling_area = london_clean[["functional_cycling_area", "borough"]].drop_duplicates()

# Total cyclist count per survey year, borough and period. Only total_cycles
# is summed rather than every column of the frame.
london_grpby_bor_weather = (
    london_clean
    .groupby(["survey_year", "borough", "period"], as_index=False)["total_cycles"]
    .sum()
)
london_grpby_bor_weather_mrg = pd.merge(
    london_grpby_bor_weather,
    london_clean_cycling_area,
    on="borough",
    how="left",
)

# Point plots of yearly totals: one row per borough, one column per period,
# coloured by functional cycling area.
london_clean_catplot_mar = sns.catplot(
    data=london_grpby_bor_weather_mrg,
    x="survey_year",
    y="total_cycles",
    row="borough",
    col="period",
    hue="functional_cycling_area",
    kind="point",
)

In [None]:
# Borough -> functional cycling area lookup, de-duplicated so the left merge
# below does not multiply the aggregated rows.
london_clean_cycling_area = london_clean[["functional_cycling_area", "borough"]].drop_duplicates()

# Total cyclist count per survey year, borough and period.
london_grpby_bor_period = (
    london_clean
    .groupby(["survey_year", "borough", "period"], as_index=False)["total_cycles"]
    .sum()
)

# Merge and plot the *_period frames built in this cell. The previous version
# subset and plotted the *_weather frames left over from another cell.
london_grpby_bor_period_mrg = pd.merge(
    london_grpby_bor_period,
    london_clean_cycling_area,
    on="borough",
    how="left",
)

london_clean_catplot_period = sns.catplot(
    data=london_grpby_bor_period_mrg,
    x="survey_year",
    y="total_cycles",
    row="borough",
    col="period",
    hue="functional_cycling_area",
    kind="point",
)

In [None]:
# Borough -> functional cycling area lookup, de-duplicated so the left merge
# below does not multiply the aggregated rows.
london_clean_cycling_area = london_clean[["functional_cycling_area", "borough"]].drop_duplicates()

# Total cyclist count per survey year, borough and season.
# (Variable names are kept from the original cell although the grouping is by
# season, not weather.)
london_grpby_bor_weather = (
    london_clean
    .groupby(["survey_year", "borough", "season"], as_index=False)["total_cycles"]
    .sum()
)
london_grpby_bor_weather_mrg = pd.merge(
    london_grpby_bor_weather,
    london_clean_cycling_area,
    on="borough",
    how="left",
)

# Facet on "season", the column actually grouped by. The previous version
# selected and faceted on "period", which is absent after this groupby.
london_clean_catplot_mar = sns.catplot(
    data=london_grpby_bor_weather_mrg,
    x="survey_year",
    y="total_cycles",
    row="borough",
    col="season",
    hue="functional_cycling_area",
    kind="point",
)

In [None]:
# Borough -> functional cycling area lookup, de-duplicated so the left merge
# below does not multiply the aggregated rows.
london_clean_cycling_area = london_clean[["functional_cycling_area", "borough"]].drop_duplicates()

# Total cyclist count per survey year, borough and day of week.
# (Variable names are kept from the original cell although the grouping is by
# day of week, not weather.)
london_grpby_bor_weather = (
    london_clean
    .groupby(["survey_year", "borough", "day_of_week"], as_index=False)["total_cycles"]
    .sum()
)
london_grpby_bor_weather_mrg = pd.merge(
    london_grpby_bor_weather,
    london_clean_cycling_area,
    on="borough",
    how="left",
)

# Facet on "day_of_week", the column actually grouped by. The previous version
# selected and faceted on "period", which is absent after this groupby.
london_clean_catplot_mar = sns.catplot(
    data=london_grpby_bor_weather_mrg,
    x="survey_year",
    y="total_cycles",
    row="borough",
    col="day_of_week",
    hue="functional_cycling_area",
    kind="point",
)

# Observations and Insights: Summary

# Testing for variable influence using Multilinear Regression

# Understanding demographic data

In [None]:
# Preview the first five rows of the cleaned London dataset.
london_clean.head(n=5)

# Male-female gender cyclist split

In [None]:
# Yearly totals of male and female cyclist counts.
london_clean_m_f_grpby = (
    london_clean
    .groupby("survey_year")[["number_of_male_cycles", "number_of_female_cycles"]]
    .sum()
    .reset_index()
)
london_clean_m_f_grpby_subset = london_clean_m_f_grpby.rename(
    columns={
        "number_of_male_cycles": "Male",
        "number_of_female_cycles": "Female",
    }
)

# Long format (one row per year and gender) so seaborn can colour by gender.
london_clean_m_f_grpby_subset = london_clean_m_f_grpby_subset.melt(
    id_vars=["survey_year"],
    var_name="Gender",
    value_name="Count",
)

# Create a 15x15 inch figure. The previous `fig, ax = (15,15)` only bound two
# integers and never set the figure size.
fig, ax = plt.subplots(figsize=(15, 15))
sns.barplot(
    data=london_clean_m_f_grpby_subset,
    x="survey_year",
    y="Count",
    hue="Gender",
    palette="colorblind",
    ax=ax,
)
ax.set_title("Cyclist Distribution by Gender")

Reference (removing rows with null values from a pandas DataFrame): https://stackoverflow.com/questions/44548721/remove-row-with-null-value-from-pandas-data-frame

In [None]:
# Male and female cyclist totals per survey year and site. Only the two gender
# columns are summed rather than every column of the frame.
london_clean_m_f_grpby = (
    london_clean
    .groupby(["survey_year", "site_id"])[["number_of_male_cycles", "number_of_female_cycles"]]
    .sum()
    .reset_index()
)
london_clean_m_f_grpby_subset = london_clean_m_f_grpby.rename(
    columns={
        "number_of_male_cycles": "Male",
        "number_of_female_cycles": "Female",
    }
)

# Long format: one row per year, site and gender.
london_clean_m_f_grpby_subset2 = london_clean_m_f_grpby_subset.melt(
    id_vars=["survey_year", "site_id"],
    var_name="Gender",
    value_name="Count",
)

# Lookup of each counting site's name and coordinates.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

london_clean_m_f_grpby_subset3 = pd.merge(
    london_clean_m_f_grpby_subset2,
    london_locations,
    how="left",
    on="site_id",
)

# Animated density map of female cyclist counts, one frame per survey year.
fig = px.density_mapbox(
    london_clean_m_f_grpby_subset3[london_clean_m_f_grpby_subset3["Gender"] == "Female"],
    lat="latitude",
    lon="longitude",
    z="Count",
    radius=10,
    # Centre on London. The previous centre of (0, 180) with zoom 0 showed
    # the whole world, collapsing every site into one dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=8,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "Count": True,
        "Gender": False,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="YoY variation in total Female Cyclist Counts in London",
    height=500,
    width=500,
    opacity=1,
    # "open-street-map" needs no access token; the Stamen tile styles are no
    # longer freely served in current Plotly releases.
    mapbox_style="open-street-map",
)
fig.show()

In [None]:
# Male and female cyclist totals per survey year and site. Only the two gender
# columns are summed rather than every column of the frame.
london_clean_m_f_grpby = (
    london_clean
    .groupby(["survey_year", "site_id"])[["number_of_male_cycles", "number_of_female_cycles"]]
    .sum()
    .reset_index()
)
london_clean_m_f_grpby_subset = london_clean_m_f_grpby.rename(
    columns={
        "number_of_male_cycles": "Male",
        "number_of_female_cycles": "Female",
    }
)

# Long format: one row per year, site and gender.
london_clean_m_f_grpby_subset2 = london_clean_m_f_grpby_subset.melt(
    id_vars=["survey_year", "site_id"],
    var_name="Gender",
    value_name="Count",
)

# Lookup of each counting site's name and coordinates.
london_locations = london_clean[["site_id", "location", "latitude", "longitude"]]
london_locations = london_locations.drop_duplicates()

london_clean_m_f_grpby_subset3 = pd.merge(
    london_clean_m_f_grpby_subset2,
    london_locations,
    how="left",
    on="site_id",
)

# Animated density map of male cyclist counts, one frame per survey year.
fig = px.density_mapbox(
    london_clean_m_f_grpby_subset3[london_clean_m_f_grpby_subset3["Gender"] == "Male"],
    lat="latitude",
    lon="longitude",
    z="Count",
    radius=10,
    # Centre on London. The previous centre of (0, 180) with zoom 0 showed
    # the whole world, collapsing every site into one dot.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=8,
    animation_frame="survey_year",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "site_id": False,
        "Count": True,
        "Gender": False,
        "location": True,
        "latitude": False,
        "longitude": False,
    },
    title="YoY variation in total Male Cyclist Counts in London",
    height=500,
    width=500,
    opacity=1,
    # "open-street-map" needs no access token; the Stamen tile styles are no
    # longer freely served in current Plotly releases.
    mapbox_style="open-street-map",
)
fig.show()

In [None]:
# Totals of the numeric columns for Outer London per survey year and month.
# numeric_only=True keeps text columns (e.g. location, weather) out of the sum;
# without it recent pandas versions concatenate their strings row by row.
(
    london_clean[london_clean["functional_cycling_area"] == "Outer"]
    .groupby(["survey_year", "month"])
    .sum(numeric_only=True)
)

In [None]:
# Read Inner_London.csv and display it; the result is not assigned to a variable.
pd.read_csv("Inner_London.csv")


## Utilising a t-test to validate significance in gender distribution

# Private vs for-hire bikes

In [None]:
# Yearly totals of private and hire-bike counts.
# The frame used is london_clean: the snake_case column names below match the
# ones used with london_clean elsewhere in this notebook, whereas
# `london_clean2` is not defined in the cells shown here.
london_clean_prv_hire = (
    london_clean
    .groupby("survey_year")[["number_of_private_cycles", "number_of_cycle_hire_bikes"]]
    .sum()
    .reset_index()
)

london_clean_prv_hire = london_clean_prv_hire.rename(
    columns={
        "number_of_private_cycles": "Private",
        "number_of_cycle_hire_bikes": "Hire",
    }
)

# Long format (one row per year and bike type) so seaborn can colour by type.
london_clean_prv_hire = london_clean_prv_hire.melt(
    id_vars=["survey_year"],
    var_name="Bike_type",
    value_name="Count",
)

# The palette name must be a string; the bare name `colorblind` raised NameError.
sns.barplot(
    data=london_clean_prv_hire,
    x="survey_year",
    y="Count",
    hue="Bike_type",
    palette="colorblind",
).set_title("Cyclist Distribution by Bike_type")

# Utilising t-test to validate differences in private vs for-hire bikes

In [None]:
# Total cyclist count per survey year, functional cycling area and season.
# Uses london_clean and its snake_case columns, as the rest of this notebook
# does. The previous version selected no columns (`[[]]`), so the y variable
# was missing from the plotted frame.
# NOTE(review): confirm london_clean is the intended source; `london_clean2`
# is not defined in the cells shown here.
london_clean2_grpby = (
    london_clean
    .groupby(["survey_year", "functional_cycling_area", "season"], as_index=False)["total_cycles"]
    .sum()
)

# One row of panels per functional cycling area, one column per season.
sns.catplot(
    data=london_clean2_grpby,
    x="survey_year",
    y="total_cycles",
    row="functional_cycling_area",
    col="season",
    kind="point",
)

#bicycle_merge2_catplot_mar = sns.catplot(
#                            x="SiteID",
#                            y= "Count",
#                            row= "Year",
#                            col= "Time",
#                            kind="point",
#                            hue="Month",
#                            data=bicycle_merge2[bicycle_merge2["Month"]=="March"]) 


In [None]:
#list(london_clean2)
#london_clean_m_f = london_clean2.pop("Number_of_private_cycles")
#list(london_clean2)
#london_clean2.pop("Number_of_cycle_hire_bikes")
#london_clean2.dropna(how="any",axis=0)
#list(london_clean2)

# Stacked bar chart of male vs female cyclist counts per survey year.
# The colours must be passed as one list: `color="red","blue"` is a syntax
# error (positional argument after a keyword argument).
# NOTE(review): plotting the two gender columns from london_clean is inferred
# from the commented-out code in this cell; confirm this is the intended chart.
(
    london_clean
    .groupby("survey_year")[["number_of_male_cycles", "number_of_female_cycles"]]
    .sum()
    .plot(kind="bar", stacked=True, color=["red", "blue"])
)

#sns.barplot(data=london_clean2,
#           x="Survey_wave_year",
#           y=[["Number_of_male_cycles","Number_of_female_cycles"])



In [None]:
# Density map of rows where Weather == "Good", animated by Survey_wave_year.
# NOTE(review): this cell looks like an unfinished adaptation of a template
# from another dataset and probably cannot run as written:
#   - `london_clean2` is not defined in the cells shown in this notebook.
#   - z='TotalCycles' differs from the 'Total_cycles' key used in hover_data.
#   - hover_name="Functional Cycling Area" differs from the
#     'Functional_cycling_area' key used in hover_data.
#   - animation_group="Intersection" and the hour-range keys in `labels` /
#     `category_orders` do not correspond to any column referenced elsewhere
#     in this cell; confirm whether they exist in the data.
#   - color_continuous_midpoint is given True rather than a number.
#   - the title refers to Sydney, not London.
#   - center=(0, 180) with zoom=0 shows the whole world rather than London.
fig = px.density_mapbox(london_clean2[london_clean2["Weather"]=="Good"],
                        lat='latitude', 
                        lon='longitude', 
                        z='TotalCycles', 
                        radius=10,
                        center=dict(lat=0, 
                                    lon=180), 
                        animation_frame="Survey_wave_year",
                        zoom=0,
                        hover_name="Functional Cycling Area",
                        hover_data={'Unnamed: 0':False,
                                     'Survey_wave_year':True,
                                     'Site_ID':False,
                                     'Location_x':True,
                                     'Survey_date':False,
                                     'Weather':True,
                                     'Time':True,
                                     'Period':True,
                                     'Direction':True,
                                     'Number_of_male_cycles':True,
                                     'Number_of_female_cycles':True,
                                     'Number_of_unknown_cycles':True,
                                     'Total_cycles':True,
                                     'Day_of_week':True,
                                     'season':True,
                                     'Number_of_private_cycles':True,
                                     'Number_of_cycle_hire_bikes':True,
                                     'SurveyDescription':True,
                                     'latitude':False,
                                     'longitude':False,
                                     'Location_y':False,
                                     'Borough':True,
                                     'Functional_cycling_area':True},
                        labels={"0600-0700":"6am-7am",
                               "0700-0800":"7am-8am",
                               "0800-0900":"8am-9am",
                               "1600-1700":"4pm-5pm",
                               "1700-1800":"5pm-6pm",
                               "1800-1900":"6pm-7pm"},
                        color_continuous_midpoint=True,
                        animation_group="Intersection",
                        title="Year-On-Year variation in total Cyclist Counts in Sydney for March",
                        height=1000,
                        width=1000,
                        opacity=1,
                        category_orders={"Year":["First"],
                                        "Month":["Second"],
                                        "TotalCount":["Third"],
                                        "0600-0700":["Fourth"],
                                        "0700-0800":["Fifth"],
                                        "0800-0900":["Sixth"],
                                        "1600-1700":["Seventh"],
                                        "1700-1800":["Eighth"],
                                        "1800-1900":["Ninth"]},
                        mapbox_style="stamen-terrain")
fig.show()

# Business questions
When approaching Thoughtworks, the Mayor of London proposed some initial questions:

- How can we increase the uptake of cycling in London?
- What are the main factors that determine whether people choose to cycle?
- What interventions and changes to the transport network have had the most impact on cycling engagement? 

Other questions to consider:

- What are the demographics of cyclists in these cities? 
- Are there any underrepresented groups that can be engaged with to try to increase the uptake of cycling as a mode of transport?

# New York Data

In [None]:
# Load the New York cycle-count data from CSV into a DataFrame.
ny_counts = pd.read_csv("ny_counts_Saurav_071022.csv")

# Preview the first rows of the New York DataFrame.
ny_counts.head() 

In [None]:
# Remove the leftover index column written by a previous to_csv().
# errors="ignore" makes the cell safe to re-run: pop() raised KeyError on the
# second run because the column was already gone.
ny_counts = ny_counts.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Show the column names of the New York counts.
ny_counts.columns.tolist()

In [None]:
# Load the geospatial data for the bicycle counters in New York from CSV.
NYC_lat_long = pd.read_csv("NYC_Bicycle_Counters_Javier_08102022.csv")

# Display the counter DataFrame.
NYC_lat_long

In [None]:
# Attach the counter metadata to every count row (left join on counter id).
NYC_clean_complete = ny_counts.merge(NYC_lat_long, how="left", on="id")

# Drop the metadata columns that are not needed for the analysis.
for unwanted_column in ("domain", "interval", "timezone", "sens"):
    del NYC_clean_complete[unwanted_column]

# Display the final merged DataFrame.
NYC_clean_complete

In [None]:
# Keep only the counter columns used for joining and mapping, then preview.
location_columns = ["id", "name", "latitude", "longitude", "interval", "counter"]
NYC_lat_long_clean = NYC_lat_long.loc[:, location_columns]
NYC_lat_long_clean.head()

In [None]:
# Total counts per year, month, season and counter. Only "counts" is summed:
# summing the whole frame would also add up or concatenate every other column.
NYC_clean_agg_season = (
    ny_counts
    .groupby(["year", "month", "season", "id"], as_index=False)["counts"]
    .sum()
)

# Attach the counter name and coordinates.
NYC_season_final = pd.merge(NYC_clean_agg_season, NYC_lat_long_clean, on="id", how="left")

# Columns needed for the seasonal facet plot.
NYC_season_final2 = NYC_season_final[["year", "month", "season", "counts"]]

# View the resulting DataFrame.
NYC_season_final2

In [None]:
# Facet plot of the aggregated counts by month: one row of panels per year,
# coloured by season.
# ci=None is the documented way to switch off the confidence-interval bars
# (and the bootstrap behind them); ci=False is not a valid value.
ny_catplot_1 = sns.catplot(
    data=NYC_season_final2,
    x="month",
    y="counts",
    row="year",
    hue="season",
    kind="point",
    ci=None,
)

In [None]:
# Total counts per year, month and time of day.
NYC_clean_agg_time_of_day = (
    ny_counts
    .groupby(["year", "month", "time_of_day"], as_index=False)["counts"]
    .sum()
)

# The previous merge with the counter locations on "id" is removed: "id" is
# not a grouping key here, so after aggregation it was either missing or a
# meaningless sum of ids, and no location column was used afterwards.
NYC_TOD_final2 = NYC_clean_agg_time_of_day[["year", "month", "time_of_day", "counts"]]

# Facet plot by time of day: one row of panels per year, one column per month.
ny_catplot_2 = sns.catplot(
    data=NYC_TOD_final2,
    x="time_of_day",
    y="counts",
    row="year",
    col="month",
    kind="point",
)

In [None]:
# Total counts per year, month and day of week.
NYC_clean_agg_DOW = (
    ny_counts
    .groupby(["year", "month", "day_of_week"], as_index=False)["counts"]
    .sum()
)

# The previous merge with the counter locations on "id" is removed: "id" is
# not a grouping key here, so after aggregation it was either missing or a
# meaningless sum of ids, and no location column was used afterwards.
NYC_DOW_final2 = NYC_clean_agg_DOW[["year", "month", "day_of_week", "counts"]]

# Facet plot by day of week: one row of panels per year, one column per month.
ny_catplot_3 = sns.catplot(
    data=NYC_DOW_final2,
    x="day_of_week",
    y="counts",
    row="year",
    col="month",
    kind="point",
)

In [None]:
# Macro view: total counts across all counters for each month of each year.
# The previous call passed the undefined names `month_year` and `total_counts`
# (NameError); the label is built here from the "year" and "month" columns.
ny_monthly_totals = ny_counts.groupby(["year", "month"], as_index=False)["counts"].sum()
# NOTE(review): groupby sorts by (year, month); this is chronological only if
# "month" is numeric. Confirm, or map month names to numbers first.
ny_monthly_totals["month_year"] = (
    ny_monthly_totals["month"].astype(str) + "-" + ny_monthly_totals["year"].astype(str)
)

# sort=False keeps the row order above instead of sorting the text labels.
sns.lineplot(data=ny_monthly_totals, x="month_year", y="counts", sort=False)

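# Predictive Analytics
# Create a multilinear regression model showing how timing (year, month, time of day, day of week) affects counts.
# Caveat: these time variables are likely to be collinear, which would distort the model coefficients, so check for multicollinearity first.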

In [None]:
# Use t-tests, multilinear regression (MLR) and scenario analysis.

In [None]:
# Row-level NYC counts with counter metadata (`by_counts` was a typo for
# `ny_counts` and raised NameError).
NYC_merged = pd.merge(ny_counts, NYC_lat_long, on="id", how="left")

# Yearly total per counter, joined to the counter's name and coordinates.
# Aggregating before the join keeps the mapped frame small and avoids any
# column-name clashes between the two tables.
NYC_yearly_by_counter = ny_counts.groupby(["year", "id"], as_index=False)["counts"].sum()
NYC_yearly_by_counter = NYC_yearly_by_counter.merge(
    NYC_lat_long[["id", "name", "latitude", "longitude"]],
    on="id",
    how="left",
)

# Animated density map of NYC counts, one frame per year.
# NOTE(review): the previous call plotted `london_clean2` with Sydney-template
# columns and title; plotting the NYC data is inferred from this cell sitting
# in the New York section and building NYC_merged. Confirm the intent.
fig = px.density_mapbox(
    NYC_yearly_by_counter,
    lat="latitude",
    lon="longitude",
    z="counts",
    radius=10,
    # Centre on New York City at city-level zoom.
    center=dict(lat=40.7128, lon=-74.0060),
    zoom=10,
    animation_frame="year",
    hover_name="name",
    hover_data={
        "year": True,
        "id": False,
        "counts": True,
        "latitude": False,
        "longitude": False,
    },
    title="Year-On-Year variation in total Cyclist Counts in New York",
    height=1000,
    width=1000,
    opacity=1,
    # "open-street-map" needs no access token; the Stamen tile styles are no
    # longer freely served in current Plotly releases.
    mapbox_style="open-street-map",
)
fig.show()

# Impact of Covid on cycling

In [None]:
# Load the London counts enriched with Covid data and list the distinct
# values of the stay-at-home requirement column.
london_covid = pd.read_csv("london_plus_covid_vanessa_081022.csv")
stay_home_levels = london_covid["stay_home_requirements"].unique()
print(stay_home_levels)

In [None]:
london_covid = pd.read_csv("london_plus_covid_vanessa_081022.csv")

# Total cyclist count per survey year, Covid status and site.
london_covid_grpby = (
    london_covid
    .groupby(["survey_year", "covid_status", "site_id"], as_index=False)["total_cycles"]
    .sum()
)
# Attach the site name and coordinates (london_locations is built in an
# earlier cell).
london_covid_grpby_mrg = pd.merge(london_covid_grpby, london_locations, on="site_id", how="left")

# Density map of cyclist counts, one animation frame per Covid status.
# Fixes: the frame name was mistyped as `london_covid_grpby_,rg`, and z, hover
# columns and title came from another dataset's template and named columns
# that do not exist in this frame.
fig = px.density_mapbox(
    london_covid_grpby_mrg,
    lat="latitude",
    lon="longitude",
    z="total_cycles",
    radius=10,
    # Centre on London at city-level zoom rather than (0, 180) at world zoom.
    center=dict(lat=51.5074, lon=-0.1278),
    zoom=9,
    animation_frame="covid_status",
    hover_name="location",
    hover_data={
        "survey_year": True,
        "covid_status": True,
        "site_id": False,
        "total_cycles": True,
        "latitude": False,
        "longitude": False,
    },
    title="Total Cyclist Counts in London by Covid status",
    height=1000,
    width=1000,
    opacity=1,
    # "open-street-map" needs no access token; the Stamen tile styles are no
    # longer freely served in current Plotly releases.
    mapbox_style="open-street-map",
)
fig.show()

In [None]:
!pip install gis

In [None]:
!pip install arcgis

In [None]:
from arcgis.gis import GIS

# Connect to ArcGIS without credentials.
my_gis = GIS()
#m = my_gis.map()
#m.add_layer("london_plus_covid_vanessa_081022.csv")

# Map widget centred on London.
m = my_gis.map("London")

# The previous bare `m.add_layer()` raised TypeError because add_layer needs
# the layer/item to draw.
# TODO: add the cyclist-count layer here once it has been published or
# converted to a layer object.

# Display the map.
m

# Making heatmaps with predictive models 

In [None]:
# Load car, traffic-flow and private-vehicle data for London from CSV
# (borough-level, judging by the file names; not yet joined to the cyclist counts).

prvt_cars_london = pd.read_csv("private_cars_london.csv")
traffic_boro_london = pd.read_csv("traffic_flow_borough.csv")
prvt_vehicle_boro_london = pd.read_csv("private_vehicles_by_borough_london.csv")

In [None]:
#prvt_cars_london.melt(id_vars=["Year"],var_name="Borough", value_name="No of cars")
#traffic_boro_london.melt(id_vars="Year",var_name="Borough",value_name="Traffic_count")
# Reshape the private-vehicle table from wide (one column per year) to long
# (one row per borough and year), ordered by borough and then year.
prvt_vehicle_boro_london_unpivot = pd.melt(
    prvt_vehicle_boro_london,
    id_vars="Borough",
    var_name="Year",
    value_name="Private_vehicles",
)
prvt_vehicle_boro_london_unpivot = prvt_vehicle_boro_london_unpivot.sort_values(
    by=["Borough", "Year"],
    ascending=[True, True],
)

# Composite "<Borough>_<Year>" key for joining to other borough-level data.
borough_part = prvt_vehicle_boro_london_unpivot["Borough"]
year_part = prvt_vehicle_boro_london_unpivot["Year"]
prvt_vehicle_boro_london_unpivot["Pri_key"] = borough_part + "_" + year_part
prvt_vehicle_boro_london_unpivot
#prvt_cars_london.T

#london_clean_m_f_grpby_subset2 = london_clean_m_f_grpby_subset.melt(id_vars=["survey_year","site_id"],
#                                  var_name="Gender",
#                                  value_name="Count")

# So gotta join these borough data to cyclist counts in London



In [None]:
# Borough-level working table: location fields plus the cyclist count columns.
count_columns = [
    "number_of_male_cycles",
    "number_of_female_cycles",
    "number_of_private_cycles",
    "number_of_cycle_hire_bikes",
    "total_cycles",
]
london_boro_complete = london_clean[
    ["borough", "latitude", "longitude", "survey_year", "location"]
    + count_columns
    + ["functional_cycling_area"]
].copy()

# Fill missing values with 0 in the count columns only. Filling the whole
# frame also wrote 0 into borough, location and the coordinates, which breaks
# the string key below and places sites at latitude/longitude 0.
london_boro_complete[count_columns] = london_boro_complete[count_columns].fillna(0)

# Composite "<borough>_<year>" key, built the same way as Pri_key on the
# private-vehicle table.
london_boro_complete["PK"] = (
    london_boro_complete["borough"] + "_" + london_boro_complete["survey_year"].astype(str)
)

# Join the private-vehicle counts by borough and year. The previous bare
# `.merge()` raised TypeError because no right-hand frame was given.
# NOTE(review): assumes borough names and year labels are spelled identically
# in both datasets; unmatched rows get NaN in Private_vehicles.
london_boro_complete = london_boro_complete.merge(
    prvt_vehicle_boro_london_unpivot[["Pri_key", "Private_vehicles"]],
    how="left",
    left_on="PK",
    right_on="Pri_key",
)
london_boro_complete
#list(london_clean )
#pd.merge