## Note: Please ensure you run this workbook from the same folder as where the underlying data files are stored in your computer.

# Preparing the Environment for Exploratory Data Analysis

In [1]:
# Imports of Models, Libraries & Date Parsing Functionality 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm 
import nltk
import os
import string   
import re
import math
import squarify
import plotly.express as px
import requests

# Copy the YAML file and Twitter keys over to this Jupyter Notebook before you start to work.
import yaml
from yaml.loader import SafeLoader
from twitter import *
import tweepy

from pywaffle import Waffle
from datetime import datetime
from statsmodels.formula.api import ols
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import column_or_1d
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import cdist
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup
from textblob import TextBlob
from scipy.stats import norm
from collections import Counter

from scipy.ndimage import gaussian_gradient_magnitude
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import warnings
warnings.filterwarnings('ignore')

# Date parser used when loading the NY counts.
# It turns a 'MM/DD/YYYY HH:MM:SS AM/PM' string into a datetime so the dates can be
# aggregated and indexed. datetime.strptime (imported above) is used directly because
# the pd.datetime alias was deprecated in pandas 1.0 and removed in pandas 2.0.
def d_parser(x):
    """Parse a timestamp string in '%m/%d/%Y %I:%M:%S %p' form into a datetime."""
    return datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p')

In [2]:
# Load the CSV file(s) for count data only.
# The NY 'date' column is converted with a single vectorised pd.to_datetime call and an
# explicit format ('%m/%d/%Y %I:%M:%S %p'). This avoids read_csv's date_parser argument,
# which is deprecated from pandas 2.0 onwards, and avoids calling a Python-level
# strptime callback for each of the ~4.2 million rows.
ny_counts = pd.read_csv('NY_Bicycle_Counts.csv')
ny_counts['date'] = pd.to_datetime(ny_counts['date'], format='%m/%d/%Y %I:%M:%S %p')
inner_london = pd.read_csv('Inner_London.csv')
central_london = pd.read_csv('Central_London.csv')
outer_london = pd.read_csv('Outer_London.csv')
sydney_counts = pd.read_csv('Sydney_count_surveys.csv')

# Data Wrangling

## Explore the data

In [3]:
# View each DataFrame's metadata and shape
ny_counts.info()
inner_london.info()
central_london.info()
outer_london.info()
sydney_counts.info()

# ny_counts is a large but simple DataFrame holding the number of bicycles passing a counter.
# No missing data in ny_counts.
# The non-null counts of the London DataFrames differ between columns,
# which implies missing data.
# This needs to be explored further.
# Central London is another very large DataFrame.
# Large DataFrames should be trimmed of unnecessary data to reduce memory use.
# Sydney observations are aggregated by month and SiteID only.
# Sydney observations cover selected hours rather than the whole day (looks like peak hours only).
# Sydney also has no missing values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4167507 entries, 0 to 4167506
Data columns (total 4 columns):
 #   Column  Dtype         
---  ------  -----         
 0   id      int64         
 1   date    datetime64[ns]
 2   counts  int64         
 3   status  int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 127.2 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615168 entries, 0 to 615167
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Survey wave (year)          523776 non-null  float64
 1   Site ID                     523776 non-null  object 
 2   Location                    523776 non-null  object 
 3   Survey date                 521024 non-null  object 
 4   Weather                     519102 non-null  object 
 5   Time                        523770 non-null  object 
 6   Period                      523770 non-null  object 
 7   Direction              

In [4]:
# Count the missing values per column in the Inner London data
inner_london.isna().sum()

Survey wave (year)            91392
Site ID                       91392
Location                      91392
Survey date                   94144
Weather                       96066
Time                          91398
Period                        91398
Direction                     91392
Start hour                    91398
Start minute                  91398
Number of private cycles      91392
Number of cycle hire bikes    91392
Total cycles                  91392
dtype: int64

In [5]:
# Count the missing values per column in the Central London data
central_london.isna().sum()

Survey wave (calendar quarter)     290203
Equivalent financial quarter       290203
Site ID                            290203
Location                           290203
Survey date                        300359
Weather                            302037
Time                               290203
Period                             290203
Direction                          290203
Start hour                         290203
Start minute                       290203
Number of private cycles           290267
Number of cycle hire bikes         290267
Total cycles                       290203
Unnamed: 14                       1048366
Unnamed: 15                       1048366
Unnamed: 16                       1048366
dtype: int64

In [6]:
# Count the missing values per column in the Outer London data
outer_london.isna().sum()

Survey wave (year)             0
Site ID                        0
Location                       0
Survey date                 1168
Weather                      968
Time                           0
Period                         0
Direction                      0
Start hour                     0
Start minute                   0
Number of male cycles          0
Number of female cycles        0
Number of unknown cycles       0
Total cycles                   0
dtype: int64

In [7]:
# Look at samples of the data
ny_counts.head(5)

# Very sparse but clean data
# id is the site id of the location where the counter is installed
# The first rows are from late June 2022, so the data is very recent.
# NOTE: the frame is not sorted by date at this point, so head() does not show
# the true end of the date range.

Unnamed: 0,id,date,counts,status
0,100009425,2022-06-24 00:00:00,15,0
1,100009425,2022-06-24 00:15:00,12,0
2,100009425,2022-06-24 00:30:00,14,0
3,100009425,2022-06-24 00:45:00,5,0
4,100009425,2022-06-24 01:00:00,10,0


In [8]:
# Exploring why the NY data is so large.
# View the tail to get a feel for how far back the data goes
ny_counts.tail(5)

# The last rows are from mid Dec 2012.
# NOTE: the frame is not sorted by date at this point, so tail() does not show
# the true start of the date range.
# Need to explore the date range covered by the other data sets

Unnamed: 0,id,date,counts,status
4167502,100005020,2012-12-12 02:45:00,3,4
4167503,100005020,2012-12-12 03:00:00,2,4
4167504,100005020,2012-12-12 03:15:00,3,4
4167505,100005020,2012-12-12 03:30:00,1,4
4167506,100005020,2012-12-12 03:45:00,2,4


In [9]:
# Look at samples of the data
inner_london.head(5)

# The survey date is prefixed with a French day abbreviation (e.g. 'mer, 20/05/15').
# This needs to be cleaned before the dates can be parsed.

Unnamed: 0,Survey wave (year),Site ID,Location,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
0,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6.0,0.0,1.0,0.0,1.0
1,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6.0,15.0,2.0,0.0,2.0
2,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6.0,30.0,2.0,0.0,2.0
3,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6.0,45.0,4.0,0.0,4.0
4,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7.0,0.0,4.0,0.0,4.0


In [10]:
# Look at samples of the data
central_london.head(5)

# Data similar in format to inner london but has some extra columns.
# Will need to trim this Df to concatenate
# Explore whether we need the extra columns here and if not will trim

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6.0,0.0,0.0,0.0,0.0,,,
1,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6.0,15.0,15.0,0.0,15.0,,,
2,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6.0,30.0,35.0,0.0,35.0,,,
3,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6.0,45.0,59.0,2.0,61.0,,,
4,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7.0,0.0,73.0,0.0,73.0,,,


In [11]:
# Look at samples of the data
outer_london.head(5)

# Broadly matches the format of Inner London, except that the cycle counts are split into
# male / female / unknown instead of private / cycle hire.

Unnamed: 0,Survey wave (year),Site ID,Location,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of male cycles,Number of female cycles,Number of unknown cycles,Total cycles
0,2015,OUTCY001,High Road Leyton,"ven, 26/06/15",Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6,0,2,1,0,3
1,2015,OUTCY001,High Road Leyton,"ven, 26/06/15",Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6,15,3,0,0,3
2,2015,OUTCY001,High Road Leyton,"ven, 26/06/15",Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6,30,2,0,0,2
3,2015,OUTCY001,High Road Leyton,"ven, 26/06/15",Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6,45,4,0,0,4
4,2015,OUTCY001,High Road Leyton,"ven, 26/06/15",Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7,0,4,1,0,5


In [12]:
# Look at samples of the data
sydney_counts.head(5)

Unnamed: 0,SiteID,Month,Year,TotalCount,ObjectId2,Time_0600,Time_0700,Time_0800,Time_1600,Time_1700,Time_1800
0,51,March,2010,263,1,12,45,56,27,56,67
1,1,October,2015,383,2,37,69,100,47,68,62
2,52,March,2010,136,3,7,18,31,29,30,21
3,53,March,2010,333,4,25,86,93,15,62,52
4,2,October,2015,447,5,32,75,72,56,114,98


## Filter & Clean Data

In [13]:
# Swap every space in the column labels for an underscore so the columns
# can be referenced without quoting issues
for frame in (inner_london, central_london, outer_london, sydney_counts):
    frame.columns = [label.replace(' ', '_') for label in frame.columns]

In [14]:
# Strip the French day name from Survey_date.
# The pattern removes every run of non-digit characters, so the '/' separators go as
# well: 'mer, 20/05/15' becomes the digits-only string '200515'.
for london_df in (inner_london, central_london, outer_london):
    london_df["Survey_date"] = london_df["Survey_date"].str.replace(r'\D+', '', regex=True)

In [15]:
# Remove, in place, any column that contains no values at all
for frame in (ny_counts, inner_london, central_london, outer_london):
    frame.dropna(how='all', axis=1, inplace=True)

In [16]:
# Parse the London survey dates, fill the gaps, sort chronologically and derive
# the day-of-week and month columns.
def _prepare_london_dates(london_df):
    """Return a copy of a London frame with parsed, gap-filled, sorted survey dates.

    Survey_date is expected to hold the digits-only day/month/year strings left by the
    earlier cleaning step (e.g. '200515' for 20 May 2015); missing values are allowed.

    Steps:
      1. Parse with the explicit format '%d%m%y'. Without a format the parser has to
         guess the field order, and strings whose day is 12 or lower can be read
         month-first. A string that does not match the format raises a ValueError
         instead of being guessed.
      2. Forward-fill missing dates *before* sorting, so a missing date takes the value
         of the preceding row in the original file order. After sorting, missing dates
         would no longer sit next to their neighbouring rows.
      3. Sort by date, start hour and start minute.
      4. Add the English day name ('Day_of_week') and the month number ('month').
    """
    survey_date = pd.to_datetime(london_df['Survey_date'], format='%d%m%y').ffill()
    dated = (
        london_df
        .assign(Survey_date=survey_date)
        .sort_values(['Survey_date', 'Start_hour', 'Start_minute'])
    )
    return dated.assign(
        Day_of_week=dated['Survey_date'].dt.day_name(),
        month=dated['Survey_date'].dt.month,
    )


inner_london = _prepare_london_dates(inner_london)
central_london = _prepare_london_dates(central_london)
outer_london = _prepare_london_dates(outer_london)

In [17]:
# Pass Season
# Helper that maps a month number to a user-defined season
def f(x):
    """Return the season name for month number x, or None if x is below 1 or NaN.

    Dec-Feb -> 'Winter', Mar-May -> 'Spring', Jun-Aug -> 'Summer', Sep-Nov -> 'Autumn'.
    """
    if x > 11:
        return 'Winter'
    if x > 8:
        return 'Autumn'
    if x > 5:
        return'Summer'
    if x > 2:
        return 'Spring'
    if x >= 1:
        return 'Winter'
    return None

# Derive the season column from the month column for each London frame
for london_df in (inner_london, central_london, outer_london):
    london_df['season'] = london_df['month'].apply(f)

In [18]:
# Cleanup NY Dates
# View the data sorted in chronological order. The result is only displayed and is not
# assigned back, so ny_counts itself is left unchanged for the moment.
ny_counts.sort_values(by='date')
# Can see the data set runs from 31 Aug 2012 to 04 Jul 2022
# Can extract the hour of the day to look for patterns. Expect peak vs off-peak patterns
# We can also group the data by day of the week to look for patterns within that
# We can also see that it's possible to group the data by month to look for seasonal patterns
# We can also group/subset the data by id, which is another spatial/location identifier

Unnamed: 0,id,date,counts,status
2059019,100047029,2012-08-31 00:00:00,41,4
3205445,100062893,2012-08-31 00:00:00,41,4
4164087,100005020,2012-08-31 00:00:00,41,4
2370586,100051865,2012-08-31 00:00:00,41,4
4165734,100005020,2012-08-31 00:15:00,52,4
...,...,...,...,...
16707,100062893,2022-07-04 23:45:00,34,0
16803,300020241,2022-07-04 23:45:00,16,0
16995,300024007,2022-07-04 23:45:00,2,0
16323,100047029,2022-07-04 23:45:00,34,0


In [19]:
# Pull the year and the hour of day out of the timestamped date column,
# each into a column of the same name
for part in ('year', 'hour'):
    ny_counts[part] = getattr(ny_counts['date'].dt, part)

In [20]:
# Create user defined function for peak and off peak hours to match London Data
def f(x):
    """Return the time-of-day period for an hour of day x (0-23).

    x is the hour in which a count starts, so hour h covers h:00 up to (h+1):00.
    The boundaries follow the London period labels:
        Early Morning 06:00-07:00 -> hour 6
        AM peak       07:00-10:00 -> hours 7-9
        Inter Peak    10:00-16:00 -> hours 10-15
        PM Peak       16:00-19:00 -> hours 16-18
        Evening       19:00 on    -> hours 19-23
        Night                     -> hours 0-5
    The previous version compared with 'x > lower and x <= upper', which shifted every
    period one hour late (hour 6 was 'Night', hour 7 was 'Early Morning', and so on).

    Returns None for values above 23 or NaN.
    """
    if x < 6:
        return 'Night'
    if x < 7:
        return 'Early Morning'
    if x < 10:
        return 'AM peak'
    if x < 16:
        return 'Inter Peak'
    if x < 19:
        return 'PM Peak'
    if x <= 23:
        # The London evening period ends at 22:00; hours 22 and 23 are kept as
        # 'Evening' because they were labelled that way before.
        return 'Evening'
    return None
    
# Label every NY observation with its peak / off-peak period, based on the hour column
ny_counts['time_of_day'] = ny_counts['hour'].map(f)

In [21]:
# Add the English day name as a column, matching the London data
ny_counts['day_of_week'] = ny_counts['date'].dt.day_name()

In [22]:
# Derive the month so that user-defined seasons can be calculated, matching London.
# The month number is extracted from the timestamped date column.
ny_counts['month'] = ny_counts['date'].dt.month

# Helper that maps a month number to a user-defined season
def f(x):
    """Return the season name for month number x, or None if x is below 1 or NaN."""
    # Thresholds are checked from the end of the year backwards; the first one
    # that x exceeds decides the season.
    for lower_bound, season in ((11, 'Winter'), (8, 'Autumn'), (5, 'Summer'), (2, 'Spring')):
        if x > lower_bound:
            return season
    # January and February
    return 'Winter' if x >= 1 else None

# Derive the season column from the month column
ny_counts['season'] = ny_counts['month'].map(f)

In [23]:
# Reorder the columns so they flow more logically within the DataFrame:
# identifier first, then the date and its derived parts, then the measurements.
neworder = ['id', 'date', 'day_of_week', 'month', 'season', 'year', 'hour', 'time_of_day', 'counts', 'status']
ny_counts = ny_counts.reindex(columns=neworder)

In [24]:
# Inspect which status codes occur in ny_counts
print(ny_counts.status.unique())

# Only two status codes occur. Both are treated as within acceptable boundaries here,
# so the status column can be dropped.
# NOTE(review): the meaning of the status codes is not documented in this notebook - confirm.

[0 4]


In [25]:
# Remove the status column, which is no longer needed
ny_counts = ny_counts.drop(columns=['status'])

In [26]:
# Give the Sydney hourly columns peak / off-peak names that line up with the other frames
sydney_counts.rename(
    columns=dict(
        Time_0600="Early_Morning",
        Time_0700="AM_Peak1",
        Time_0800="AM_Peak2",
        Time_1600="PM_Peak1",
        Time_1700="PM_Peak2",
        Time_1800="PM_Peak3",
    ),
    inplace=True,
)

# Collapse the hourly peak columns into one AM and one PM total, as in the other frames
sydney_counts['AM_Peak'] = sydney_counts['AM_Peak1'].add(sydney_counts['AM_Peak2'])
sydney_counts['PM_Peak'] = (
    sydney_counts['PM_Peak1']
    .add(sydney_counts['PM_Peak2'])
    .add(sydney_counts['PM_Peak3'])
)

# Show 5 random rows of the result; the numbered hourly columns can now be consolidated
sydney_counts.sample(5)

Unnamed: 0,SiteID,Month,Year,TotalCount,ObjectId2,Early_Morning,AM_Peak1,AM_Peak2,PM_Peak1,PM_Peak2,PM_Peak3,AM_Peak,PM_Peak
1716,57,October,2016,78,1773,9,6,15,9,19,20,21,48
1885,55,March,2021,247,1963,15,40,44,41,63,44,84,148
837,100,March,2019,378,838,41,55,82,48,74,78,137,200
2040,19,October,2017,303,2118,27,71,93,20,43,49,164,112
1328,65,March,2014,1528,1329,136,318,363,144,288,279,681,711


In [27]:
# Drop rows where TotalCount does not equal the sum of the Early_Morning, AM_Peak and PM_Peak counts.
# Create a helper column holding that sum
sydney_counts['Sum'] = sydney_counts['AM_Peak'] + sydney_counts['PM_Peak'] + sydney_counts['Early_Morning']

# Keep only the rows where the recorded total matches the computed sum
sydney_counts = sydney_counts[sydney_counts.TotalCount == sydney_counts.Sum]

In [28]:
sydney_counts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2186 entries, 0 to 2214
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   SiteID         2186 non-null   int64 
 1   Month          2186 non-null   object
 2   Year           2186 non-null   int64 
 3   TotalCount     2186 non-null   int64 
 4   ObjectId2      2186 non-null   int64 
 5   Early_Morning  2186 non-null   int64 
 6   AM_Peak1       2186 non-null   int64 
 7   AM_Peak2       2186 non-null   int64 
 8   PM_Peak1       2186 non-null   int64 
 9   PM_Peak2       2186 non-null   int64 
 10  PM_Peak3       2186 non-null   int64 
 11  AM_Peak        2186 non-null   int64 
 12  PM_Peak        2186 non-null   int64 
 13  Sum            2186 non-null   int64 
dtypes: int64(13), object(1)
memory usage: 256.2+ KB


In [29]:
# Remove the object id, the hourly peak columns (now merged) and the helper Sum column
sydney_counts = sydney_counts.drop(
    columns=['ObjectId2', 'AM_Peak1', 'AM_Peak2', 'PM_Peak1', 'PM_Peak2', 'PM_Peak3', 'Sum']
)

In [30]:
# Keep only rows whose Total_cycles equals private cycles plus cycle hire bikes.
# Helper column with the recomputed total
inner_london['Sum'] = inner_london['Number_of_private_cycles'].add(
    inner_london['Number_of_cycle_hire_bikes']
)

# Rows where the totals disagree (or are missing) are filtered out
inner_london = inner_london.loc[inner_london['Total_cycles'].eq(inner_london['Sum'])]

In [31]:
# Keep only rows whose Total_cycles equals private cycles plus cycle hire bikes.
# Helper column with the recomputed total
central_london['Sum'] = central_london['Number_of_private_cycles'].add(
    central_london['Number_of_cycle_hire_bikes']
)

# Rows where the totals disagree (or are missing) are filtered out
central_london = central_london.loc[central_london['Total_cycles'].eq(central_london['Sum'])]

In [32]:
# Keep only rows whose Total_cycles equals male + female + unknown cycles.
# Helper column with the recomputed total
outer_london['Sum'] = (
    outer_london['Number_of_male_cycles']
    .add(outer_london['Number_of_female_cycles'])
    .add(outer_london['Number_of_unknown_cycles'])
)

# Rows where the totals disagree are filtered out
outer_london = outer_london.loc[outer_london['Total_cycles'].eq(outer_london['Sum'])]

In [33]:
# View all other dataframes to quickly review their structure before merging
inner_london.sample(5)

# Can consolidate Columns
# Need to remove formatting for survey_wave_(year)

Unnamed: 0,Survey_wave_(year),Site_ID,Location,Survey_date,Weather,Time,Period,Direction,Start_hour,Start_minute,Number_of_private_cycles,Number_of_cycle_hire_bikes,Total_cycles,Day_of_week,month,season,Sum
201273,2017.0,INNCY379,Acacia Road,2017-05-07,Dry,2015 - 2030,Evening (19:00-22:00),Northbound,20.0,15.0,0.0,0.0,0.0,Sunday,5,Spring,0.0
320209,2019.0,INNCY114,Cromwell Road,2019-07-15,Dry,1015 - 1030,Inter-peak (10:00-16:00),Westbound,10.0,15.0,0.0,0.0,0.0,Monday,7,Summer,0.0
151602,2016.0,INNCY588,Tooting Bec Common,2016-04-19,Dry,1830 - 1845,PM peak (16:00-19:00),Northbound,18.0,30.0,2.0,0.0,2.0,Tuesday,4,Spring,2.0
261986,2018.0,INNCY256,Lowndes Street,2018-06-22,Dry,1430 - 1445,Inter-peak (10:00-16:00),Southbound,14.0,30.0,0.0,0.0,0.0,Friday,6,Summer,0.0
218485,2017.0,INNCY513,Tangley Grove,2017-05-17,Dry,1915 - 1930,Evening (19:00-22:00),Southbound,19.0,15.0,0.0,0.0,0.0,Wednesday,5,Spring,0.0


In [34]:
# Turn the float survey year (e.g. 2015.0) into an integer by removing the '.0' text
inner_london['Survey_wave_(year)'] = (
    inner_london['Survey_wave_(year)']
    .astype(str)
    .str.replace('.0', '', regex=False)
    .astype(int)
)

In [35]:
# Remove the helper Sum column and the start hour / minute columns
inner_london = inner_london.drop(columns=['Sum', 'Start_hour', 'Start_minute'])

In [36]:
# View all other dataframes to quickly review their structure before merging
central_london.sample(5)

# Can consolidate/drop some columns
# Survey_wave_(calendar_quarter) carries quarter text (e.g. '2014 Q1 (January-March)')
# and needs reducing to the year to align with the other London datasets

Unnamed: 0,Survey_wave_(calendar_quarter),Equivalent_financial_quarter,Site_ID,Location,Survey_date,Weather,Time,Period,Direction,Start_hour,Start_minute,Number_of_private_cycles,Number_of_cycle_hire_bikes,Total_cycles,Day_of_week,month,season,Sum
23911,2014 Q1 (January-March),2013-14 Q4,CENCY190,Hastings Street,2014-01-29,Wet,1545 - 1600,Inter-peak (10:00-16:00),Westbound,15.0,45.0,1.0,0.0,1.0,Wednesday,1,Winter,1.0
87238,2014 Q4 (October-December),2014-15 Q3,CENCY083,Borough High Street,2014-10-11,Dry,0730 - 0745,AM peak (07:00-10:00),Northbound,7.0,30.0,109.0,2.0,111.0,Saturday,10,Autumn,111.0
476653,2018 Q3 (July-September),2018-19 Q2,CENCY087,Lower Thames Street,2018-07-17,Dry,1715 - 1730,PM peak (16:00-19:00),Eastbound,17.0,15.0,134.0,19.0,153.0,Tuesday,7,Summer,153.0
482198,2018 Q3 (July-September),2018-19 Q2,CENCY130,Gresham Street,2018-09-25,Dry,1130 - 1145,Inter-peak (10:00-16:00),Westbound,11.0,30.0,11.0,2.0,13.0,Tuesday,9,Autumn,13.0
663948,2020 Q3 (July-September),2020-21 Q2,CENCY082,Long Lane,2020-09-15,Dry,0900 - 0915,AM peak (07:00-10:00),Westbound,9.0,0.0,9.0,3.0,12.0,Tuesday,9,Autumn,12.0


In [37]:
# Reduce the survey wave (e.g. '2014 Q1 (January-March)') to the year only, so it aligns
# with the way this information is presented in the other London datasets:
#   1. strip every non-digit character  -> '20141'
#   2. drop the last digit (the quarter) -> '2014'
#   3. convert to a 64-bit integer
central_london['Survey_wave_(calendar_quarter)'] = (
    central_london["Survey_wave_(calendar_quarter)"]
    .str.replace(r'\D+', '', regex=True)
    .astype(str)
    .str[:-1]
    .astype(np.int64)
)

# Use the same column name as the other London datasets
central_london.rename(
    columns={'Survey_wave_(calendar_quarter)': 'Survey_wave_(year)'},
    inplace=True,
)

In [38]:
# Remove the helper Sum column, the start hour / minute columns and the financial quarter
central_london = central_london.drop(
    columns=['Sum', 'Start_hour', 'Start_minute', 'Equivalent_financial_quarter']
)

In [39]:
# View all other dataframes to quickly review their structure before merging
outer_london.sample(5)

# Can consolidate Columns

Unnamed: 0,Survey_wave_(year),Site_ID,Location,Survey_date,Weather,Time,Period,Direction,Start_hour,Start_minute,Number_of_male_cycles,Number_of_female_cycles,Number_of_unknown_cycles,Total_cycles,Day_of_week,month,season,Sum
146491,2017,OUTCY243,Elmfield Avenue,2017-06-20,Dry,2045 - 2100,Evening (19:00-22:00),Northbound,20,45,0,0,0,0,Tuesday,6,Summer,0
73336,2016,OUTCY122,Wickham Road,2016-04-27,Wet,2000 - 2015,Evening (19:00-22:00),Southbound,20,0,0,0,0,0,Wednesday,4,Spring,0
115441,2016,OUTCY451,Thames Path (Lower Ham Road),2016-09-05,Wet,1815 - 1830,PM peak (16:00-19:00),Southbound,18,15,3,0,0,3,Monday,9,Autumn,3
219654,2018,OUTCY364,Gorringe Park Avenue,2018-07-19,Dry,0730 - 0745,AM peak (07:00-10:00),Eastbound,7,30,0,0,0,0,Thursday,7,Summer,0
57413,2015,OUTCY449,Glen Walk,2015-03-06,Dry,0715 - 0730,AM peak (07:00-10:00),Southbound,7,15,6,0,0,6,Friday,3,Spring,6


In [40]:
# Remove the helper Sum column and the start hour / minute columns
outer_london = outer_london.drop(columns=['Sum', 'Start_hour', 'Start_minute'])

In [41]:
# Print the number of distinct values held in every column
for col, n_unique in inner_london.nunique().items():
    print(col, ": ", n_unique)

# There are more site ids than locations.
# This may imply multiple sites in the same location. Does this double count? Need to check!
# The survey waves span 7 years.
# There are 5 periods of the day; these should be synchronised with all the other city count data,
# using the London period-of-day definition as the base.
# The 165 weather descriptions need to be consolidated into a more manageable set.

Survey_wave_(year) :  7
Site_ID :  597
Location :  584
Survey_date :  463
Weather :  165
Time :  64
Period :  5
Direction :  4
Number_of_private_cycles :  221
Number_of_cycle_hire_bikes :  28
Total_cycles :  233
Day_of_week :  7
month :  12
season :  4


In [42]:
# Explore the different types of Weather
print(inner_london.Weather.unique())

# Lots of overlaps for e.g. Rain & Wet, Dry/cold and dry Cold.
# Need to classify into much narrower streams. 

['Dry' 'Windy/rain' nan 'Wet' 'Rain' 'Drizzle' 'Sunny' 'Overcast' 'Cloudy'
 'Fine' 'Cloudy/sunny' 'Dry Wet Road' 'Cloudy/rain' 'Cloudy/dry'
 'Dry & Windy' 'Mizzle' 'High Wind' 'Dry/sunny' 'Dry/sun' 'S.wet' 'S/w'
 'Sun' 'Wet/damp' 'Shower' 'Druy' 'Mix Wet/dry' 'Wet/dry' 'Very Windy'
 'Dry                         9' 'Dry/hot' 'Dark/cloudy' 'Dry/overcast'
 'Warm + Dry' 'Dry Warm' 'Light Showers' 'Showers' 'Light Rain' 'Spitting'
 'Wet (shower)' 'Down Pour' 'Heavy Rain' 'Shower/dry' 'Hail Stone' 'Sleet'
 'Snow' 'Damp' 'Thunder' 'Fair' 'Rain/sleet' 'Too Cold' 'Dry Cold' 'Hot'
 'Dull' 'Sun/cloudy' 'Wet/mix' 'Heavy Thunder' 'Drizzle/cloudy' 'Dry/wet'
 'Overcast/rain Heavy Showers' 'Overcast/dry' 'Bright/dry' 'Cloud'
 'Dull/damp' 'Dry/drizzle' 'Dry-wet' 'Dry Sunny' 'Rain Shower' 'Dry/cold'
 'Hail' 'Wet Road' 'Drizzle/dry' 'Drizzle/rain' 'Intermittent Showers'
 'Dry/v. Windy' 'Dry Windy' 'Windy' 'N/a' 'V Light Drizzle' 'D' 'W'
 'Drizzle/wet' 'Rainy' 'Warm/dry' 'Wet/windy' 'Heavy Rain High Winds

In [43]:
# Consolidate descriptions in Weather
# Rain
# Every label in the list below is replaced with the single category 'Rain'.
# Fix: a comma was missing after 'Generally overcast brief shower', which fused it with
# 'Light Rain' into one string, so 'Generally overcast brief shower' was never replaced.
# NOTE(review): several labels here ('Drizzle', 'Damp', 'Light Rain', 'Wet/dry',
# 'Heavy Rain', 'Wet/windy') also appear in the later Damp / Dangerous Conditions lists
# of this cell. Because this replacement runs first, they end up as 'Rain' - confirm
# that this is the intended category.
inner_london['Weather'] = inner_london['Weather'].replace([
    'Wet','Cloudy/rain','Rain','Mix Wet/dry','Drizzle',
    'Light Showers', 'Mizzle','Windy/rain','Showers',
    'Wet/dry','Wet/damp','Shower','Drizzle/shower','Rainy',
    'wet','Cloudy with showers','Generally overcast brief shower',
    'Light Rain','Shower/dry','Spitting','Drizzle/cloudy',
    'Dry/wet','Damp', 'Dry/drizzle','Dull/damp','Dry-wet',
    'Wet/mix', 'Drizzle/wet','Wet/windy','Rain Shower',
    'Intermittent Showers','Cloudy/drizzle','Rain/drizzle',
    'Wet Road','Drizzle/dry','Drizzle/rain','Mixed Sunny + Rain',
    'Wet/rain', 'V Light Drizzle', 'Rainy', 'W','Slight Drizzle',
    'Rain Stopped', 'Stopped Raining','Wet Rain Stopped','Raining/wet',
    'Showery','Overcast/rain','Rain/wet','Rain/showers','Showers/sunny',
    'Drizzle/showers','Wet/stop Raining','Drizzle Rain','Drizzle Wet',
    'Damp/sun','Raining','Dry + Wet','Showers/cloudy','Cloudy/showers',
    'Getting Wet','Wet Road:sun','Dry But Wet Road','Drizze',
    'wet','Wettish','Light Rain','S.wet','S/w',
    'Heavy Rain','Heavy Shower','Heavy Shr','Down Pour',
    'Deluge','Heavy Showers', 'Shower','Rain Heavy Showers',
    'Intermitent Showers','Thunder Lightening Rain!','Very Wet',
    'V.wet','Heavy Downpour/rain','Showery','Wet Heavy Rain',
    'Wet (heavy Rain)','Wet (shower)'],'Rain')

# Good
# Dry, sunny, mild, cloudy and overcast descriptions are all mapped to the single
# category 'Good'. Some labels are listed more than once; the repeats are harmless.
inner_london['Weather'] = inner_london['Weather'].replace(
    to_replace=[
        'Sunny', 'Cloudy Sunny', 'Sun Setting', 'Good', 'Dry/sunny',
        'Fine + Dry', 'Fine + Hot', 'Bright', 'Dry Hot!!',
        'Dry & Sunny', 'Dry & Sun', 'Fine & Dry', 'Good/dry', 'Sun',
        'Sunny Dry', 'Clear and Bright', 'Fine', 'Dry/good',
        'Fine/dry', 'Warm + Dry', 'Dry', 'Dry                         9',
        'Sunny', 'Cloudy/sunny', 'Druy', 'Dry/hot', 'Dry Warm',
        'Dry/sun', 'Dryish', 'Clear And Dry', 'Clear and Dry', 'Dry, Warm',
        'Dry, Sunny, Warm', 'Cloudy with Clear Intervals', 'Clear and Warm',
        'Dry But Misty', 'Sunny & Warm All Day', 'Clear', 'Dry + Sunny',
        'Sunny/dry', 'Dr Ry', 'Dry Y', 'D', 'Warm/dry', 'Bright/dry', 'Dry Sunny',
        'Fair', 'Dry/sun', 'Cloudy', 'Sunny Overcast Sunny', 'Sunny/cloudy', 'Cloudy/rain/sunny',
        'Cloudy + Sunny', 'Sunny + Cloudy', 'Cloudy/sunny',
        'Bright + Cloudy', 'Cloudy/dry', 'Partly Sunny', 'Dull', 'Dry & Mild',
        'Cloud', 'Overcast', 'Mild', 'Overcast (No Rain)',
        'Cloudy bright intervals', 'Generally overcast',
        'Cloudy with clear spells', 'Sunny Overcast', 'Dry',
        'Dry/mild', 'Clear', 'Cloudy and Dry', 'Partly cloudy but dry',
        'Partly cloudy and dry', 'Cloudy but dry', 'Partly cloudy and Dry',
        'Sun/Cloudy', 'Clouds & Sunny', 'Sun/clouds', 'Cloudy & Sunny',
        'Sun & Clouds', 'Cloudy Dry', 'Cloud/sun', 'Mixed', 'Sun/cloud',
        'Sunny/cloudy', 'Cloudy Sun', 'Cloudy/sun', 'Dry/cloudy',
        'Sun/cloudy', 'Overcast/dry', 'Cloud', 'Dull', 'Dry/overcast',
        'Dark/cloudy', 'Cloudy/dry', 'Cloudy',
    ],
    value='Good',
)



# Collapse the remaining free-text Weather labels for inner London into the
# categories 'Damp', 'Dangerous_Conditions' and 'Unknown'.
# The steps run in the order listed in the chain at the bottom; a label that an
# earlier step has already relabelled can no longer be matched by a later one.

# Labels treated as 'Damp'
inner_damp_labels = [
    'Wet/dry','Intermittent Light Drizzle','Light Rain',
    'Lt Rain','Drizzle','Intermittent Drizzle', 'Damp','Getting Dry',
    'Dry & Wet','Slight Drizzle/dry','Dry Road Still Wet',
]

# Labels treated as 'Dangerous_Conditions'
inner_dangerous_labels = [
    'Heavy Rain','Dry/wet Road','Dry With Wet Road',
    'Hot','Snow!','Snow', 'Sleet','Very Hot',
    'Dry (road Wet)','Dry, Sunny, Hot','Very Heavy Rain',
    'Intermittent Heavy Showers','Very Hot/dry','Hot/dry',
    'Storm','Heavy Rain High Winds','V Wet','Rain Heavy',
    'Sunny (hot!)','Heavy Thunder','Overcast/rain Heavy Showers',
    'Too Cold','High Wind','Very Windy','Wet/windy','Wet/v.windy',
    'Wet Hail','Rain/hail','Foggy Wet',
    'Wet Heavy Wind', 'Wet-windy','Hailstones',
    'Short Hail Shower','Rain/sleet','Hail Stone',
    'Hail','Showers/hailstone','Rain/hailstone','Dry Chill','Dry/cold',
    'Dry Cold','Cold/sunny','Cold/cloudy',
    'Dry Very Windy', 'Dry/windy','Windy','Cold','Cloudy/windy',
    'Windy + Sunny','Sunsetting + Windy','Dark Cloudy',
    'Dry V. Cold!','Very Cool','Dry & Windy',
    'Dry but Cold or Wind','Dry/v. Windy','Dry Windy',
    'Windy At First Then Sunny','Windy Dry',
    'Dry Wet Road','Thunder',
]

# Labels that carry no usable weather information
inner_unknown_labels = ['School Out','N/a','Unknown']

# "Dry Dark" variants are also folded into 'Unknown'
inner_dark_labels = ['Dry Dark','Dry/dark','Dark/dry','Dark Dry']

inner_london['Weather'] = (
    inner_london['Weather']
    .replace(inner_damp_labels, 'Damp')
    .replace(inner_dangerous_labels, 'Dangerous_Conditions')
    .replace(inner_unknown_labels, 'Unknown')
    .fillna('Unknown')  # missing (NaN) weather entries become 'Unknown'
    .replace(inner_dark_labels, 'Unknown')
)

In [44]:
# Print the number of distinct (non-null) values held in each column of central_london
for column in central_london.columns:
  distinct_count = central_london[column].nunique()
  print(column, ": ", distinct_count)

# Observations from the counts printed below:
# - There are more Site_IDs (210) than Locations (205) again, so some locations
#   may host several sites. Need to check that this does not double count.
# - The survey covers 8 survey waves (years).
# - Period has 5 values; these should be synchronised with the other city count
#   data, using the London period-of-day definition as the base.
# - Weather has 283 distinct free-text labels that need to be consolidated.

Survey_wave_(year) :  8
Site_ID :  210
Location :  205
Survey_date :  1596
Weather :  283
Time :  64
Period :  5
Direction :  4
Number_of_private_cycles :  435
Number_of_cycle_hire_bikes :  87
Total_cycles :  479
Day_of_week :  7
month :  12
season :  4


In [45]:
# Consolidate descriptions in Weather
# Rain: every free-text label listed below is relabelled as 'Rain'.
#
# Fix: a comma was missing after 'Generally overcast brief shower', so Python
# joined it with the following 'Light Rain' literal into the single string
# 'Generally overcast brief showerLight Rain' and the intended label was never
# matched. The comma is now present, so both labels are matched.
#
# This replace runs before the other category replaces in this cell. Labels that
# are also listed further down (e.g. 'Wet/dry', 'Drizzle', 'Damp', 'Light Rain',
# 'Heavy Rain', 'Wet/windy') therefore end up as 'Rain'.
# A few labels are repeated in the list; repeats do not change the result.
central_rain_labels = [
    'Wet','Cloudy/rain','Rain','Mix Wet/dry','Drizzle',
    'Light Showers', 'Mizzle','Windy/rain','Showers',
    'Wet/dry','Wet/damp','Shower','Drizzle/shower','Rainy',
    'wet','Cloudy with showers','Generally overcast brief shower',
    'Light Rain','Shower/dry','Spitting','Drizzle/cloudy',
    'Dry/wet','Damp', 'Dry/drizzle','Dull/damp','Dry-wet',
    'Wet/mix', 'Drizzle/wet','Wet/windy','Rain Shower',
    'Intermittent Showers','Cloudy/drizzle','Rain/drizzle',
    'Wet Road','Drizzle/dry','Drizzle/rain','Mixed Sunny + Rain',
    'Wet/rain', 'V Light Drizzle', 'Rainy', 'W','Slight Drizzle',
    'Rain Stopped', 'Stopped Raining','Wet Rain Stopped','Raining/wet',
    'Showery','Overcast/rain','Rain/wet','Rain/showers','Showers/sunny',
    'Drizzle/showers','Wet/stop Raining','Drizzle Rain','Drizzle Wet',
    'Damp/sun','Raining','Dry + Wet','Showers/cloudy','Cloudy/showers',
    'Getting Wet','Wet Road:sun','Dry But Wet Road','Drizze',
    'wet','Wettish','Light Rain','S.wet','S/w','Cold/rain',
    'Slightly Wet','Road Wet','Light Shower','Rain Damp','Wet Damp',
    'Wet - Dry','Dry - Wet','Rain Dry','Dry - Rain','Damp - Rain',
    'Wet/ Dry','S. Wet','Cloudy/ Rain','Windy/ Rain','Wet T',
    'Some Showers','Rains','Sunny/rainy','Wetr','Showers Mix',
    'Rain/dry','Rain/cloudy','Shower/wet','Wetter',
    'Heavy Rain','Heavy Shower','Heavy Shr','Down Pour',
    'Deluge','Heavy Showers', 'Shower','Rain Heavy Showers',
    'Intermitent Showers','Thunder Lightening Rain!','Very Wet',
    'V.wet','Heavy Downpour/rain','Showery','Wet Heavy Rain',
    'Wet (heavy Rain)','Wet (shower)','Blustery','V. Wet',
    'Rain & Thunder','Rain-heavy','H Rain','Wert','(rain After)',
    'Cloud/rain','Really Wet','Periods Of Rain Quite Windy',
    'Steady Rain',
]
central_london['Weather'] = central_london['Weather'].replace(central_rain_labels, 'Rain')

# Good: every free-text label listed below is relabelled as 'Good'.
# The list contains repeated entries (a long run of cloudy/dry labels appears
# twice); repeats do not change the result and are kept unchanged here.
# NOTE(review): 'Dry Dark' is listed here, so the later "Dry Dark" -> 'Unknown'
# replace in this cell can no longer match it. Confirm which category is intended.
# NOTE(review): 'Thunder' and 'Sun/rain' are classed as 'Good' here. Confirm.
central_good_labels = [
    'Sunny','Cloudy Sunny','Sun Setting','Good','Dry/sunny',
    'Fine + Dry', 'Fine + Hot','Bright','Dry Hot!!',
    'Dry & Sunny','Dry & Sun','Fine & Dry','Good/dry','Sun',
    'Sunny Dry','Clear and Bright', 'Fine', 'Dry/good',
    'Fine/dry', 'Warm + Dry','Dry','Dry                         9',
    'Sunny','Cloudy/sunny','Druy','Dry/hot','Dry Warm',
    'Dry/sun','Dryish','Clear And Dry','Clear and Dry','Dry, Warm',
    'Dry, Sunny, Warm','Cloudy with Clear Intervals','Clear and Warm',
    'Dry But Misty','Sunny & Warm All Day','Clear','Dry + Sunny',
    'Sunny/dry','Dr Ry','Dry Y','D','Warm/dry','Bright/dry','Dry Sunny',
    'Fair','Dry/sun','Kdry','Fine Windy',
    'Cloudy','Sunny Overcast Sunny','Sunny/cloudy',
    'Cloudy/rain/sunny','Cloudy + Sunny','Sunny + Cloudy',
    'Cloudy/sunny','Bright + Cloudy','Cloudy/dry',
    'Partly Sunny','Dull','Dry & Mild','Cloud','Overcast',
    'Mild','Overcast (No Rain)','Cloudy bright intervals',
    'Generally overcast','Cloudy with clear spells',
    'Sunny Overcast','Dry','Dry/mild', 'Clear',
    'Cloudy and Dry','Partly cloudy but dry',
    'Partly cloudy and dry','Cloudy but dry','Partly cloudy and Dry',
    'Sun/Cloudy','Clouds & Sunny','Sun/clouds','Cloudy & Sunny',
    'Sun & Clouds','Cloudy Dry','Cloud/sun','Mixed','Sun/cloud',
    'Sunny/cloudy','Cloudy Sun','Cloudy/sun','Dry/cloudy',
    'Sun/cloudy','Overcast/dry','Cloud','Dull','Dry/overcast',
    'Dark/cloudy','Cloudy/dry','Cloudy','Hazy','Partly Cloudy',
    'Drty','Dry (windy)','Fine (windy)','Sunny Cloudy',
    'Dry Dark','Dark','Dry Mon','Dry Wed','Dry Thu','Dry Fri',
    'Sun/rain','Thunder','Cloudy','Sunny Overcast Sunny',
    'Sunny/cloudy','Cloudy/rain/sunny',
    'Cloudy + Sunny','Sunny + Cloudy', 'Cloudy/sunny',
    'Bright + Cloudy','Cloudy/dry','Partly Sunny','Dull','Dry & Mild',
    'Cloud','Overcast','Mild','Overcast (No Rain)',
    'Cloudy bright intervals','Generally overcast',
    'Cloudy with clear spells','Sunny Overcast','Dry',
    'Dry/mild', 'Clear','Cloudy and Dry','Partly cloudy but dry',
    'Partly cloudy and dry','Cloudy but dry','Partly cloudy and Dry',
    'Sun/Cloudy','Clouds & Sunny','Sun/clouds','Cloudy & Sunny',
    'Sun & Clouds','Cloudy Dry','Cloud/sun','Mixed','Sun/cloud',
    'Sunny/cloudy','Cloudy Sun','Cloudy/sun','Dry/cloudy',
    'Sun/cloudy','Overcast/dry','Cloud','Dull','Dry/overcast',
    'Dark/cloudy','Cloudy/dry','Cloudy','Hazy','Partly Cloudy',
    'Drty','Dry (windy)','Fine (windy)','Sunny Cloudy',
    'Dry Dark','Dark','Dry Mon','Dry Wed','Dry Thu','Dry Fri',
    'Sun/rain','Thunder','Ddry','Dy','Dry/sunny/cold','Fine Cold',
    'Cold Dry','Dry & Cold','Dry And Fine','Dry And Sunny',
    'Dry And Warm','Fine And Dry','Warm + Sunny','Warm And Humid',
    'Warm And Windy','Overcast And Dull','Cloudy And Warm',
    'Sunny Periods And Warm','Dry And Windy','Dry And Very Windy',
    'Warm Sunny And Windy','Hot And Humid','Mild And Sunny',
    'Warm And Overcast','Sunny & Windy','Windy/cloudy',
    'Dry/gusty','Coldish','Windy/dry','Dry But A Bit Windy',
    'Sunny Cold','Cold At First Then Warm/sunny',
    'Warm & Sunny Chilly Later','Fine + Dry Chilly At First',
    'Fine & Sunny','dry','A Bit Chilly At First',
    'Warm With A Slight Wind','Cold Then Dry And Windy',
    'Dry And Overcast','Warm + Sunny Cloudy + Windy',
    'Dry 3/4 Dry','Sunny Until Evening But Windy',
    'Winds Rather Chilly','Warm','Sunny But Very Windy',
    'Now Starts To Get Chilly',
]
central_london['Weather'] = central_london['Weather'].replace(central_good_labels, 'Good')


# Damp (light rain / drying conditions): every label listed below is relabelled as 'Damp'.
# NOTE(review): 'Wet/dry', 'Light Rain', 'Drizzle' and 'Damp' are also listed in
# the 'Rain' replace earlier in this cell, so by the time this runs those rows
# have presumably already been relabelled and will not match here.
central_damp_labels = [
    'Wet/dry','Intermittent Light Drizzle','Light Rain',
    'Lt Rain','Drizzle','Intermittent Drizzle', 'Damp','Getting Dry',
    'Dry & Wet','Slight Drizzle/dry','Wet Intermittently',
    'Light Rain','V Light Rain','Dry Wet Road','Dry A.m Wet P.m',
    'Mist','Road Drying Sun Out','Wetish','Light Shrs',
    'Fine Drizzle','V Light Shrs','L/rain','Rain Stopped-dry',
    'V Lt Rain','V.light Rain','Dry (+brief Speels Of Drizzle',
    'Wet (spitting)','Drizzly Rain','Almost Dry','Damp & Drizzly',
    'Dry Road Wet With Leaves','Wet Drizzle','No Rain Wet Roads',
    'Dry But Wet Roads','Very Light Rain','Light Drizzle',
    'Dry/wet Road Surface','V Light Showers','V. Light Rain',
    'Wet/cloudy','Wet/sunny','Dry Road Still Wet',
    '2 Snowflakes Otherwise Dry','Wet-dry','Dry/drizzly',
    'Wet/light Showers','Wet/drizzle','Wet And Windy',
    'Drizzling','Drizzle Damp','Windy Showery','Wet + Dry',
    'V.light Drizzle','Very Light Drizzle','Drying Up','Wet Again',
    'Cold Sunny Rain','Wet First Then Dry','Wetr First Then Dry',
    'Dry With Intermitent Rain','(drizzle)','Damp/misty/wet',
    'Dry But Rain Threatening','Slight Drizzle Till End',
    'Damp/misty','Cold & Dry Early Rain Later',
    'Wet ','Windy/drizzle','Intermitent Light Showers',
    'Intermitent Light Rain','A Few Rain Showers','Drizzly',
    'Rain Looking Likely','A Few Drops Of Rain',
]
central_london['Weather'] = central_london['Weather'].replace(central_damp_labels, 'Damp')

# Dangerous weather: every label listed below is relabelled as 'Dangerous_Conditions'.
# NOTE(review): 'Heavy Rain' and 'Wet/windy' are also listed in the 'Rain'
# replace earlier in this cell, so those rows have presumably already been
# relabelled as 'Rain' and will not match here. Confirm the intended category.
central_dangerous_labels = [
    'Heavy Rain','Dry/wet Road','Dry With Wet Road',
    'Hot','Snow!','Snow', 'Sleet','Very Hot',
    'Dry (road Wet)','Dry, Sunny, Hot','Very Heavy Rain',
    'Intermittent Heavy Showers','Very Hot/dry','Hot/dry',
    'Storm','Heavy Rain High Winds','V Wet','Rain Heavy',
    'Sunny (hot!)','Heavy Thunder','Overcast/rain Heavy Showers',
    'Too Cold','High Wind','Very Windy','Dry & Very Windy',
    'Very Hot Dry','Wet/windy','Wet/v.windy','Wet Hail',
    'Rain/hail','Foggy Wet',
    'Wet Heavy Wind', 'Wet-windy','Hailstones',
    'Short Hail Shower','Rain/sleet','Hail Stone',
    'Hail','Showers/hailstone','Rain/hailstone','Cold/ Rain',
    'Foggy','Wet & Windy','Wet + Windy','Rain/wind',
    'Wet (windy)','Occasional Lt Snow Shrs',
    'Wet And Very Windy','Dry Chill','Dry/cold','Dry Cold',
    'Cold/sunny','Cold/cloudy',
    'Dry Very Windy', 'Dry/windy','Windy','Cold','Cloudy/windy',
    'Windy + Sunny','Sunsetting + Windy','Dark Cloudy',
    'Dry V. Cold!','Very Cool','Dry & Windy',
    'Dry but Cold or Wind','Dry/v. Windy','Dry Windy',
    'Windy At First Then Sunny','Windy Dry','Cold Windy Dry',
    'Cold/dry','Some Heavy Showers','Very Cold/dry',
    'Foggy/v Cold','Hail Shower','Snowing','Wet/ Snowing',
    'Heavy Snow','Dry/very Windy','Very Windy & Cold',
    'Wet Light Hailstone','Heavy Showers Throughout Day',
    'High Winds & Spits Of Rain','Fine V Cold',
    'Dry (frost & Fog)','V Cold Showers','Cold/showery',
    'Light Showers Inc Some Hail','Cloudy/hail','Cold Wind',
    'Hot & Sunny','Hot And Sunny','Dry/windy/strong Wind',
    'Hot + Humid','Very Cold Sunny But Windy',
]
central_london['Weather'] = central_london['Weather'].replace(
    central_dangerous_labels, 'Dangerous_Conditions'
)

# Fold uninformative labels, missing values and "Dry Dark" variants into 'Unknown'.

# Labels that carry no usable weather information
central_unknown_labels = ['School Out','N/a','Unknown','Dark Sunny',
                          'Wed','Warm & Sunny But Windy & Cold']

# "Dry Dark" variants (plus the stray 'X' label)
# NOTE(review): 'Dry Dark' is also listed in the 'Good' replace earlier in this
# cell, so it has presumably already been relabelled before this point.
central_dark_labels = ['Dry Dark','Dry/dark','Dark/dry','Dark Dry', 'X']

central_london['Weather'] = (
    central_london['Weather']
    .replace(central_unknown_labels, 'Unknown')
    .fillna('Unknown')  # missing (NaN) weather entries become 'Unknown'
    .replace(central_dark_labels, 'Unknown')
)

In [46]:
# Print the number of distinct (non-null) values held in each column of outer_london
for column in outer_london.columns:
  distinct_count = outer_london[column].nunique()
  print(column, ": ", distinct_count)

# Observations from the counts printed below:
# - There are more Site_IDs (451) than Locations (431), so some locations may
#   host several sites. Need to check for double counting.
# - The survey covers 7 survey waves (years).
# - Period has 5 values; these should be synchronised with the other city count
#   data, using the London period-of-day definition as the base.
# - Weather has 124 distinct free-text labels that need to be consolidated.

Survey_wave_(year) :  7
Site_ID :  451
Location :  431
Survey_date :  435
Weather :  124
Time :  64
Period :  5
Direction :  4
Number_of_male_cycles :  55
Number_of_female_cycles :  20
Number_of_unknown_cycles :  11
Total_cycles :  65
Day_of_week :  7
month :  12
season :  4


In [47]:
# Consolidate the free-text Weather descriptions for outer London into the
# categories 'Rain', 'Good', 'Damp', 'Dangerous_Conditions' and 'Unknown'.
# The steps run in the order listed in the chain at the bottom; a label that an
# earlier step has already relabelled can no longer be matched by a later one.

# Rain
outer_rain_labels = [
    'Wet','Showers','Rain','Cloudy + Rain','Rain & Cloudy',
    'Raining', 'Rain/cloudy','Wet/thunder','Light Showers',
    'Rain/showers','W','Wey','Drizzle/shower','Rainy',
    'wet','Cloudy with showers','Generally overcast brief shower',
    'Heavy Rain','Heavy Shower','Heavy Shr','Down Pour',
    'Deluge','Heavy Showers', 'Shower','Rain Heavy Showers',
    'Intermitent Showers','Thunder Lightening Rain!','Very Wet',
    'V.wet','Heavy Downpour/rain','Showery','Wet Heavy Rain',
    'Wet (heavy Rain)',
]

# Good
outer_good_labels = [
    'Cloudy','Sunny Overcast Sunny','Sunny/cloudy','Cloudy/rain/sunny',
    'Cloudy + Sunny','Sunny + Cloudy', 'Cloudy/sunny',
    'Bright + Cloudy','Cloudy/dry','Partly Sunny','Dull','Dry & Mild',
    'Cloud','Overcast','Mild','Overcast (No Rain)',
    'Cloudy bright intervals','Generally overcast',
    'Cloudy with clear spells','Sunny Overcast','Dry',
    'Dry/mild', 'Clear','Cloudy with clear spells',
    'Sunny Overcast', 'Sunny','Cloudy Sunny','Sun Setting','Good',
    'Dry/sunny','Fine + Dry', 'Fine + Hot','Bright','Dry Hot!!',
    'Dry & Sunny','Dry & Sun','Fine & Dry','Good/dry','Sun',
    'Sunny Dry','Clear and Bright', 'Fine', 'Dry/good',
    'Fine/dry','Warm + Dry','D',
]

# Damp
outer_damp_labels = [
    'Wet/dry','Intermittent Light Drizzle',
    'Light Rain','Lt Rain','Drizzle','Intermittent Drizzle', 'Damp',
    'Getting Dry','Dry & Wet','Dry/wet',
]

# Dangerous Conditions
# NOTE(review): 'Heavy Rain' is also in the Rain list above, which is applied
# first, so it ends up as 'Rain'. Confirm the intended category.
outer_dangerous_labels = [
    'Dry Chill','Dry/cold','Dry Cold','Cold/sunny','Cold/cloudy',
    'Dry Very Windy', 'Dry/windy','Windy','Cold','Cloudy/windy',
    'Windy + Sunny','Sunsetting + Windy','Dark Cloudy',
    'Dry V. Cold!','Very Cool','Wet/windy','Wet/v.windy','Wet Hail',
    'Rain/hail','Foggy Wet','Wet Heavy Wind', 'Wet-windy','Hailstones',
    'Short Hail Shower','Heavy Rain','Dry/wet Road','Dry With Wet Road',
    'Hot','Snow!','Snow', 'Sleet','Very Hot','Dry (road Wet)',
]

# Unknown ("Dry Dark" variants and 'N/a')
outer_unknown_labels = ['Dry Dark','Dry/dark','Dark/dry','Dark Dry', 'N/a']

outer_london['Weather'] = (
    outer_london['Weather']
    .replace(outer_rain_labels, 'Rain')
    .replace(outer_good_labels, 'Good')
    .replace(outer_damp_labels, 'Damp')
    .replace(outer_dangerous_labels, 'Dangerous_Conditions')
    .fillna('Unknown')  # missing (NaN) weather entries become 'Unknown'
    .replace(outer_unknown_labels, 'Unknown')
)

### Summarising Null Values

In [48]:
# Count the missing values in each column of the New York counts
ny_counts.isna().sum()

# Result: no null values in any column

id             0
date           0
day_of_week    0
month          0
season         0
year           0
hour           0
time_of_day    0
counts         0
dtype: int64

In [49]:
# Count the missing values in each column of the Sydney counts
sydney_counts.isna().sum()

# Result: no null values in any column

SiteID           0
Month            0
Year             0
TotalCount       0
Early_Morning    0
AM_Peak          0
PM_Peak          0
dtype: int64

In [50]:
# Count the missing values in each column of the inner London counts
# (the recorded output below shows 6 nulls in Time and 6 in Period)
inner_london.isna().sum()

Survey_wave_(year)            0
Site_ID                       0
Location                      0
Survey_date                   0
Weather                       0
Time                          6
Period                        6
Direction                     0
Number_of_private_cycles      0
Number_of_cycle_hire_bikes    0
Total_cycles                  0
Day_of_week                   0
month                         0
season                        0
dtype: int64

In [51]:
# Count the missing values in each column of the central London counts
central_london.isna().sum()

# Result: no null values in any column

Survey_wave_(year)            0
Site_ID                       0
Location                      0
Survey_date                   0
Weather                       0
Time                          0
Period                        0
Direction                     0
Number_of_private_cycles      0
Number_of_cycle_hire_bikes    0
Total_cycles                  0
Day_of_week                   0
month                         0
season                        0
dtype: int64

In [52]:
# Count the missing values in each column of the outer London counts
outer_london.isna().sum()

# Result: no null values in any column

Survey_wave_(year)          0
Site_ID                     0
Location                    0
Survey_date                 0
Weather                     0
Time                        0
Period                      0
Direction                   0
Number_of_male_cycles       0
Number_of_female_cycles     0
Number_of_unknown_cycles    0
Total_cycles                0
Day_of_week                 0
month                       0
season                      0
dtype: int64

In [53]:
# Review the metadata (columns, dtypes, memory usage) of every DataFrame again,
# in the same order as before: New York, inner/central/outer London, Sydney
for frame in (ny_counts, inner_london, central_london, outer_london, sydney_counts):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4167507 entries, 0 to 4167506
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   day_of_week  object        
 3   month        int64         
 4   season       object        
 5   year         int64         
 6   hour         int64         
 7   time_of_day  object        
 8   counts       int64         
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 286.2+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 523624 entries, 36864 to 378879
Data columns (total 14 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   Survey_wave_(year)          523624 non-null  int32         
 1   Site_ID                     523624 non-null  object        
 2   Location                    523624 non-null  object        
 3   Survey_da

## Merging DataFrames

### Concatenate London DataFrames

In [54]:
# Stack the three London DataFrames (outer, central, inner) row-wise first and
# check the output, as this combined frame is the base DataFrame.
# Each frame keeps its own index labels, so the combined index is not unique.
london_frames = [outer_london, central_london, inner_london]
london = pd.concat(london_frames)

In [55]:
# View the metadata of the concatenated London DataFrame
london.info()

# The London row counts all add up correctly, so the concatenation was successful.
# Some null values are now expected because the inner and central London
# DataFrames did not have gender information.
# Some null values are now expected because the outer London DataFrame did not
# have cycle ownership/rental information.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1657382 entries, 6144 to 378879
Data columns (total 17 columns):
 #   Column                      Non-Null Count    Dtype         
---  ------                      --------------    -----         
 0   Survey_wave_(year)          1657382 non-null  int64         
 1   Site_ID                     1657382 non-null  object        
 2   Location                    1657382 non-null  object        
 3   Survey_date                 1657382 non-null  datetime64[ns]
 4   Weather                     1657382 non-null  object        
 5   Time                        1657376 non-null  object        
 6   Period                      1657376 non-null  object        
 7   Direction                   1657382 non-null  object        
 8   Number_of_male_cycles       375659 non-null   float64       
 9   Number_of_female_cycles     375659 non-null   float64       
 10  Number_of_unknown_cycles    375659 non-null   float64       
 11  Total_cycles          

In [56]:
# Count the missing values in each column of the combined London DataFrame
london.isna().sum()

# Everything aligns as expected.
# The 6 rows where Time and Period are null will be dropped: removing 6 rows out
# of roughly 1.66 million (1,657,382) will not affect the analysis and does not
# merit the time to investigate.
# Gender and cycle ownership data were already missing from some of the original
# DataFrames; the null counts below (1,281,723 and 375,659) add up to the total
# number of rows, matching where that data was missing in the originals.

Survey_wave_(year)                  0
Site_ID                             0
Location                            0
Survey_date                         0
Weather                             0
Time                                6
Period                              6
Direction                           0
Number_of_male_cycles         1281723
Number_of_female_cycles       1281723
Number_of_unknown_cycles      1281723
Total_cycles                        0
Day_of_week                         0
month                               0
season                              0
Number_of_private_cycles       375659
Number_of_cycle_hire_bikes     375659
dtype: int64

In [57]:
# Remove the rows that have no value in the Time column
# (6 rows according to the null counts shown above)
london = london.dropna(axis='index', how='any', subset=['Time'])

In [58]:
# Load the spatial data that will be merged with the London count data.
# The workbook is read from the current working directory.
bike_site_workbook = "London_Biking_sites_reconv.xlsx"
bike_site = pd.read_excel(bike_site_workbook)

# This file contains additional data: the base data's spatial information
# has been converted into longitude and latitude.

In [59]:
# Look at the metadata (columns, non-null counts, dtypes) of the spatial data
bike_site.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023 entries, 0 to 2022
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UnqID                    2023 non-null   object 
 1   ProgID                   2023 non-null   object 
 2   SurveyDescription        2023 non-null   object 
 3   Easting                  2023 non-null   float64
 4   Northing                 2023 non-null   float64
 5   latitude                 2023 non-null   float64
 6   longitude                2023 non-null   float64
 7   Location                 2023 non-null   object 
 8   Borough                  2023 non-null   object 
 9   Functional cycling area  2021 non-null   object 
dtypes: float64(4), object(6)
memory usage: 158.2+ KB


In [60]:
# Count the missing values in each column of the site table
bike_site.isna().sum()

UnqID                      0
ProgID                     0
SurveyDescription          0
Easting                    0
Northing                   0
latitude                   0
longitude                  0
Location                   0
Borough                    0
Functional cycling area    2
dtype: int64

In [61]:
# Show five randomly chosen site rows as a sanity check
bike_site.sample(n=5)

Unnamed: 0,UnqID,ProgID,SurveyDescription,Easting,Northing,latitude,longitude,Location,Borough,Functional cycling area
1310,OUTCY027,OUTCY,Outer area cycle surveys,525837.763292,169102.364207,51.40699,-0.192136,Morden Road,Merton,Outer
1905,QWPCY171,QWPCY,Quietway cycle surveys,512260.47,181056.74,51.51729,-0.383476,Grand Union Canal (Spikes Bridge moorings),Ealing,Outer
1484,OUTCY201,OUTCY,Outer area cycle surveys,517645.371641,185310.044807,51.55442,-0.304482,Hawardene Road,Brent,Outer
1989,QWPCY255,QWPCY,Quietway cycle surveys,514728.0,182197.0,51.527041,-0.347551,Ruislip Road East,Ealing,Outer
43,CENCY044,CENCY,Central area cycle surveys,531647.29,181040.0,51.512958,-0.104224,New Bridge Street,City of London,Central


In [62]:
# Give the site identifier the same name as in the count data so it can act as the merge key
bike_site = bike_site.rename(columns={"UnqID": "Site_ID"})

In [63]:
# Swap every space in the column names for an underscore (plain text replacement, no regex)
bike_site.columns = bike_site.columns.str.replace(' ', '_', regex=False)

In [64]:
# Merge the spatial data into the count data using Site_ID as the key.
# A left join keeps every count row, even when its site has no spatial record.
# validate="many_to_one" makes pandas raise a MergeError if a Site_ID appears more than
# once in bike_site, which would otherwise silently duplicate count rows.
london_complete = pd.merge(london, bike_site, on="Site_ID", how="left", validate="many_to_one")

In [65]:
# View the metadata of the combined DataFrame; the row count should match `london`
# because the spatial data was attached with a left join
london_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1657376 entries, 0 to 1657375
Data columns (total 26 columns):
 #   Column                      Non-Null Count    Dtype         
---  ------                      --------------    -----         
 0   Survey_wave_(year)          1657376 non-null  int64         
 1   Site_ID                     1657376 non-null  object        
 2   Location_x                  1657376 non-null  object        
 3   Survey_date                 1657376 non-null  datetime64[ns]
 4   Weather                     1657376 non-null  object        
 5   Time                        1657376 non-null  object        
 6   Period                      1657376 non-null  object        
 7   Direction                   1657376 non-null  object        
 8   Number_of_male_cycles       375659 non-null   float64       
 9   Number_of_female_cycles     375659 non-null   float64       
 10  Number_of_unknown_cycles    375659 non-null   float64       
 11  Total_cycles            

In [68]:
# Show two random rows of the merged data.
# NOTE(review): this cell's execution count (68) is higher than that of the drop cell
# below (67), so the stored output was produced after those columns were removed.
london_complete.sample(n=2)

Unnamed: 0,Survey_wave_(year),Site_ID,Location_x,Survey_date,Weather,Time,Period,Direction,Number_of_male_cycles,Number_of_female_cycles,...,Day_of_week,month,season,Number_of_private_cycles,Number_of_cycle_hire_bikes,SurveyDescription,latitude,longitude,Borough,Functional_cycling_area
1655524,2017,INNCY454,Upland Road,2021-12-07,Unknown,1100 - 1115,Inter-peak (10:00-16:00),Eastbound,,,...,Tuesday,12,Winter,0.0,0.0,Inner area cycle surveys,51.457236,-0.06882,Southwark,Inner
1105315,2021,CENCY197,Juxton Street,2021-10-09,Good,1545 - 1600,Inter-peak (10:00-16:00),Eastbound,,,...,Saturday,10,Autumn,2.0,0.0,Central area cycle surveys,51.494097,-0.115769,Lambeth,Central


In [67]:
# Discard the spatial columns that duplicate information already held elsewhere in the frame
london_complete = london_complete.drop(columns=['ProgID', 'Easting', 'Northing', 'Location_y'])

In [69]:
# Lower-case every column name so the columns are easier to recall during analysis
london_complete = london_complete.rename(str.lower, axis='columns')

In [72]:
# Give two columns more logical names; the underlying data dictionary is unchanged
london_complete = london_complete.rename(
    columns={
        'location_x': 'location',
        'survey_wave_(year)': 'survey_year',
    }
)

In [73]:
# Confirm the renamed, lower-cased columns on two random rows
london_complete.sample(n=2)

Unnamed: 0,survey_year,site_id,location,survey_date,weather,time,period,direction,number_of_male_cycles,number_of_female_cycles,...,day_of_week,month,season,number_of_private_cycles,number_of_cycle_hire_bikes,surveydescription,latitude,longitude,borough,functional_cycling_area
1233971,2016,INNCY367,Brisbane Avenue,2016-04-28,Good,0715 - 0730,AM peak (07:00-10:00),Southbound,,,...,Thursday,4,Spring,0.0,0.0,Inner area cycle surveys,51.414532,-0.19542,Merton,Inner
1639010,2021,INNCY194,Mortimer Road,2021-07-21,Good,1030 - 1045,Inter-peak (10:00-16:00),Westbound,,,...,Wednesday,7,Summer,1.0,0.0,Inner area cycle surveys,51.531983,-0.218202,Brent,Inner


In [75]:
# Write the combined DataFrame (index included) to a CSV file in the working folder as a backup
london_complete.to_csv(path_or_buf='london_count_and_site_Saurav_071022.csv')

# Exploring Twitter Data about Cycling

The task here is to extract the most recent posts on Twitter about cycling. From these, the objectives are to:
- Plot on a map where the topic is trending most, and see whether any of the three cities being analysed are among those locations
- Get an overview of the sentiment expressed in such posts
- Get an overview of the most common words used in such posts

In [None]:
# Copy the YAML file holding the Twitter keys next to this notebook before you start
# (or give the whole path, using / between directories).
# The context manager closes the file handle as soon as the keys have been parsed.
with open('twitter_tmp.yaml', 'r') as creds_file:
    twitter_creds = yaml.safe_load(creds_file)

# NLTK resources needed to investigate the tweets and run the sentiment analysis
nltk.download('stopwords')      # used by stopwords.words('english')
nltk.download('vader_lexicon')  # VADER sentiment lexicon
nltk.download('punkt')          # tokenizer models used by word_tokenize
nltk.download('words')          # English word list used by words.words()
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed version.

In [None]:
# Build an authenticated API client from the keys loaded out of the YAML file.
# NOTE(review): despite its name, this object is created with the same Twitter/OAuth
# classes and the same arguments as `twitter_api` in the next cell; nothing from the
# tweepy module is called here. Confirm whether a real tweepy client was intended.
tweepy_api = Twitter(auth=OAuth(twitter_creds['access_token'],
                                 twitter_creds['access_token_secret'], 
                                 twitter_creds['api_key'],
                                 twitter_creds['api_secret_key'] ))

In [None]:
# Authenticate against Twitter with the keys loaded from the YAML file.
# The argument order is kept: access token, access token secret, API key, API secret key.
twitter_api = Twitter(
    auth=OAuth(
        twitter_creds['access_token'],
        twitter_creds['access_token_secret'],
        twitter_creds['api_key'],
        twitter_creds['api_secret_key'],
    )
)

In [None]:
# Confirm the client object was created.
# NOTE(review): printing presumably shows only the object's text representation and
# does not contact Twitter — confirm before relying on it as a connection check.
print(tweepy_api)

In [None]:
# Confirm the client object was created.
# NOTE(review): printing presumably shows only the object's text representation and
# does not contact Twitter — confirm before relying on it as a connection check.
print(twitter_api)

In [None]:
# Look for the term "cycling" on Twitter, ordered by recency, paging backwards through
# the results 100 tweets per request.
# Intended filter: only tweets which have point coordinates or Twitter places.
# NOTE(review): 'has': 'geo' is simply passed through as a request parameter; confirm
# the search endpoint honours it (the metadata check further down notes little geo data).

# Note that the free Twitter API being used here only allows access to the seven most recent days of tweets

q = {'q':'cycling', 'count':100, 'result_type':'recent', 'has':'geo'}

# Collected response pages, one dict per request.
results = []

while len(results) < 40:
    query = twitter_api.search.tweets(**q)
    # Store the page before looking for the next one so the final page is not lost.
    results.append(query)

    # The last available page carries no 'next_results' entry; stop paging there
    # instead of failing with a KeyError.
    next_results = query['search_metadata'].get('next_results')
    if not next_results:
        break
    q['max_id'] = next_results.split('&')[0].split('?max_id=')[1]

# Determine the number of result pages retrieved.
len(results)

In [None]:
# Turn the 'statuses' list of every response page into its own DataFrame,
# then stack them into a single DataFrame for the NLP analysis
status_frames = [pd.DataFrame(page['statuses']) for page in results]
df2 = pd.concat(status_frames)

In [None]:
# Check the metadata of the tweet DataFrame (columns, non-null counts, dtypes)
df2.info()

# Observation: not much geo data was captured.
# TODO: leave as is for now and return to this later.
# TODO: check with Norah.

In [None]:
# Keep only the tweet bodies ('text' column) as an array.
# `df2` is rebound from a DataFrame to an array here, so the type check keeps the cell
# safe to re-run: a second run would otherwise fail on df2['text'].
if isinstance(df2, pd.DataFrame):
    df2 = df2['text'].values

In [None]:
# View one raw result (the second element of the array).
# This confirms that the body of the post is what is stored here.
df2[1]

## Prepare the data for NLP & Sentiment Analysis

In [None]:
# English stopwords, held in a set for fast membership tests
stop_words = {*stopwords.words('english')}

In [None]:
# Tokenise every tweet into its individual words.
df2_token = list(map(word_tokenize, df2))

# Show a single tokenised tweet rather than all of them, to keep the workbook small
df2_token[5]

In [None]:
# Set of English words; any token not found in it is excluded later on.
all_english_words = {*words.words()}

In [None]:
# Pre-processing, tweet by tweet:
# - lower-case every token
# - keep it only if the token is purely alphabetic, is not a stopword,
#   and appears in the list of English words.

df2_token_nostop = []
for tweet_tokens in df2_token:
    kept_tokens = []
    for token in tweet_tokens:
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words and lowered in all_english_words:
            kept_tokens.append(lowered)
    df2_token_nostop.append(kept_tokens)

In [None]:
# View the cleaned tokens of a single tweet (the third one) as a spot check
df2_token_nostop[2]

In [None]:
# Create a variable to store the Sentiment Intensity Analyser.
# The import cell imports SentimentIntensityAnalyzer twice (from nltk.sentiment.vader and
# then from vaderSentiment.vaderSentiment); the later import is the one bound here.
darth_vader = SentimentIntensityAnalyzer()

In [None]:
# Score every cleaned tweet:
# - re-join its tokens into one string, which becomes the dictionary key
# - run the polarity score function on that string and store the returned
#   dictionary of scores as the value.
# Tweets that clean down to the same string share a single entry.

df2_polarity = {}
for tweet_tokens in df2_token_nostop:
    tweet_text = " ".join(tweet_tokens)
    df2_polarity[tweet_text] = darth_vader.polarity_scores(tweet_text)

In [None]:
# Build a DataFrame from the dictionary of score dictionaries and transpose it,
# so each row is one cleaned tweet (the index) and each column is one score.
# The compound score indicates the overall sentiment.

polarity_pd = pd.DataFrame(df2_polarity).transpose()

# View the DataFrame
polarity_pd

### Visualising the data with Charts

Sentiment

In [None]:
# Visualise the output in a distribution
%matplotlib inline
import matplotlib.pyplot as plt

_plot = polarity_pd.reset_index()['compound'].sort_values()
ax1 = plt.axes()
_plot.plot(kind='bar')

x_axis = ax1.axes.get_xaxis()
x_axis.set_visible(False)

plt.show()
plt.close()

- Most values are 0 (neutral) - blank spaces
- More positive sentiment than negative sentiment amongst non-neutral values
- Some very strong positive sentiment > 0.75
- No very strong negative sentiment visible < -0.75

A histogram will show the distribution of sentiment more clearly. The strictly neutral values (a compound score of exactly 0) should be removed first, so that they do not dominate the plot and the histogram is easier to read.

In [None]:
# Blank out compound scores that are exactly zero (they become NaN).
# This removes all strictly neutral tweets from the histogram, improves its scale,
# and better highlights the distribution of the remaining polarity values (= sentiment).
polarity_pd['compound'] = polarity_pd['compound'].where(polarity_pd['compound'].ne(0))

In [None]:
# Histogram of the non-neutral compound scores.
# The bin edges are 0.25 wide and include an edge at 0, so negative and positive
# sentiment never share a bin.
sentiment_bins = [-1, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1]

fig, ax = plt.subplots(figsize=(8, 6))
polarity_pd['compound'].hist(bins=sentiment_bins, ax=ax, color="blue")
ax.set_title("Non Neutral Sentiments from most recent Tweets on Cycling")

plt.show()

Top Words Visualised in a WordCloud

In [None]:
# Check that the cleaned words are stored in a list, which is then used to create the WordCloud
print(type(df2_token_nostop))

# Confirmed: the words are in a list

In [None]:
# View the first few cleaned tweets only; printing the whole list would flood the
# notebook with every token of every tweet
df2_token_nostop[:5]

In [None]:
# Flatten the per-tweet token lists into one space-separated string for the word cloud.
# Joining the words themselves (rather than the text form of each list) keeps list
# punctuation such as brackets, quotes and commas out of the text.
s = ' '.join(word for tweet_tokens in df2_token_nostop for word in tweet_tokens)

In [None]:
# Configure the word cloud (at most 100 words, font size capped at 50, black background)
# and then lay it out from the combined tweet text
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="black")
wordcloud = wordcloud.generate(s)

In [None]:
# Load the bike picture as a pixel array; it is used to customise the word cloud colouring
bike_image = Image.open('bike.png')
bike = np.array(bike_image)

In [None]:
# Review: Display the WordCloud, coloured from the bike image where possible.
# The cloud was already generated in an earlier cell, so it is not generated again here.
image_colors = ImageColorGenerator(bike)

# The colour generator samples the image at each word's position on the canvas, so it
# is only applied when the image is a colour image at least as large as the canvas.
if bike.ndim == 3 and bike.shape[0] >= wordcloud.height and bike.shape[1] >= wordcloud.width:
    wordcloud.recolor(color_func=image_colors)
else:
    print("bike.png cannot be used for colouring (grey-scale or smaller than the canvas); keeping default colours")

plt.figure(figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud, interpolation="bilinear")
plt.show()

# TODO: ask Kevin to prettify if possible

## Insights from Geo Data

The objective is to see where in the world Twitter users most frequently discuss the topic of cycling.

In [None]:
# Waiting for Norah to input

# Initial Insights from the count data