# CO2_LSE Employer Project - ThoughtWorks

# Data Analysis for London Data (Central London, Inner London, Biking Sites)

In [738]:
# Importing Numpy, Pandas, Seaborn and Matplotlib libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import warnings
warnings.filterwarnings('ignore')

## Data Exploration and Cleaning of Central, Inner London and Biking sites Datasets

In [739]:
# Central London Dataset
# Loading csv into a dataframe
cent_lon = pd.read_csv("Central London.csv")
inn_lon = pd.read_csv("Inner London.csv")
bike_site = pd.read_excel("Biking sites.xlsx")

### Central London Dataset

In [740]:
# Viewing the dataset

cent_lon.head()

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6.0,0.0,0.0,0.0,0.0,,,
1,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6.0,15.0,15.0,0.0,15.0,,,
2,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6.0,30.0,35.0,0.0,35.0,,,
3,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6.0,45.0,59.0,2.0,61.0,,,
4,2014 Q1 (January-March),2013-14 Q4,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7.0,0.0,73.0,0.0,73.0,,,


In [741]:
# Printing shape of central london dataset
print(cent_lon.shape)

(1048366, 17)


In [742]:
# Concise summary of Central London Dataset
cent_lon.info

<bound method DataFrame.info of         Survey wave (calendar quarter) Equivalent financial quarter   Site ID  \
0              2014 Q1 (January-March)                   2013-14 Q4  CENCY001   
1              2014 Q1 (January-March)                   2013-14 Q4  CENCY001   
2              2014 Q1 (January-March)                   2013-14 Q4  CENCY001   
3              2014 Q1 (January-March)                   2013-14 Q4  CENCY001   
4              2014 Q1 (January-March)                   2013-14 Q4  CENCY001   
...                                ...                          ...       ...   
1048361                            NaN                          NaN       NaN   
1048362                            NaN                          NaN       NaN   
1048363                            NaN                          NaN       NaN   
1048364                            NaN                          NaN       NaN   
1048365                            NaN                          NaN       NaN

In [743]:
# Total Number of Null values in 
cent_lon.isnull().sum()

Survey wave (calendar quarter)     290203
Equivalent financial quarter       290203
Site ID                            290203
Location                           290203
Survey date                        300359
Weather                            302037
Time                               290203
Period                             290203
Direction                          290203
Start hour                         290203
Start minute                       290203
Number of private cycles           290267
Number of cycle hire bikes         290267
Total cycles                       290203
Unnamed: 14                       1048366
Unnamed: 15                       1048366
Unnamed: 16                       1048366
dtype: int64

In [744]:
# Unique Values in Central London Dataset

for col in cent_lon:
  print(col,": ", cent_lon[col].nunique(), cent_lon[col].unique())

Survey wave (calendar quarter) :  31 ['2014 Q1 (January-March)' '2014 Q2 (April-June)'
 '2014 Q3 (July-September)' '2014 Q4 (October-December)'
 '2015 Q1 (January-March)' '2015 Q2 (April-June)'
 '2015 Q3 (July-September)' '2015 Q4 (October-December)'
 '2016 Q1 (January-March)' '2016 Q2 (April-June)'
 '2016 Q3 (July-September)' '2016 Q4 (October-December)'
 '2017 Q1 (January-March)' '2017 Q2 (April-June)'
 '2017 Q3 (July-September)' '2017 Q4 (October-December)'
 '2018 Q1 (January-March)' '2018 Q2 (April-June)'
 '2018 Q3 (July-September)' '2018 Q4 (October-December)'
 '2019 Q1 (January-March)' '2019 Q2 (April-June)'
 '2019 Q3 (July-September)' '2019 Q4 (October-December)'
 '2020 Q1 (January-March)' '2020 Q3 (July-September)'
 '2021 Q2 (April-June)' '2021 Q2 (April-June) ' '2021 Q3 (July-September)'
 '2021 Q3 (July-September) ' '2021 Q4 (October-December) ' nan]
Equivalent financial quarter :  29 ['2013-14 Q4' '2014-15 Q1' '2014-15 Q2' '2014-15 Q3' '2014-15 Q4'
 '2015-16 Q1' '2015-16 Q2' 

Time :  64 ['0600 - 0615' '0615 - 0630' '0630 - 0645' '0645 - 0700' '0700 - 0715'
 '0715 - 0730' '0730 - 0745' '0745 - 0800' '0800 - 0815' '0815 - 0830'
 '0830 - 0845' '0845 - 0900' '0900 - 0915' '0915 - 0930' '0930 - 0945'
 '0945 - 1000' '1000 - 1015' '1015 - 1030' '1030 - 1045' '1045 - 1100'
 '1100 - 1115' '1115 - 1130' '1130 - 1145' '1145 - 1200' '1200 - 1215'
 '1215 - 1230' '1230 - 1245' '1245 - 1300' '1300 - 1315' '1315 - 1330'
 '1330 - 1345' '1345 - 1400' '1400 - 1415' '1415 - 1430' '1430 - 1445'
 '1445 - 1500' '1500 - 1515' '1515 - 1530' '1530 - 1545' '1545 - 1600'
 '1600 - 1615' '1615 - 1630' '1630 - 1645' '1645 - 1700' '1700 - 1715'
 '1715 - 1730' '1730 - 1745' '1745 - 1800' '1800 - 1815' '1815 - 1830'
 '1830 - 1845' '1845 - 1900' '1900 - 1915' '1915 - 1930' '1930 - 1945'
 '1945 - 2000' '2000 - 2015' '2015 - 2030' '2030 - 2045' '2045 - 2100'
 '2100 - 2115' '2115 - 2130' '2130 - 2145' '2145 - 2200' nan]
Period :  5 ['Early Morning (06:00-07:00)' 'AM peak (07:00-10:00)'
 'Inter-

In [745]:
# Count of Unique values in each Column

for col in cent_lon:
  print(col,": ", cent_lon[col].nunique())

Survey wave (calendar quarter) :  31
Equivalent financial quarter :  29
Site ID :  210
Location :  205
Survey date :  1597
Weather :  283
Time :  64
Period :  5
Direction :  4
Start hour :  16
Start minute :  4
Number of private cycles :  435
Number of cycle hire bikes :  87
Total cycles :  479
Unnamed: 14 :  0
Unnamed: 15 :  0
Unnamed: 16 :  0


#### Cleaning Dataset

In [746]:
# Dropping unnecessary last 3 columns from Central London
cent_lon.drop(['Unnamed: 14','Unnamed: 15','Unnamed: 16'], axis=1, inplace=True)

#### Dropping Rows

In [747]:
# Viewing Rows that contain ALL Null Values

cent_lon[cent_lon['Location'].isnull()] #'Location' column is used randomly here, any column can be used here

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
758163,,,,,,,,,,,,,,
758164,,,,,,,,,,,,,,
758165,,,,,,,,,,,,,,
758166,,,,,,,,,,,,,,
758167,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048361,,,,,,,,,,,,,,
1048362,,,,,,,,,,,,,,
1048363,,,,,,,,,,,,,,
1048364,,,,,,,,,,,,,,


In [748]:
# Dropping Rows with all NUll values
cent_lon.drop(cent_lon.index[758163:1048366], inplace=True)

# Verifying if any rows with all Null values are left
cent_lon[cent_lon['Location'].isnull()]

Unnamed: 0,Survey wave (calendar quarter),Equivalent financial quarter,Site ID,Location,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles


In [749]:
# Verifying if any Null value left in the entire dataset

cent_lon.isnull().sum()

Survey wave (calendar quarter)        0
Equivalent financial quarter          0
Site ID                               0
Location                              0
Survey date                       10156
Weather                           11834
Time                                  0
Period                                0
Direction                             0
Start hour                            0
Start minute                          0
Number of private cycles             64
Number of cycle hire bikes           64
Total cycles                          0
dtype: int64

In [750]:
# All the rows with any NaN values

is_NaN = cent_lon.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = cent_lon[row_has_NaN]

print(rows_with_NaN)

       Survey wave (calendar quarter) Equivalent financial quarter   Site ID  \
35264            2014 Q2 (April-June)                   2014-15 Q1  CENCY079   
35265            2014 Q2 (April-June)                   2014-15 Q1  CENCY079   
35266            2014 Q2 (April-June)                   2014-15 Q1  CENCY079   
35267            2014 Q2 (April-June)                   2014-15 Q1  CENCY079   
35268            2014 Q2 (April-June)                   2014-15 Q1  CENCY079   
...                               ...                          ...       ...   
718838       2021 Q3 (July-September)                   2021-22 Q2  CENCY112   
718839       2021 Q3 (July-September)                   2021-22 Q2  CENCY112   
718840       2021 Q3 (July-September)                   2021-22 Q2  CENCY112   
718841       2021 Q3 (July-September)                   2021-22 Q2  CENCY112   
718842       2021 Q3 (July-September)                   2021-22 Q2  CENCY112   

                Location    Survey date

In [751]:
# Replacing Null Values in columns with 0's
# Performing this step because dropping ALL null values Rows will miss out on Number of Cycles

# 'Survey date' column
cent_lon["Survey date"].fillna(0, inplace=True)

# 'Weather' column
cent_lon["Weather"].fillna(0, inplace=True)

# 'Number of private cycles'
cent_lon["Number of private cycles"].fillna(0, inplace=True)

# 'Number of cycle hire bikes'
cent_lon["Number of cycle hire bikes"].fillna(0, inplace=True)


In [752]:
# Viewing Dataframe to see if there are any Null values left
cent_lon.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 758163 entries, 0 to 758162
Data columns (total 14 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Survey wave (calendar quarter)  758163 non-null  object 
 1   Equivalent financial quarter    758163 non-null  object 
 2   Site ID                         758163 non-null  object 
 3   Location                        758163 non-null  object 
 4   Survey date                     758163 non-null  object 
 5   Weather                         758163 non-null  object 
 6   Time                            758163 non-null  object 
 7   Period                          758163 non-null  object 
 8   Direction                       758163 non-null  object 
 9   Start hour                      758163 non-null  float64
 10  Start minute                    758163 non-null  float64
 11  Number of private cycles        758163 non-null  float64
 12  Number of cycle 

In [753]:
# Dropping 'Equivalent financial quarter' column
cent_lon.drop(['Equivalent financial quarter'], axis=1, inplace=True)

#### Renaming Column names

In [754]:
# Renaming Columns for cent_lon using PEP-8 variable naming convention
cent_lon.rename(columns = {"Survey wave (calendar quarter)": "calendar_year",
                          "Site ID": "site_id",
                          "Location":"location",
                          "Survey date":"survey_date",
                          "Weather":"weather",
                          "Time":"time",
                          "Period":"period",
                          "Direction":"direction",
                          "Start hour":"start_hour",
                          "Start minute":"start_minute",
                          "Number of private cycles":"cycles_private",
                          "Number of cycle hire bikes":"cycles_hire",
                          "Total cycles":"cycles_total"},
                        inplace=True)

# Viewing the Dataframe
cent_lon.head()

Unnamed: 0,calendar_year,site_id,location,survey_date,weather,time,period,direction,start_hour,start_minute,cycles_private,cycles_hire,cycles_total
0,2014 Q1 (January-March),CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6.0,0.0,0.0,0.0,0.0
1,2014 Q1 (January-March),CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6.0,15.0,15.0,0.0,15.0
2,2014 Q1 (January-March),CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6.0,30.0,35.0,0.0,35.0
3,2014 Q1 (January-March),CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6.0,45.0,59.0,2.0,61.0
4,2014 Q1 (January-March),CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7.0,0.0,73.0,0.0,73.0


##### Extracting 'Year'  from calendar_year

In [755]:
# Segregating Start and End time  from 'time' column

# making a temporary data frame for segregation
temp_df_year = cent_lon["calendar_year"].str.split(" " , n = 1, expand = True)

# Adding new columns in main dataframe for Start and End time of survey
# start time Column

cent_lon["calendar_year"] = temp_df_year[0]

# No need for Quarter information as it is not present in other datasets 

In [756]:
cent_lon

Unnamed: 0,calendar_year,site_id,location,survey_date,weather,time,period,direction,start_hour,start_minute,cycles_private,cycles_hire,cycles_total
0,2014,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6.0,0.0,0.0,0.0,0.0
1,2014,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6.0,15.0,15.0,0.0,15.0
2,2014,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6.0,30.0,35.0,0.0,35.0
3,2014,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6.0,45.0,59.0,2.0,61.0
4,2014,CENCY001,Millbank (south of Thorney Street),"ven, 24/01/14",Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7.0,0.0,73.0,0.0,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
758158,2021,CENCY702,Haymarket,"mar, 21/12/21",Dry,2045 - 2100,Evening (19:00-22:00),Southbound,20.0,45.0,22.0,1.0,23.0
758159,2021,CENCY702,Haymarket,"mar, 21/12/21",Dry,2100 - 2115,Evening (19:00-22:00),Southbound,21.0,0.0,20.0,0.0,20.0
758160,2021,CENCY702,Haymarket,"mar, 21/12/21",Dry,2115 - 2130,Evening (19:00-22:00),Southbound,21.0,15.0,16.0,1.0,17.0
758161,2021,CENCY702,Haymarket,"mar, 21/12/21",Dry,2130 - 2145,Evening (19:00-22:00),Southbound,21.0,30.0,10.0,1.0,11.0


##### Segregating Survey Date Column

In [757]:
# Segregating days and date from 'Survey_Date' column

# making a temporary data frame for segregation
temp_df_date = cent_lon["survey_date"].str.split("," , n = 1, expand = True)

# Adding new columns in main dataframe for days and dates of survey

# Day Column
cent_lon["day_survey"] = temp_df_date[0]

# Date Column
cent_lon["date_survey"] = temp_df_date[1]


In [758]:
# Dropping survey_Date column
cent_lon.drop(['survey_date'], axis=1, inplace=True)


##### Segregating Time Column

In [759]:
# Segregating Start and End time  from 'time' column

# making a temporary data frame for segregation
temp_df_time = cent_lon["time"].str.split("-" , n = 1, expand = True)

# Adding new columns in main dataframe for Start and End time of survey
# start time Column
cent_lon["survey_start_time"] = temp_df_time[0]

# end time Column
cent_lon["survey_end_time"] = temp_df_time[1]

In [760]:
# Dropping time column
cent_lon.drop(['time'], axis=1, inplace=True)

##### Segregating Period Column

In [761]:
# Segregating 'Part of the day' and 'Part of the day time'  from 'time' column

# making a temporary data frame for segregation
temp_df_period = cent_lon["period"].str.split("(" , n = 1, expand = True)

# Adding new columns in main dataframe for Start and End time of survey
# start time Column
cent_lon["day_part"] = temp_df_period[0]

# end time Column
cent_lon["day_part_timeslot"] = temp_df_period[1]

In [762]:
# Removing ')' from the end of timeslot
cent_lon["day_part_timeslot"] = cent_lon["day_part_timeslot"].str.replace(')', '')

In [763]:
cent_lon.drop(['period'], axis=1, inplace=True)

In [764]:
cent_lon.head()

Unnamed: 0,calendar_year,site_id,location,weather,direction,start_hour,start_minute,cycles_private,cycles_hire,cycles_total,day_survey,date_survey,survey_start_time,survey_end_time,day_part,day_part_timeslot
0,2014,CENCY001,Millbank (south of Thorney Street),Dry,Northbound,6.0,0.0,0.0,0.0,0.0,ven,24/01/14,600,615,Early Morning,06:00-07:00
1,2014,CENCY001,Millbank (south of Thorney Street),Dry,Northbound,6.0,15.0,15.0,0.0,15.0,ven,24/01/14,615,630,Early Morning,06:00-07:00
2,2014,CENCY001,Millbank (south of Thorney Street),Dry,Northbound,6.0,30.0,35.0,0.0,35.0,ven,24/01/14,630,645,Early Morning,06:00-07:00
3,2014,CENCY001,Millbank (south of Thorney Street),Dry,Northbound,6.0,45.0,59.0,2.0,61.0,ven,24/01/14,645,700,Early Morning,06:00-07:00
4,2014,CENCY001,Millbank (south of Thorney Street),Dry,Northbound,7.0,0.0,73.0,0.0,73.0,ven,24/01/14,700,715,AM peak,07:00-10:00


In [765]:
#### Replacing 'day_survey' data from French to English

# Friday
cent_lon.loc[cent_lon['day_survey'] == 'ven', 'day_survey'] = 'Friday'

# Monday
cent_lon.loc[cent_lon['day_survey'] == 'lun', 'day_survey'] = 'Monday'

# Tuesday
cent_lon.loc[cent_lon['day_survey'] == 'mar', 'day_survey'] = 'Tuesday'

# Wednesday
cent_lon.loc[cent_lon['day_survey'] == 'mer', 'day_survey'] = 'Wednesday'

# Thursday
cent_lon.loc[cent_lon['day_survey'] == 'jeu', 'day_survey'] = 'Thursday'

# Saturday
cent_lon.loc[cent_lon['day_survey'] == 'sam', 'day_survey'] = 'Saturday'

# Sunday
cent_lon.loc[cent_lon['day_survey'] == 'dim', 'day_survey'] = 'Sunday'

In [766]:
cent_lon.head()

Unnamed: 0,calendar_year,site_id,location,weather,direction,start_hour,start_minute,cycles_private,cycles_hire,cycles_total,day_survey,date_survey,survey_start_time,survey_end_time,day_part,day_part_timeslot
0,2014,CENCY001,Millbank (south of Thorney Street),Dry,Northbound,6.0,0.0,0.0,0.0,0.0,Friday,24/01/14,600,615,Early Morning,06:00-07:00
1,2014,CENCY001,Millbank (south of Thorney Street),Dry,Northbound,6.0,15.0,15.0,0.0,15.0,Friday,24/01/14,615,630,Early Morning,06:00-07:00
2,2014,CENCY001,Millbank (south of Thorney Street),Dry,Northbound,6.0,30.0,35.0,0.0,35.0,Friday,24/01/14,630,645,Early Morning,06:00-07:00
3,2014,CENCY001,Millbank (south of Thorney Street),Dry,Northbound,6.0,45.0,59.0,2.0,61.0,Friday,24/01/14,645,700,Early Morning,06:00-07:00
4,2014,CENCY001,Millbank (south of Thorney Street),Dry,Northbound,7.0,0.0,73.0,0.0,73.0,Friday,24/01/14,700,715,AM peak,07:00-10:00


#### Converting data type of 'survey_date' column to date format

In [767]:
# Importing Datetime module

from datetime import datetime

# converting 'survey date' column from object to date

cent_lon['date_survey'] = pd.to_datetime(cent_lon['date_survey'])

In [768]:
# Replace some obvious duplications in weather <Saurav's Code>
# Rain
cent_lon['weather'] = cent_lon['weather'].replace(['Wet','Cloudy/rain','Rain','Mix Wet/dry','Drizzle',
                                                          'Light Showers', 'Mizzle','Windy/rain','Showers',
                                                          'Wet/dry','Wet/damp','Shower','Drizzle/shower','Rainy',
                                                          'wet','Cloudy with showers','Generally overcast brief shower'
                                                          'Light Rain','Shower/dry','Spitting','Drizzle/cloudy',
                                                          'Dry/wet','Damp', 'Dry/drizzle','Dull/damp','Dry-wet',
                                                          'Wet/mix', 'Drizzle/wet','Wet/windy','Rain Shower',
                                                          'Intermittent Showers','Cloudy/drizzle','Rain/drizzle',
                                                          'Wet Road','Drizzle/dry','Drizzle/rain','Mixed Sunny + Rain',
                                                          'Wet/rain', 'V Light Drizzle', 'Rainy', 'W','Slight Drizzle',
                                                          'Rain Stopped', 'Stopped Raining','Wet Rain Stopped','Raining/wet',
                                                          'Showery','Overcast/rain','Rain/wet','Rain/showers','Showers/sunny',
                                                          'Drizzle/showers','Wet/stop Raining','Drizzle Rain','Drizzle Wet',
                                                          'Damp/sun','Raining','Dry + Wet','Showers/cloudy','Cloudy/showers',
                                                          'Getting Wet','Wet Road:sun','Dry But Wet Road','Drizze',
                                                          'wet','Wettish','Light Rain','S.wet','S/w','Cold/rain',
                                                           'Slightly Wet','Road Wet','Light Shower','Rain Damp','Wet Damp',
                                                              'Wet - Dry','Dry - Wet','Rain Dry','Dry - Rain','Damp - Rain',
                                                              'Wet/ Dry','S. Wet','Cloudy/ Rain','Windy/ Rain','Wet T',
                                                              'Some Showers','Rains','Sunny/rainy','Wetr','Showers Mix',
                                                              'Rain/dry','Rain/cloudy','Shower/wet','Wetter',
                                                              'Heavy Rain','Heavy Shower','Heavy Shr','Down Pour',
                                                           'Deluge','Heavy Showers', 'Shower','Rain Heavy Showers',
                                                           'Intermitent Showers','Thunder Lightening Rain!','Very Wet',
                                                           'V.wet','Heavy Downpour/rain','Showery','Wet Heavy Rain',
                                                           'Wet (heavy Rain)','Wet (shower)','Blustery','V. Wet',
                                                              'Rain & Thunder','Rain-heavy','H Rain','Wert','(rain After)',
                                                              'Cloud/rain','Really Wet','Periods Of Rain Quite Windy',
                                                              'Steady Rain'],'Rain')

# Good
cent_lon['weather'] = cent_lon['weather'].replace(['Sunny','Cloudy Sunny','Sun Setting','Good','Dry/sunny',
                                                          'Fine + Dry', 'Fine + Hot','Bright','Dry Hot!!',
                                                          'Dry & Sunny','Dry & Sun','Fine & Dry','Good/dry','Sun',
                                                          'Sunny Dry','Clear and Bright', 'Fine', 'Dry/good', 
                                                          'Fine/dry', 'Warm + Dry','Dry','Dry                         9',
                                                          'Sunny','Cloudy/sunny','Druy','Dry/hot','Dry Warm',
                                                          'Dry/sun','Dryish','Clear And Dry','Clear and Dry','Dry, Warm',
                                                          'Dry, Sunny, Warm','Cloudy with Clear Intervals','Clear and Warm',
                                                          'Dry But Misty','Sunny & Warm All Day','Clear','Dry + Sunny',
                                                          'Sunny/dry','Dr Ry','Dry Y','D','Warm/dry','Bright/dry','Dry Sunny',
                                                          'Fair','Dry/sun','Kdry','Fine Windy',
                                                               'Cloudy','Sunny Overcast Sunny','Sunny/cloudy',
                                                               'Cloudy/rain/sunny','Cloudy + Sunny','Sunny + Cloudy',
                                                               'Cloudy/sunny','Bright + Cloudy','Cloudy/dry',
                                                               'Partly Sunny','Dull','Dry & Mild','Cloud','Overcast',
                                                               'Mild','Overcast (No Rain)','Cloudy bright intervals',
                                                               'Generally overcast','Cloudy with clear spells',
                                                               'Sunny Overcast','Dry','Dry/mild', 'Clear',
                                                               'Cloudy and Dry','Partly cloudy but dry',
                                                          'Partly cloudy and dry','Cloudy but dry','Partly cloudy and Dry',
                                                          'Sun/Cloudy','Clouds & Sunny','Sun/clouds','Cloudy & Sunny',
                                                          'Sun & Clouds','Cloudy Dry','Cloud/sun','Mixed','Sun/cloud',
                                                           'Sunny/cloudy','Cloudy Sun','Cloudy/sun','Dry/cloudy',
                                                           'Sun/cloudy','Overcast/dry','Cloud','Dull','Dry/overcast',
                                                          'Dark/cloudy','Cloudy/dry','Cloudy','Hazy','Partly Cloudy',
                                                               'Drty','Dry (windy)','Fine (windy)','Sunny Cloudy',
                                                              'Dry Dark','Dark','Dry Mon','Dry Wed','Dry Thu','Dry Fri',
                                                              'Sun/rain','Thunder','Cloudy','Sunny Overcast Sunny',
                                                               'Sunny/cloudy','Cloudy/rain/sunny',
                                                           'Cloudy + Sunny','Sunny + Cloudy', 'Cloudy/sunny',
                                                           'Bright + Cloudy','Cloudy/dry','Partly Sunny','Dull','Dry & Mild',
                                                           'Cloud','Overcast','Mild','Overcast (No Rain)',
                                                          'Cloudy bright intervals','Generally overcast',
                                                           'Cloudy with clear spells','Sunny Overcast','Dry',
                                                           'Dry/mild', 'Clear','Cloudy and Dry','Partly cloudy but dry',
                                                          'Partly cloudy and dry','Cloudy but dry','Partly cloudy and Dry',
                                                          'Sun/Cloudy','Clouds & Sunny','Sun/clouds','Cloudy & Sunny',
                                                          'Sun & Clouds','Cloudy Dry','Cloud/sun','Mixed','Sun/cloud',
                                                           'Sunny/cloudy','Cloudy Sun','Cloudy/sun','Dry/cloudy',
                                                           'Sun/cloudy','Overcast/dry','Cloud','Dull','Dry/overcast',
                                                          'Dark/cloudy','Cloudy/dry','Cloudy','Hazy','Partly Cloudy',
                                                               'Drty','Dry (windy)','Fine (windy)','Sunny Cloudy',
                                                              'Dry Dark','Dark','Dry Mon','Dry Wed','Dry Thu','Dry Fri',
                                                              'Sun/rain','Thunder','Ddry','Dy','Dry/sunny/cold','Fine Cold',
                                                              'Cold Dry','Dry & Cold','Dry And Fine','Dry And Sunny',
                                                              'Dry And Warm','Fine And Dry','Warm + Sunny','Warm And Humid',
                                                              'Warm And Windy','Overcast And Dull','Cloudy And Warm',
                                                              'Sunny Periods And Warm','Dry And Windy','Dry And Very Windy',
                                                              'Warm Sunny And Windy','Hot And Humid','Mild And Sunny',
                                                               'Warm And Overcast','Sunny & Windy','Windy/cloudy',
                                                              'Dry/gusty','Coldish','Windy/dry','Dry But A Bit Windy',
                                                               'Sunny Cold','Cold At First Then Warm/sunny',
                                                              'Warm & Sunny Chilly Later','Fine + Dry Chilly At First',
                                                               'Fine & Sunny','dry','A Bit Chilly At First',
                                                               'Warm With A Slight Wind','Cold Then Dry And Windy',
                                                               'Dry And Overcast','Warm + Sunny Cloudy + Windy',
                                                              'Dry 3/4 Dry','Sunny Until Evening But Windy',
                                                               'Winds Rather Chilly','Warm','Sunny But Very Windy',
                                                               'Now Starts To Get Chilly'],'Good')


# Light Rain
cent_lon['weather'] = cent_lon['weather'].replace(['Wet/dry','Intermittent Light Drizzle','Light Rain',
                                                           'Lt Rain','Drizzle','Intermittent Drizzle', 'Damp','Getting Dry',
                                                           'Dry & Wet','Slight Drizzle/dry','Wet Intermittently',
                                                               'Light Rain','V Light Rain','Dry Wet Road','Dry A.m Wet P.m',
                                                               'Mist','Road Drying Sun Out','Wetish','Light Shrs',
                                                              'Fine Drizzle','V Light Shrs','L/rain','Rain Stopped-dry',
                                                              'V Lt Rain','V.light Rain','Dry (+brief Speels Of Drizzle',
                                                              'Wet (spitting)','Drizzly Rain','Almost Dry','Damp & Drizzly',
                                                              'Dry Road Wet With Leaves','Wet Drizzle','No Rain Wet Roads',
                                                              'Dry But Wet Roads','Very Light Rain','Light Drizzle',
                                                              'Dry/wet Road Surface','V Light Showers','V. Light Rain',
                                                              'Wet/cloudy','Wet/sunny','Dry Road Still Wet',
                                                              '2 Snowflakes Otherwise Dry','Wet-dry','Dry/drizzly',
                                                              'Wet/light Showers','Wet/drizzle','Wet And Windy',
                                                              'Drizzling','Drizzle Damp','Windy Showery','Wet + Dry',
                                                              'V.light Drizzle','Very Light Drizzle','Drying Up','Wet Again',
                                                              'Cold Sunny Rain','Wet First Then Dry','Wetr First Then Dry',
                                                              'Dry With Intermitent Rain','(drizzle)','Damp/misty/wet',
                                                              'Dry But Rain Threatening','Slight Drizzle Till End',
                                                              'Damp/misty','Cold & Dry Early Rain Later',
                                                              'Wet ','Windy/drizzle','Intermitent Light Showers',
                                                              'Intermitent Light Rain','A Few Rain Showers','Drizzly',
                                                              'Rain Looking Likely','A Few Drops Of Rain'],'Light Rain')

# Dangerous weather
cent_lon['weather'] = cent_lon['weather'].replace(['Heavy Rain','Dry/wet Road','Dry With Wet Road',
                                                           'Hot','Snow!','Snow', 'Sleet','Very Hot',
                                                           'Dry (road Wet)','Dry, Sunny, Hot','Very Heavy Rain',
                                                           'Intermittent Heavy Showers','Very Hot/dry','Hot/dry',
                                                           'Storm','Heavy Rain High Winds','V Wet','Rain Heavy',
                                                          'Sunny (hot!)','Heavy Thunder','Overcast/rain Heavy Showers',
                                                          'Too Cold','High Wind','Very Windy','Dry & Very Windy',
                                                              'Very Hot Dry','Wet/windy','Wet/v.windy','Wet Hail',
                                                               'Rain/hail','Foggy Wet',
                                                           'Wet Heavy Wind', 'Wet-windy','Hailstones',
                                                           'Short Hail Shower','Rain/sleet','Hail Stone',
                                                          'Hail','Showers/hailstone','Rain/hailstone','Cold/ Rain',
                                                              'Foggy','Wet & Windy','Wet + Windy','Rain/wind',
                                                              'Wet (windy)','Occasional Lt Snow Shrs',
                                                              'Wet And Very Windy','Dry Chill','Dry/cold','Dry Cold',
                                                               'Cold/sunny','Cold/cloudy',
                                                           'Dry Very Windy', 'Dry/windy','Windy','Cold','Cloudy/windy',
                                                           'Windy + Sunny','Sunsetting + Windy','Dark Cloudy',
                                                           'Dry V. Cold!','Very Cool','Dry & Windy',
                                                          'Dry but Cold or Wind','Dry/v. Windy','Dry Windy',
                                                          'Windy At First Then Sunny','Windy Dry','Cold Windy Dry',
                                                              'Cold/dry','Some Heavy Showers','Very Cold/dry',
                                                              'Foggy/v Cold','Hail Shower','Snowing','Wet/ Snowing',
                                                              'Heavy Snow','Dry/very Windy','Very Windy & Cold',
                                                              'Wet Light Hailstone','Heavy Showers Throughout Day',
                                                              'High Winds & Spits Of Rain','Fine V Cold',
                                                              'Dry (frost & Fog)','V Cold Showers','Cold/showery',
                                                              'Light Showers Inc Some Hail','Cloudy/hail','Cold Wind',
                                                              'Hot & Sunny','Hot And Sunny','Dry/windy/strong Wind',
                                                              'Hot + Humid','Very Cold Sunny But Windy'],'Dangerous weather')

# Consolidating 'Unknown'
cent_lon['weather'] = cent_lon['weather'].replace(['School Out','N/a','Unknown','Dark Sunny',
                                                              'Wed','Warm & Sunny But Windy & Cold'],'Unknown')

# Transforming Nan Values into Unknown
# Replacing nan with 'Unknown'
cent_lon.weather = cent_lon.weather.fillna('Unknown')

# Consolidating "Dry Dark"
cent_lon['weather'] = cent_lon['weather'].replace(['Dry Dark','Dry/dark','Dark/dry',
                                                           'Dark Dry'],'Unknown')

In [769]:
cent_lon.head()

Unnamed: 0,calendar_year,site_id,location,weather,direction,start_hour,start_minute,cycles_private,cycles_hire,cycles_total,day_survey,date_survey,survey_start_time,survey_end_time,day_part,day_part_timeslot
0,2014,CENCY001,Millbank (south of Thorney Street),Good,Northbound,6.0,0.0,0.0,0.0,0.0,Friday,2014-01-24,600,615,Early Morning,06:00-07:00
1,2014,CENCY001,Millbank (south of Thorney Street),Good,Northbound,6.0,15.0,15.0,0.0,15.0,Friday,2014-01-24,615,630,Early Morning,06:00-07:00
2,2014,CENCY001,Millbank (south of Thorney Street),Good,Northbound,6.0,30.0,35.0,0.0,35.0,Friday,2014-01-24,630,645,Early Morning,06:00-07:00
3,2014,CENCY001,Millbank (south of Thorney Street),Good,Northbound,6.0,45.0,59.0,2.0,61.0,Friday,2014-01-24,645,700,Early Morning,06:00-07:00
4,2014,CENCY001,Millbank (south of Thorney Street),Good,Northbound,7.0,0.0,73.0,0.0,73.0,Friday,2014-01-24,700,715,AM peak,07:00-10:00


### Inner London Dataset

In [770]:
# Printing shape of Inner london dataset
print(inn_lon.shape)

(615168, 13)


In [771]:
# Concise summary of Inner London Dataset
cent_lon.info

<bound method DataFrame.info of        calendar_year   site_id                            location weather  \
0               2014  CENCY001  Millbank (south of Thorney Street)    Good   
1               2014  CENCY001  Millbank (south of Thorney Street)    Good   
2               2014  CENCY001  Millbank (south of Thorney Street)    Good   
3               2014  CENCY001  Millbank (south of Thorney Street)    Good   
4               2014  CENCY001  Millbank (south of Thorney Street)    Good   
...              ...       ...                                 ...     ...   
758158          2021  CENCY702                           Haymarket    Good   
758159          2021  CENCY702                           Haymarket    Good   
758160          2021  CENCY702                           Haymarket    Good   
758161          2021  CENCY702                           Haymarket    Good   
758162          2021  CENCY702                           Haymarket    Good   

         direction  start_hour 

In [772]:
inn_lon.head()

Unnamed: 0,Survey wave (year),Site ID,Location,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
0,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,6.0,0.0,1.0,0.0,1.0
1,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,6.0,15.0,2.0,0.0,2.0
2,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,6.0,30.0,2.0,0.0,2.0
3,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,6.0,45.0,4.0,0.0,4.0
4,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,7.0,0.0,4.0,0.0,4.0


In [773]:
inn_lon.dtypes

Survey wave (year)            float64
Site ID                        object
Location                       object
Survey date                    object
Weather                        object
Time                           object
Period                         object
Direction                      object
Start hour                    float64
Start minute                  float64
Number of private cycles      float64
Number of cycle hire bikes    float64
Total cycles                  float64
dtype: object

In [774]:
# Finding number of missing values in Inner London
inn_lon.isnull().sum()

Survey wave (year)            91392
Site ID                       91392
Location                      91392
Survey date                   94144
Weather                       96066
Time                          91398
Period                        91398
Direction                     91392
Start hour                    91398
Start minute                  91398
Number of private cycles      91392
Number of cycle hire bikes    91392
Total cycles                  91392
dtype: int64

In [775]:
# Viewing Rows that contain Null Values
inn_lon[inn_lon['Location'].isnull()] #'Location' column is used randomly here, any column can be used here

Unnamed: 0,Survey wave (year),Site ID,Location,Survey date,Weather,Time,Period,Direction,Start hour,Start minute,Number of private cycles,Number of cycle hire bikes,Total cycles
523776,,,,,,,,,,,,,
523777,,,,,,,,,,,,,
523778,,,,,,,,,,,,,
523779,,,,,,,,,,,,,
523780,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
615163,,,,,,,,,,,,,
615164,,,,,,,,,,,,,
615165,,,,,,,,,,,,,
615166,,,,,,,,,,,,,


In [776]:
# Dropping Rows with all NUll values
inn_lon.drop(inn_lon.index[523776:615168], inplace=True)

In [777]:
inn_lon.isnull().sum()

Survey wave (year)               0
Site ID                          0
Location                         0
Survey date                   2752
Weather                       4674
Time                             6
Period                           6
Direction                        0
Start hour                       6
Start minute                     6
Number of private cycles         0
Number of cycle hire bikes       0
Total cycles                     0
dtype: int64

In [778]:
# Viewing Dataframe to see if there are any Null values left
inn_lon.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 523776 entries, 0 to 523775
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Survey wave (year)          523776 non-null  float64
 1   Site ID                     523776 non-null  object 
 2   Location                    523776 non-null  object 
 3   Survey date                 521024 non-null  object 
 4   Weather                     519102 non-null  object 
 5   Time                        523770 non-null  object 
 6   Period                      523770 non-null  object 
 7   Direction                   523776 non-null  object 
 8   Start hour                  523770 non-null  float64
 9   Start minute                523770 non-null  float64
 10  Number of private cycles    523776 non-null  float64
 11  Number of cycle hire bikes  523776 non-null  float64
 12  Total cycles                523776 non-null  float64
dtypes: float64(6),

In [779]:
# Dropping redundant columns

inn_lon = inn_lon.drop(['Start hour', 'Start minute'], axis=1) 

#### Renaming Column names

In [780]:
# Renaming Columns for inn_lon according to PEP-8 variable naming convention
inn_lon.rename(columns = {"Survey wave (year)": "calendar_year",
                          "Site ID": "site_id",
                          "Location":"location",
                          "Survey date":"survey_date",
                          "Weather":"weather",
                          "Time":"time",
                          "Period":"period",
                          "Direction":"direction",
                          "Number of private cycles":"cycles_private",
                          "Number of cycle hire bikes":"cycles_hire",
                          "Total cycles":"cycles_total"},
                        inplace=True)

# Viewing the Dataframe
inn_lon

Unnamed: 0,calendar_year,site_id,location,survey_date,weather,time,period,direction,cycles_private,cycles_hire,cycles_total
0,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0600 - 0615,Early Morning (06:00-07:00),Northbound,1.0,0.0,1.0
1,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0615 - 0630,Early Morning (06:00-07:00),Northbound,2.0,0.0,2.0
2,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0630 - 0645,Early Morning (06:00-07:00),Northbound,2.0,0.0,2.0
3,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0645 - 0700,Early Morning (06:00-07:00),Northbound,4.0,0.0,4.0
4,2015.0,INNCY001,Grove Road,"mer, 20/05/15",Dry,0700 - 0715,AM peak (07:00-10:00),Northbound,4.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
523771,2021.0,INNCY597,Augustus Road,"mer, 26/05/21",Dry,2045 - 2100,Evening (19:00-22:00),Westbound,3.0,0.0,3.0
523772,2021.0,INNCY597,Augustus Road,"mer, 26/05/21",Dry,2100 - 2115,Evening (19:00-22:00),Westbound,2.0,0.0,2.0
523773,2021.0,INNCY597,Augustus Road,"mer, 26/05/21",Dry,2115 - 2130,Evening (19:00-22:00),Westbound,2.0,0.0,2.0
523774,2021.0,INNCY597,Augustus Road,"mer, 26/05/21",Dry,2130 - 2145,Evening (19:00-22:00),Westbound,2.0,0.0,2.0


In [781]:
inn_lon.isnull().sum()

calendar_year        0
site_id              0
location             0
survey_date       2752
weather           4674
time                 6
period               6
direction            0
cycles_private       0
cycles_hire          0
cycles_total         0
dtype: int64

In [782]:
# Replacing Null Values in columns with 0's
# each column has been seperately addresses 
# as entire dataframe command (df.fillna(0)) was not working

# 'Survey date' column
inn_lon["survey_date"].fillna(0, inplace=True)

# 'Weather' column
inn_lon["weather"].fillna(0, inplace=True)

# 'time' column
inn_lon["time"].fillna(0, inplace=True)

# 'period' column
inn_lon["period"].fillna(0, inplace=True)

#### Segregating Survey Date Column

In [784]:
# Segregating days and date from 'Survey_Date' column

# making a temporary data frame for segregation
temp_df_date_1 = inn_lon["survey_date"].str.split("," , n = 1, expand = True)

# Adding new columns in main dataframe for days and dates of survey

# Day Column
inn_lon["day_survey"] = temp_df_date_1[0]

# Date Column
inn_lon["date_survey"] = temp_df_date_1[1]

In [787]:
# Dropping 'survey_date' column
inn_lon.drop(['survey_date'], axis=1, inplace=True)

In [796]:
# Importing Datetime module

from datetime import datetime

# converting 'survey date' column from object to date

inn_lon_V2['date_survey'] = pd.to_datetime(inn_lon_V2['date_survey'])

In [788]:
#### Replacing 'day_survey' data from French to English

# Friday
inn_lon.loc[inn_lon['day_survey'] == 'ven', 'day_survey'] = 'Friday'

# Monday
inn_lon.loc[inn_lon['day_survey'] == 'lun', 'day_survey'] = 'Monday'

# Tuesday
inn_lon.loc[inn_lon['day_survey'] == 'mar', 'day_survey'] = 'Tuesday'

# Wednesday
inn_lon.loc[inn_lon['day_survey'] == 'mer', 'day_survey'] = 'Wednesday'

# Thursday
inn_lon.loc[inn_lon['day_survey'] == 'jeu', 'day_survey'] = 'Thursday'

# Saturday
inn_lon.loc[inn_lon['day_survey'] == 'sam', 'day_survey'] = 'Saturday'

# Sunday
inn_lon.loc[inn_lon['day_survey'] == 'dim', 'day_survey'] = 'Sunday'

In [790]:
# Changing data types of columns from float to int

inn_lon['calendar_year'] = inn_lon['calendar_year'].astype(int)
inn_lon['cycles_private'] = inn_lon['cycles_private'].astype(int)
inn_lon['cycles_hire'] = inn_lon['cycles_hire'].astype(int)
inn_lon['cycles_total'] = inn_lon['cycles_total'].astype(int)

#### Segregating Time Column

In [791]:
# Segregating Start and End time  from 'time' column

# making a temporary data frame for segregation
temp_df_time_1 = inn_lon["time"].str.split("-" , n = 1, expand = True)

# Adding new columns in main dataframe for Start and End time of survey
# start time Column
inn_lon["survey_start_time"] = temp_df_time_1[0]

# end time Column
inn_lon["survey_end_time"] = temp_df_time_1[1]

#### Segregating Period Column

In [792]:
# Segregating 'Part of the day' and 'Part of the day time'  from 'time' column

# making a temporary data frame for segregation
temp_df_period_1 = inn_lon["period"].str.split("(" , n = 1, expand = True)

# Adding new columns in main dataframe for Start and End time of survey
# start time Column
inn_lon["day_part"] = temp_df_period_1[0]

# end time Column
inn_lon["day_part_timeslot"] = temp_df_period_1[1]

In [794]:
inn_lon["day_part_timeslot"] = inn_lon["day_part_timeslot"].str.replace(")", "", regex=True)

In [None]:
# Dropping Original combine columns
# Cpying into another dataframe (V2) in case need to revert
inn_lon_V2 = inn_lon.drop(['period', 'survey_date', 'time'], axis=1) 


#### Converting data types to DATE TIME format

In [797]:
# Replace some obvious duplications in weather <SAURAV's code>

# Rain
inn_lon['weather'] = inn_lon['weather'].replace(['Wet','Showers','Rain','Cloudy + Rain','Rain & Cloudy',
                                                          'Raining', 'Rain/cloudy','Wet/thunder','Light Showers',
                                                          'Rain/showers','W','Wey','Drizzle/shower','Rainy',
                                                          'wet','Cloudy with showers','Generally overcast brief shower'],'Rain')

# Sunny
inn_lon['weather'] = inn_lon['weather'].replace(['Sunny','Cloudy Sunny','Sun Setting','Good','Dry/sunny',
                                                          'Fine + Dry', 'Fine + Hot','Bright','Dry Hot!!',
                                                          'Dry & Sunny','Dry & Sun','Fine & Dry','Good/dry','Sun',
                                                          'Sunny Dry','Clear and Bright', 'Fine', 'Dry/good', 
                                                          'Fine/dry', 'Warm + Dry'],'Sunny')

# Overcast
inn_lon['weather'] = inn_lon['weather'].replace(['Cloudy','Sunny Overcast Sunny','Sunny/cloudy','Cloudy/rain/sunny',
                                                           'Cloudy + Sunny','Sunny + Cloudy', 'Cloudy/sunny',
                                                           'Bright + Cloudy','Cloudy/dry','Partly Sunny','Dull','Dry & Mild',
                                                           'Cloud','Overcast','Mild','Overcast (No Rain)',
                                                          'Cloudy bright intervals','Generally overcast',
                                                           'Cloudy with clear spells','Sunny Overcast','Dry',
                                                           'Dry/mild', 'Clear'],'Overcast')

# Heavy Rain
inn_lon['weather'] = inn_lon['weather'].replace(['Heavy Rain','Heavy Shower','Heavy Shr','Down Pour',
                                                           'Deluge','Heavy Showers', 'Shower','Rain Heavy Showers',
                                                           'Intermitent Showers','Thunder Lightening Rain!','Very Wet',
                                                           'V.wet','Heavy Downpour/rain','Showery','Wet Heavy Rain',
                                                           'Wet (heavy Rain)','Cloudy with clear spells','Sunny Overcast'],'Heavy Rain')

# Dry but Cold or Windy
inn_lon['weather'] = inn_lon['weather'].replace(['Dry Chill','Dry/cold','Dry Cold','Cold/sunny','Cold/cloudy',
                                                           'Dry Very Windy', 'Dry/windy','Windy','Cold','Cloudy/windy',
                                                           'Windy + Sunny','Sunsetting + Windy','Dark Cloudy',
                                                           'Dry V. Cold!','Very Cool'],'Dry but Cold or Windy')
# Wet & Windy or Cold
inn_lon['weather'] = inn_lon['weather'].replace(['Wet/windy','Wet/v.windy','Wet Hail','Rain/hail','Foggy Wet',
                                                           'Wet Heavy Wind', 'Wet-windy','Hailstones',
                                                           'Short Hail Shower'],'Wet and Windy or Cold')


# Light Rain
inn_lon['weather'] = inn_lon['weather'].replace(['Wet/dry','Intermittent Light Drizzle','Light Rain',
                                                           'Lt Rain','Drizzle','Intermittent Drizzle', 'Damp','Getting Dry',
                                                           'Dry & Wet'],'Light Rain')

# Dangerous weather
inn_lon['weather'] = inn_lon['weather'].replace(['Heavy Rain','Dry/wet Road','Dry With Wet Road',
                                                           'Hot','Snow!','Snow', 'Sleet','Very Hot',
                                                           'Dry (road Wet)'],'Dangerous weather')

# Replacing nan with 'Unknown'
inn_lon.weather = inn_lon.weather.fillna('Unknown')

# Consolidating "Dry Dark"
inn_lon['weather'] = inn_lon['weather'].replace(['Dry Dark','Dry/dark','Dark/dry',
                                                           'Dark Dry'],'Dry Dark')


#### Exploratory Data Analysis of Biking Sites

In [798]:
# Biking sites Dataset
# Loading csv into a dataframe
bike_site = pd.read_excel("Biking sites.xlsx")

# Displaying first 5 Rows of the dataset
bike_site.head()

Unnamed: 0,UnqID,ProgID,SurveyDescription,Easting,Northing,Location,Borough,Functional cycling area
0,CENCY001,CENCY,Central area cycle surveys,530251.49,178742.45,Millbank (south of Thorney Street),Westminster,Central
1,CENCY002,CENCY,Central area cycle surveys,533362.68,181824.45,Bishopsgate,City of London,Central
2,CENCY003,CENCY,Central area cycle surveys,532334.06,180520.37,Southwark Bridge,Southwark,Central
3,CENCY004,CENCY,Central area cycle surveys,532052.5,179677.64,Southwark Bridge Road,Southwark,Central
4,CENCY005,CENCY,Central area cycle surveys,533031.59,180213.46,Tooley Street,Southwark,Central


In [None]:
# Viewing the shape of the dataframe
print(bike_site.shape)

In [None]:
# Viewing data types of the dataframe
bike_site.dtypes

In [None]:
# FInding Null values
bike_site.isnull()

In [799]:
bike_site.isnull().sum()

UnqID                      0
ProgID                     0
SurveyDescription          0
Easting                    0
Northing                   0
Location                   0
Borough                    0
Functional cycling area    2
dtype: int64

In [803]:
# Filling NaNs with unknowns 
bike_site['Functional cycling area'] = bike_site['Functional cycling area'].fillna('Unknown')

In [808]:
# Changing Column names of Biking site
bike_site.rename(columns = {"UnqID": "site_id",
                          "ProgID": "prog_id",
                          "SurveyDescription":"survey_description",
                          "Easting":"easting",
                          "Northing":"northing",
                          "Location":"location",
                          "Borough":"borough",
                          "Functional cycling area":"func_cycle_area"
                         },inplace=True)

In [809]:
bike_site

Unnamed: 0,site_id,prog_id,survey_description,easting,northing,location,borough,func_cycle_area
0,CENCY001,CENCY,Central area cycle surveys,530251.49,178742.45,Millbank (south of Thorney Street),Westminster,Central
1,CENCY002,CENCY,Central area cycle surveys,533362.68,181824.45,Bishopsgate,City of London,Central
2,CENCY003,CENCY,Central area cycle surveys,532334.06,180520.37,Southwark Bridge,Southwark,Central
3,CENCY004,CENCY,Central area cycle surveys,532052.50,179677.64,Southwark Bridge Road,Southwark,Central
4,CENCY005,CENCY,Central area cycle surveys,533031.59,180213.46,Tooley Street,Southwark,Central
...,...,...,...,...,...,...,...,...
2018,QWPCY284,QWPCY,Quietway cycle surveys,516507.00,188467.00,Elmwood Avenue,Harrow,Outer
2019,QWPCY285,QWPCY,Quietway cycle surveys,515655.00,189672.00,Peel Road,Harrow,Outer
2020,QWPCY286,QWPCY,Quietway cycle surveys,515007.00,190213.00,Whitefriars Avenue,Harrow,Outer
2021,QWPCY287,QWPCY,Quietway cycle surveys,535456.00,186284.00,Chatsworth Road,Hackney,Inner


In [813]:
inn_lon_temp = inn_lon

pd.merge(inn_lon_temp, bike_site, on="site_id", how="left")

Unnamed: 0,calendar_year,site_id,location_x,weather,time,period,direction,cycles_private,cycles_hire,cycles_total,...,survey_end_time,day_part,day_part_timeslot,prog_id,survey_description,easting,northing,location_y,borough,func_cycle_area
0,2015,INNCY001,Grove Road,Overcast,0600 - 0615,Early Morning (06:00-07:00),Northbound,1,0,1,...,0615,Early Morning,06:00-07:00,INNCY,Inner area cycle surveys,536005.234595,183224.931664,Grove Road,Tower Hamlets,Inner
1,2015,INNCY001,Grove Road,Overcast,0615 - 0630,Early Morning (06:00-07:00),Northbound,2,0,2,...,0630,Early Morning,06:00-07:00,INNCY,Inner area cycle surveys,536005.234595,183224.931664,Grove Road,Tower Hamlets,Inner
2,2015,INNCY001,Grove Road,Overcast,0630 - 0645,Early Morning (06:00-07:00),Northbound,2,0,2,...,0645,Early Morning,06:00-07:00,INNCY,Inner area cycle surveys,536005.234595,183224.931664,Grove Road,Tower Hamlets,Inner
3,2015,INNCY001,Grove Road,Overcast,0645 - 0700,Early Morning (06:00-07:00),Northbound,4,0,4,...,0700,Early Morning,06:00-07:00,INNCY,Inner area cycle surveys,536005.234595,183224.931664,Grove Road,Tower Hamlets,Inner
4,2015,INNCY001,Grove Road,Overcast,0700 - 0715,AM peak (07:00-10:00),Northbound,4,0,4,...,0715,AM peak,07:00-10:00,INNCY,Inner area cycle surveys,536005.234595,183224.931664,Grove Road,Tower Hamlets,Inner
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523771,2021,INNCY597,Augustus Road,Overcast,2045 - 2100,Evening (19:00-22:00),Westbound,3,0,3,...,2100,Evening,19:00-22:00,INNCY,Inner area cycle surveys,524704.182822,173281.028077,Augustus Road,Wandsworth,Inner
523772,2021,INNCY597,Augustus Road,Overcast,2100 - 2115,Evening (19:00-22:00),Westbound,2,0,2,...,2115,Evening,19:00-22:00,INNCY,Inner area cycle surveys,524704.182822,173281.028077,Augustus Road,Wandsworth,Inner
523773,2021,INNCY597,Augustus Road,Overcast,2115 - 2130,Evening (19:00-22:00),Westbound,2,0,2,...,2130,Evening,19:00-22:00,INNCY,Inner area cycle surveys,524704.182822,173281.028077,Augustus Road,Wandsworth,Inner
523774,2021,INNCY597,Augustus Road,Overcast,2130 - 2145,Evening (19:00-22:00),Westbound,2,0,2,...,2145,Evening,19:00-22:00,INNCY,Inner area cycle surveys,524704.182822,173281.028077,Augustus Road,Wandsworth,Inner
