In [None]:
import pandas as pd

To begin, we read the Excel file to see all the sheets that are in our file and identify which would be relevant to answer the business questions.

In [None]:
# List every sheet in the workbook so the relevant ones can be identified.
# ExcelFile keeps the workbook open, so it is used as a context manager to
# make sure the file handle is released once the sheet names are printed.
with pd.ExcelFile('data/business-demographics.xlsx') as all_sheets:
    print(all_sheets.sheet_names)

# Survival Rates

To answer our first two business questions regarding survival rates, we have to analyse past survival rates of businesses created in different years. Thus, we will analyse the sheets with the survival rates of businesses.

In [None]:
# Read every '<year> Survival Rates' sheet into a dictionary keyed by year.
# Passing the full list of sheet names to a single read_excel call opens and
# parses the workbook once instead of once per year; read_excel then returns
# a dict keyed by sheet name, which is re-keyed by the year string here.
survival_years = [str(year) for year in range(2002, 2019)]
raw_survival_sheets = pd.read_excel(
    'data/business-demographics.xlsx',
    sheet_name=[year + ' Survival Rates' for year in survival_years],
)
survivalrates = {
    year: raw_survival_sheets[year + ' Survival Rates'] for year in survival_years
}

# Print 2002 Survival Rates
print(survivalrates['2002'].head(5))

# Print the number of rows and columns in the dataframe
print(survivalrates['2002'].shape)

At first glance, we identify several problems with the dataset and we can assume that these problems exist in all other sheets on survival rates.  

Firstly, the column names are mostly unnamed, so we know the first row of the dataset contains empty values. Thus, we should re-read the Excel file and skip the first row.


In [None]:
# Re-read every survival-rate sheet, skipping the first spreadsheet row so the
# following row is used as the header. All sheets are requested in a single
# read_excel call so the workbook is opened and parsed only once, and the
# dictionary is rebuilt from scratch so this cell gives the same result no
# matter how often it is re-run.
survival_sheet_names = {
    str(year): str(year) + ' Survival Rates' for year in range(2002, 2019)
}
reread_survival_sheets = pd.read_excel(
    'data/business-demographics.xlsx',
    sheet_name=list(survival_sheet_names.values()),
    skiprows=1,
)
survivalrates = {
    year: reread_survival_sheets[sheet_name]
    for year, sheet_name in survival_sheet_names.items()
}

print(survivalrates['2002'].head(5))

Secondly, we can drop the columns that represent the survival rates in numbers as this data is better represented as a percentage of business 'births' in that year. We can also drop the 'Births' column as it is not relevant to answering our business questions.

In [None]:
# Drop 'Births' and the survival columns expressed as counts, selected by
# their position in each sheet; the percentage columns are kept.
positions_to_drop = [2, 3, 5, 7, 9, 11]
for year in map(str, range(2002, 2019)):
    sheet = survivalrates[year]
    survivalrates[year] = sheet.drop(columns=sheet.columns[positions_to_drop])

print(survivalrates['2002'].head(5))

We can also rename the column names as we know from viewing the dataframe previously that the 'Per cent' columns are the survival rates in percentage for 1, 2, 3, 4 and 5 years in that order.

In [None]:
# Give the percentage columns descriptive names; the same mapping is applied
# to the sheet of every year.
percent_column_names = {
    'Per cent': '1 Year Survival in %',
    'Per cent.1': '2 Year Survival in %',
    'Per cent.2': '3 Year Survival in %',
    'Per cent.3': '4 Year Survival in %',
    'Per cent.4': '5 Year Survival in %',
}
for year in map(str, range(2002, 2019)):
    survivalrates[year] = survivalrates[year].rename(columns=percent_column_names)

print(survivalrates['2002'].head(5))

Thirdly, we observe that there is an empty row at the start of the data, so we want to test if there are any other empty rows or cells in the dataset.

In [None]:
# Count the missing values per column of the 2002 sheet
# (isnull is an alias of isna, so both reports show the same counts)
sheet_2002 = survivalrates['2002']
print(sheet_2002.isnull().sum())
print(sheet_2002.isna().sum())

# Show every row that has at least one missing cell
row_has_missing = sheet_2002.isna().any(axis=1)
missing_rows_na = sheet_2002.loc[row_has_missing]
print(missing_rows_na)

We observe that rows 0, 34, 37, 47 and 52 are empty, so we can drop these rows. To better understand the data and why there are empty rows, we can view all the data.

In [None]:
# Show the complete 2002 sheet to see where the empty rows sit
full_sheet_2002 = survivalrates['2002']
print(full_sheet_2002)

After row 33, the data is on different regions in London. As this information is already contained in the above rows which detail the survival rates in different boroughs of London, we can remove the rows below 33. We will also rename the column 'Area' to 'Borough' to better reflect the information.

In [None]:
# Only keep rows with borough information (positions 1 to 33) and rename
# 'Area' to 'Borough'.
# .copy() detaches the row slice from the frame it was taken from, so the
# in-place edits applied to these frames in later cells work on an independent
# frame instead of a slice (which is what triggers SettingWithCopyWarning).
for i in range(2002, 2019):
    borough_rows = survivalrates[str(i)].iloc[1:34].copy()
    survivalrates[str(i)] = borough_rows.rename(columns={'Area': 'Borough'})

print(survivalrates['2002'])

Logically, as we know that the data in this dataset only goes up to 2019, we know that later years would not have all the data for survival rates for surviving more than a year. Thus, we need to check the other sheets as well.

In [None]:
# Preview the first rows of the 2018 Survival Rates sheet
sheet_2018 = survivalrates['2018']
print(sheet_2018.head(5))

This means we need to remove the columns with ':'.

In [None]:
# Drop every column in which at least one cell holds the ':' placeholder
for year in map(str, range(2002, 2019)):
    sheet = survivalrates[year]
    has_placeholder = (sheet == ':').any()
    survivalrates[year] = sheet.drop(columns=sheet.columns[has_placeholder])

print(survivalrates['2017'].head(5))

As we are dealing with a lot of numbers, it is also important to check if the data type of each column is stored as numbers and not as strings. It would also be good to standardise that they are one decimal place.

In [None]:
# Check the data type of each column.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray 'None' line.
survivalrates['2002'].info(verbose=True)

# Round all numbers to one decimal place
for i in range(2002, 2019):
    survivalrates[str(i)] = survivalrates[str(i)].round(decimals=1)

print(survivalrates['2004'])

We will also set the index to 'Borough'. It will also be noted that the column 'Code' is kept as it might be useful in the future when merging data with map data etc.

In [None]:
# Use the 'Borough' column as the index of every survival-rate sheet
for year in map(str, range(2002, 2019)):
    survivalrates[year] = survivalrates[year].set_index('Borough')

# Active Enterprises
Another sheet in the Excel file that would be relevant to answering the business question is 'Active Enterprises by year'. 

In [None]:
# Load the 'Active Enterprises by year' sheet and show it
activeenterprises = pd.read_excel(
    'data/business-demographics.xlsx',
    sheet_name='Active Enterprises by year',
)

print(activeenterprises)

As the structure is similiar to that of the previous sheets with survival rates except without incorrect column names, we will perform the same data cleaning process to remove empty rows, create a new dataframe with only the relevant data, rename 'Area' to 'Borough' and convert the column names to strings. To confirm, we will also check for empty values.

In [None]:
# Only keep rows with borough information (positions 1 to 33) and rename
# 'Area' to 'Borough'.
# .copy() makes the result an independent frame rather than a slice of the
# sheet that was read, so the per-column assignments made in later cells do
# not raise SettingWithCopyWarning.
activeenterprises = (
    activeenterprises
    .iloc[1:34]
    .copy()
    .rename(columns={'Area': 'Borough'})
)

print(activeenterprises)

In [None]:
# Count the missing values per column
# (isnull is an alias of isna, so both reports show the same counts)
null_counts = activeenterprises.isnull().sum()
print(null_counts)
na_counts = activeenterprises.isna().sum()
print(na_counts)

In [None]:
# Convert the column names from integers to string
activeenterprises.columns = activeenterprises.columns.map(str)

# Check the data type of each column.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray 'None' line.
activeenterprises.info(verbose=True)


As the births are in float, we can convert them to integers as it makes more sense based on the context of the data.

In [None]:
# Cast every year column ('2002' to '2019') to int in a single astype call
year_columns = [str(year) for year in range(2002, 2020)]
activeenterprises = activeenterprises.astype({column: int for column in year_columns})

print(activeenterprises.head(5))

We will also set the index to 'Borough'.

In [None]:
# Use the 'Borough' column as the index
activeenterprises = activeenterprises.set_index('Borough')

# Enterprise Deaths
The last sheets that would be useful is 'Enterprise deaths by year' as we are concerned with looking into the death rate of enterprises. 

In [None]:
# Load the 'Enterprise deaths by year' sheet, then show its column names and
# first rows
enterprisedeaths = pd.read_excel(
    'data/business-demographics.xlsx',
    sheet_name='Enterprise deaths by year',
)

print(enterprisedeaths.columns)
print(enterprisedeaths.head(5))

Looking at the data, we observe that there are two types of columns, the numbers of businesses that 'died' in that year and this number as a percentage of the number of active enterprises. It is more useful to analyse the percentage of business that die in that year out of the active enterprises so that a more effective comparison between boroughs can be realised, thus, we will only keep this data.

In [None]:
# Read the sheet again, skipping the first spreadsheet row
enterprisedeaths = pd.read_excel(
    'data/business-demographics.xlsx',
    sheet_name='Enterprise deaths by year',
    skiprows=1,
)

# Rename 'Area' to 'Borough'
enterprisedeaths = enterprisedeaths.rename(columns={'Area': 'Borough'})

# Convert the column names from integers to string
enterprisedeaths.columns = enterprisedeaths.columns.map(str)

# Remove columns with number of businesses ('2004' to '2019')
enterprisedeaths = enterprisedeaths.drop(
    columns=[str(year) for year in range(2004, 2020)]
)

# Drop columns with only null values
enterprisedeaths = enterprisedeaths.dropna(axis=1, how='all')

# Rename columns by removing the literal '.1' suffix.
# str.strip('.1') is not suitable here: it treats its argument as a set of
# characters and removes every leading/trailing '.' and '1', which turns
# '2011.1' into '20'. An anchored replace removes exactly one trailing '.1',
# so '2011.1' correctly becomes '2011' (any later rename of a '20' column is
# then simply a no-op).
enterprisedeaths.columns = enterprisedeaths.columns.str.replace(
    r'\.1$', '', regex=True
)

# Only keep rows with borough information (positions 1 to 33)
enterprisedeaths = enterprisedeaths.iloc[1:34]

# Round all numbers to one decimal place
enterprisedeaths = enterprisedeaths.round(decimals=1)

# Set the index to the column 'Borough'
enterprisedeaths = enterprisedeaths.set_index('Borough')

print(enterprisedeaths.columns)
print(enterprisedeaths.head(5))

We observe that column name for the year 2011 was wrongly stripped as well, so we will fix that.

In [None]:
# Restore the year label: a column called '20' is renamed to '2011'
restored_names = {'20': '2011'}
enterprisedeaths = enterprisedeaths.rename(columns=restored_names)

print(enterprisedeaths.columns)
print(enterprisedeaths.head(5))

Now, as the data is prepared, we will save the edited file as a new Excel file.

In [None]:
# Write the cleaned frames to a new workbook, one sheet per frame
prepared_path = 'data/prepared_data1.xlsx'
with pd.ExcelWriter(prepared_path) as writer:
    activeenterprises.to_excel(writer, sheet_name='Active Enterprises by Year')
    enterprisedeaths.to_excel(writer, sheet_name='Death Rates in % by Year')
    for year in map(str, range(2002, 2019)):
        survivalrates[year].to_excel(writer, sheet_name=year + ' Survival Rates')