In [1]:
import pandas as pd
import os
import glob
import math
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

Create 2011 gentrification prediction dataset:

1. Get list of MSOAs for each city as base table
2. Merge census variables to base table
3. Merge additional tables to base table
4. On a copy of the base table from 1, label gentrification using 2011 & 2021 data (nevermind, doing PCA)

Freeman gentrification labeling method:

a census tract is (1) marked eligible if (a) housing construction is below a metropolitan median, and (b) income is below the median, and (c) the tract is located in a central city. It is then (2) marked as gentrified if, after the measurement period, (a) there is a greater increase in educational attainment compared to the median and (b) an increase in real housing prices


Data sources:

2011 construction:
no direct numbers at the lsoa or msoa level, only regional & country stats
can use c2011ks401ew & c2011ks401uk, which contains number of household dwellings, subtract 2001 numbers, to get new dwellings as a proxy

2011 income:
.\data\raw\govuk\1smallareaincomeestimatesdatatcm77420299.xls

2011 cities:
.\data\raw\geoportal\Middle_Layer_Super_Output_Area_(2011)_to_Major_Towns_and_Cities_(December_2015)_Lookup_in_England_and_Wales.csv

2011 educational attainment:
c2011ks501ew

2021 educational attainment:
c2021ts067

2011 house prices:
data\raw\ons\hpssamedianpricebymsoa.xlsx

2021 house prices:
data\raw\ons\hpssamedianpricebymsoa.xlsx

In [2]:
# print(os.getcwd())
# os.chdir("..")
# print(os.getcwd())

In [3]:
try:
    base2011_df = pd.read_csv(r'.\data\raw\geoportal\Lower_Layer_Super_Output_Area_(2011)_to_Major_Towns_and_Cities_(December_2015)_Lookup_in_England_and_Wales.csv')
except:  
    os.chdir("..")
    base2011_df = pd.read_csv(r'.\data\raw\geoportal\Lower_Layer_Super_Output_Area_(2011)_to_Major_Towns_and_Cities_(December_2015)_Lookup_in_England_and_Wales.csv')
base2011_df = base2011_df.dropna(axis=0, how='any', ignore_index=True)
base2011_df = base2011_df.drop(columns='FID')
base2011_df

Unnamed: 0,LSOA11CD,LSOA11NM,TCITY15CD,TCITY15NM
0,E01000001,City of London 001A,J01000055,London
1,E01000205,Barnet 035A,J01000055,London
2,E01000002,City of London 001B,J01000055,London
3,E01000003,City of London 001C,J01000055,London
4,E01000206,Barnet 033B,J01000055,London
...,...,...,...,...
16037,W01001703,Cardiff 045B,J01000020,Cardiff
16038,W01001857,Cardiff 042C,J01000020,Cardiff
16039,W01001858,Cardiff 034A,J01000020,Cardiff
16040,W01001859,Cardiff 034B,J01000020,Cardiff


In [4]:
metadata_2011 = pd.read_csv(r'.\data\raw\api\lsoa\counts\c2011\metadata.csv')
metadata_2011['clean_mnemonic'] = metadata_2011['mnemonic'].str.extract('(c2011ks\d*).*')
metadata_2011_mnemonics = list(set(metadata_2011['clean_mnemonic']))
len(metadata_2011_mnemonics)

35

In [5]:
c2011_dfs = []

for mnemonic in metadata_2011_mnemonics:

    root_dir = r'.\data\raw\api\lsoa\counts\c2011'
    mnemonic_tables = glob.glob(mnemonic+'*', root_dir=root_dir)
    mnemonic_dfs = []

    for table in mnemonic_tables:
        df = pd.read_csv(rf"{root_dir}\{table}")
        mnemonic_dfs.append(df)

    mnemonic_df = pd.concat(mnemonic_dfs)

    mnemonic_df = mnemonic_df.drop(columns=['Unnamed: 0', 'GEOGRAPHY_TYPE', 'GEOGRAPHY_TYPECODE', 'DATE'])
    mnemonic_df['OBS_VALUE'] = mnemonic_df['OBS_VALUE'].astype(float)
    mnemonic_df = mnemonic_df[mnemonic_df['OBS_VALUE'] > 0]
    cols = list(mnemonic_df.columns)
    var_col = [col for col in cols if col not in ['GEOGRAPHY_CODE', 'OBS_VALUE']][0]
    mnemonic_df = mnemonic_df.pivot_table(index='GEOGRAPHY_CODE', columns=var_col, values='OBS_VALUE', aggfunc='mean', fill_value=0)
    mnemonic_df = mnemonic_df.reset_index(drop=False)
    c2011_dfs.append(mnemonic_df)
c2011_dfs[0]

ValueError: No objects to concatenate

In [33]:
# c2011_dfs[0][c2011_dfs[0]['GEOGRAPHY_CODE'] == 'E02000053']

CELL_NAME,GEOGRAPHY_CODE,All categories: Ethnic group,All usual residents,Asian / Asian British: Bangladeshi,Asian / Asian British: Chinese,Asian / Asian British: Indian,Asian / Asian British: Other Asian,Asian / Asian British: Pakistani,Asian/Asian British,Asian/Asian British: Bangladeshi,...,Mixed/multiple ethnic groups: White and Black Caribbean,Other Ethnic Group,Other ethnic group,Other ethnic group: Any other ethnic group,Other ethnic group: Arab,White,White: English/Welsh/Scottish/Northern Irish/British,White: Gypsy or Irish Traveller,White: Irish,White: Other White
51,E02000053,8697.0,8697.0,41.0,570.0,992.0,1113.0,420.0,3136.0,41.0,...,74.0,459.0,459.0,224.0,235.0,3562.5,1783.0,1.0,246.0,1533.0


In [34]:
c2011_census_df = base2011_df.copy()
for df in c2011_dfs:
    c2011_census_df = c2011_census_df.merge(df, how='left', left_on='LSOA11CD', right_on='GEOGRAPHY_CODE', suffixes=('', '_y'))
    c2011_census_df_cols = list(c2011_census_df.columns)
    c2011_census_df_dup_cols = [col for col in c2011_census_df_cols if '_y' in col]
    c2011_census_df_dup_cols.append('GEOGRAPHY_CODE')
    c2011_census_df = c2011_census_df.drop(columns=c2011_census_df_dup_cols)
c2011_census_df = c2011_census_df.dropna(axis=1, how='all')
c2011_census_df

Unnamed: 0,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,All categories: Ethnic group,All usual residents,Asian / Asian British: Bangladeshi,Asian / Asian British: Chinese,Asian / Asian British: Indian,Asian / Asian British: Other Asian,...,Not living in a couple: Divorced or formerly in a same-sex civil partnership which is now legally dissolved,Not living in a couple: Married or in a registered same-sex civil partnership,Not living in a couple: Separated (but still legally married or still legally in a same-sex civil partnership),Not living in a couple: Single (never married or never registered a same-sex civil partnership),Not living in a couple: Widowed or surviving partner from a same-sex civil partnership,All usual residents aged 3 and over,Can speak and read but cannot write Welsh,Can speak but cannot read or write Welsh,No skills in Welsh,Other combination of skills in Welsh
0,E02000053,Barnet 030,J01000055,London,8697.0,8697.0,41.0,570.0,992.0,1113.0,...,348.0,317.0,191.0,2370.0,268.0,,,,,
1,E02000209,Croydon 016,J01000055,London,8414.0,8414.0,75.0,82.0,1216.0,1488.0,...,352.0,221.0,202.0,2091.0,268.0,,,,,
2,E02000054,Barnet 031,J01000055,London,7206.0,7206.0,58.0,383.0,704.0,519.0,...,374.0,232.0,131.0,2019.0,218.0,,,,,
3,E02000055,Barnet 032,J01000055,London,11130.0,11130.0,57.0,485.0,924.0,711.0,...,480.0,324.0,161.0,3247.0,459.0,,,,,
4,E02000056,Barnet 033,J01000055,London,9212.0,9212.0,18.0,95.0,339.0,365.0,...,325.0,194.0,104.0,1794.0,345.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,E02003999,Carlisle 013,J01000021,Carlisle,9380.0,9380.0,1.0,65.0,20.0,21.0,...,371.0,57.0,113.0,1397.0,604.0,,,,,
3256,E02005068,Maidstone 001,J01000022,Chatham,7042.0,7042.0,0.0,12.0,49.0,23.0,...,301.0,65.0,98.0,1108.0,212.0,,,,,
3257,E02004497,Chelmsford 013,J01000023,Chelmsford,7848.0,7848.0,9.0,37.0,87.0,26.0,...,348.0,42.0,91.0,1181.0,463.0,,,,,
3258,E02004493,Chelmsford 009,J01000023,Chelmsford,10127.0,10127.0,104.0,108.0,169.0,118.0,...,475.0,116.0,148.0,2243.0,388.0,,,,,


In [35]:
income2011_df = pd.read_excel(r'.\data\raw\govuk\1smallareaincomeestimatesdatatcm77420299.xls', sheet_name='Total weekly income', header=4)
income2011_df = income2011_df[['MSOA code', 'Total weekly income (£)']]
income2011_df = income2011_df.dropna(axis=0, how='any')
income2011_df

c2011_income_df = c2011_census_df.copy()
c2011_income_df = c2011_income_df.merge(income2011_df, how='left', left_on='MSOA11CD', right_on='MSOA code')
c2011_income_df = c2011_income_df.drop(columns='MSOA code')
c2011_income_df

Unnamed: 0,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,All categories: Ethnic group,All usual residents,Asian / Asian British: Bangladeshi,Asian / Asian British: Chinese,Asian / Asian British: Indian,Asian / Asian British: Other Asian,...,Not living in a couple: Married or in a registered same-sex civil partnership,Not living in a couple: Separated (but still legally married or still legally in a same-sex civil partnership),Not living in a couple: Single (never married or never registered a same-sex civil partnership),Not living in a couple: Widowed or surviving partner from a same-sex civil partnership,All usual residents aged 3 and over,Can speak and read but cannot write Welsh,Can speak but cannot read or write Welsh,No skills in Welsh,Other combination of skills in Welsh,Total weekly income (£)
0,E02000053,Barnet 030,J01000055,London,8697.0,8697.0,41.0,570.0,992.0,1113.0,...,317.0,191.0,2370.0,268.0,,,,,,810.0
1,E02000209,Croydon 016,J01000055,London,8414.0,8414.0,75.0,82.0,1216.0,1488.0,...,221.0,202.0,2091.0,268.0,,,,,,810.0
2,E02000054,Barnet 031,J01000055,London,7206.0,7206.0,58.0,383.0,704.0,519.0,...,232.0,131.0,2019.0,218.0,,,,,,950.0
3,E02000055,Barnet 032,J01000055,London,11130.0,11130.0,57.0,485.0,924.0,711.0,...,324.0,161.0,3247.0,459.0,,,,,,870.0
4,E02000056,Barnet 033,J01000055,London,9212.0,9212.0,18.0,95.0,339.0,365.0,...,194.0,104.0,1794.0,345.0,,,,,,1300.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,E02003999,Carlisle 013,J01000021,Carlisle,9380.0,9380.0,1.0,65.0,20.0,21.0,...,57.0,113.0,1397.0,604.0,,,,,,690.0
3256,E02005068,Maidstone 001,J01000022,Chatham,7042.0,7042.0,0.0,12.0,49.0,23.0,...,65.0,98.0,1108.0,212.0,,,,,,1020.0
3257,E02004497,Chelmsford 013,J01000023,Chelmsford,7848.0,7848.0,9.0,37.0,87.0,26.0,...,42.0,91.0,1181.0,463.0,,,,,,960.0
3258,E02004493,Chelmsford 009,J01000023,Chelmsford,10127.0,10127.0,104.0,108.0,169.0,118.0,...,116.0,148.0,2243.0,388.0,,,,,,940.0


In [36]:
housing2011_df = pd.read_excel(r'.\data\raw\ons\hpssamedianpricebymsoa.xlsx', sheet_name='1a', header=2)
housing2011_df = housing2011_df[['MSOA code', 'Year ending Dec 2011']]
housing2011_df = housing2011_df.rename(columns={'Year ending Dec 2011': 'Median house price'})
housing2011_df

c2011_housing_df = c2011_income_df.copy()
c2011_housing_df = c2011_housing_df.merge(housing2011_df, how='left', left_on='MSOA11CD', right_on='MSOA code')
c2011_housing_df = c2011_housing_df.drop(columns='MSOA code')
c2011_housing_df

Unnamed: 0,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,All categories: Ethnic group,All usual residents,Asian / Asian British: Bangladeshi,Asian / Asian British: Chinese,Asian / Asian British: Indian,Asian / Asian British: Other Asian,...,Not living in a couple: Separated (but still legally married or still legally in a same-sex civil partnership),Not living in a couple: Single (never married or never registered a same-sex civil partnership),Not living in a couple: Widowed or surviving partner from a same-sex civil partnership,All usual residents aged 3 and over,Can speak and read but cannot write Welsh,Can speak but cannot read or write Welsh,No skills in Welsh,Other combination of skills in Welsh,Total weekly income (£),Median house price
0,E02000053,Barnet 030,J01000055,London,8697.0,8697.0,41.0,570.0,992.0,1113.0,...,191.0,2370.0,268.0,,,,,,810.0,
1,E02000209,Croydon 016,J01000055,London,8414.0,8414.0,75.0,82.0,1216.0,1488.0,...,202.0,2091.0,268.0,,,,,,810.0,191750
2,E02000054,Barnet 031,J01000055,London,7206.0,7206.0,58.0,383.0,704.0,519.0,...,131.0,2019.0,218.0,,,,,,950.0,330000
3,E02000055,Barnet 032,J01000055,London,11130.0,11130.0,57.0,485.0,924.0,711.0,...,161.0,3247.0,459.0,,,,,,870.0,325000
4,E02000056,Barnet 033,J01000055,London,9212.0,9212.0,18.0,95.0,339.0,365.0,...,104.0,1794.0,345.0,,,,,,1300.0,890000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,E02003999,Carlisle 013,J01000021,Carlisle,9380.0,9380.0,1.0,65.0,20.0,21.0,...,113.0,1397.0,604.0,,,,,,690.0,175000
3256,E02005068,Maidstone 001,J01000022,Chatham,7042.0,7042.0,0.0,12.0,49.0,23.0,...,98.0,1108.0,212.0,,,,,,1020.0,196250
3257,E02004497,Chelmsford 013,J01000023,Chelmsford,7848.0,7848.0,9.0,37.0,87.0,26.0,...,91.0,1181.0,463.0,,,,,,960.0,242000
3258,E02004493,Chelmsford 009,J01000023,Chelmsford,10127.0,10127.0,104.0,108.0,169.0,118.0,...,148.0,2243.0,388.0,,,,,,940.0,202750


In [37]:
c2011_df = c2011_housing_df.copy()
c2011_df = c2011_df.dropna(axis=1, how='all')
c2011_df["Central and South America"] = c2011_df["South America"] + c2011_df["Central America"]
c2011_df["3 or more cars or vans in household"] = c2011_df["3 cars or vans in household"] + c2011_df["4 or more cars or vans in household"]
c2011_df["Married or in a registered civil partnership"] = c2011_df["Married"] + c2011_df["In a registered same-sex civil partnership"]
c2011_df["Age 15 to 19 years"] = c2011_df["Age 15"] + c2011_df["Age 16 to 17"] + c2011_df["Age 18 to 19"]
c2011_df["Aged 5 to 9 years"] = c2011_df["Age 5 to 7"] + c2011_df["Age 8 to 9"]
c2011_df["Aged 85 years and over"] = c2011_df["Age 85 to 89"] + c2011_df["Age 90 and over"]
c2011_df["Age: 16 years and over"] = c2011_df["All usual residents aged 16 to 74"] + c2011_df["Age 75 to 84"] + c2011_df["Aged 85 years and over"]
c2011_df.to_csv(r'.\data\clean\c2011.csv', index=False)

In [38]:
base2001_df = pd.read_csv(r'.\data\raw\geoportal\Middle_Layer_Super_Output_Area_(2001)_to_Middle_Layer_Super_Output_Area_(2011)_to_Local_Authority_District_(2011)_Lookup_in_England_and_Wales.csv')
base2001_df = base2001_df[['MSOA01CD', 'MSOA11CD']]
base2001_df = base2001_df.merge(base2011_df, on='MSOA11CD')
base2001_df

Unnamed: 0,MSOA01CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM
0,E02000001,E02000001,City of London 001,J01000055,London
1,E02000101,E02000101,Brent 009,J01000055,London
2,E02000002,E02000002,Barking and Dagenham 001,J01000055,London
3,E02000003,E02000003,Barking and Dagenham 002,J01000055,London
4,E02000102,E02000102,Brent 010,J01000055,London
...,...,...,...,...,...
3271,W02000410,W02000410,Cardiff 044,J01000020,Cardiff
3272,W02000411,W02000411,Cardiff 045,J01000020,Cardiff
3273,W02000412,W02000412,Cardiff 046,J01000020,Cardiff
3274,W02000413,W02000422,Cardiff 048,J01000020,Cardiff


In [39]:
metadata_2001 = pd.read_csv(r'.\data\raw\api\msoa\c2001\metadata.csv')
metadata_2001['clean_mnemonic'] = metadata_2001['mnemonic'].str.extract('(ks.*)')
metadata_2001 = metadata_2001[metadata_2001['geoglevel'].str.contains('msoa')]
metadata_2001_mnemonics = list(set(metadata_2001['clean_mnemonic']))
len(metadata_2001_mnemonics)

32

In [40]:
c2001_dfs = []

for mnemonic in metadata_2001_mnemonics:

    root_dir = r'.\data\raw\api\msoa\c2001'
    mnemonic_tables = glob.glob(mnemonic+'*', root_dir=root_dir)
    mnemonic_dfs = []

    for table in mnemonic_tables:
        df = pd.read_csv(rf"{root_dir}\{table}")
        mnemonic_dfs.append(df)

    mnemonic_df = pd.concat(mnemonic_dfs)

    mnemonic_df = mnemonic_df.drop(columns=['Unnamed: 0', 'GEOGRAPHY_TYPE', 'GEOGRAPHY_TYPECODE', 'DATE'])
    mnemonic_df['OBS_VALUE'] = mnemonic_df['OBS_VALUE'].astype(float)
    mnemonic_df = mnemonic_df[mnemonic_df['OBS_VALUE'] > 0]
    cols = list(mnemonic_df.columns)
    var_col = [col for col in cols if col not in ['GEOGRAPHY_CODE', 'OBS_VALUE']][0]
    mnemonic_df = mnemonic_df.pivot_table(index='GEOGRAPHY_CODE', columns=var_col, values='OBS_VALUE', aggfunc='mean', fill_value=0)
    mnemonic_df = mnemonic_df.reset_index(drop=False)
    c2001_dfs.append(mnemonic_df)
c2001_dfs[0]

C_INDGPUK11_NAME,GEOGRAPHY_CODE,"A Agriculture, hunting, forestry",All categories: Industry,B Fishing,C Mining and quarrying,D Manufacturing,"E Electricity, gas and water supply",F Constructiion,"G Wholesale and retail trade, repair of motor vehicles",H Hotels and restaurants,I Transport storage and communications,J Financial Intermediation,"K Real estate,renting and business activities","L Public administration and defence, social security",M Education,N Health and social work,"O,P,Q Other"
0,E02000001,5.0,4290.0,0.0,14.0,202.0,8.0,44.0,224.0,167.0,181.0,757.0,1470.0,255.0,229.0,341.0,393.0
1,E02000002,8.0,2170.0,0.0,3.0,193.0,5.0,225.0,420.0,85.0,230.0,156.0,286.0,88.0,116.0,246.0,109.0
2,E02000003,15.0,4276.0,0.0,3.0,391.0,24.0,407.0,709.0,129.0,401.0,435.0,608.0,239.0,259.0,421.0,235.0
3,E02000004,4.0,2813.0,0.0,3.0,255.0,17.0,274.0,462.0,84.0,242.0,343.0,341.0,152.0,201.0,292.0,143.0
4,E02000005,13.0,3309.0,3.0,3.0,367.0,15.0,360.0,580.0,123.0,395.0,246.0,353.0,162.0,210.0,320.0,159.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7189,W02000409,6.0,2368.0,0.0,0.0,245.0,32.0,167.0,465.0,148.0,178.0,117.0,233.0,164.0,171.0,280.0,162.0
7190,W02000410,9.0,2652.0,3.0,0.0,301.0,59.0,145.0,481.0,212.0,279.0,112.0,295.0,147.0,158.0,243.0,208.0
7191,W02000411,4.0,2207.0,0.0,0.0,232.0,33.0,171.0,502.0,129.0,179.0,112.0,223.0,98.0,116.0,269.0,139.0
7192,W02000412,6.0,2440.0,0.0,3.0,287.0,54.0,154.0,457.0,150.0,224.0,112.0,276.0,157.0,168.0,238.0,154.0


In [41]:
c2001_census_df = base2001_df.copy()
for df in c2001_dfs:
    c2001_census_df = c2001_census_df.merge(df, how='left', left_on='MSOA01CD', right_on='GEOGRAPHY_CODE', suffixes=('', '_y'))
    c2001_census_df_cols = list(c2001_census_df.columns)
    c2001_census_df_dup_cols = [col for col in c2001_census_df_cols if '_y' in col]
    c2001_census_df_dup_cols.append('GEOGRAPHY_CODE')
    c2001_census_df = c2001_census_df.drop(columns=c2001_census_df_dup_cols)
c2001_census_df = c2001_census_df.dropna(axis=1, how='all')
c2001_census_df

Unnamed: 0,MSOA01CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,"A Agriculture, hunting, forestry",All categories: Industry,B Fishing,C Mining and quarrying,D Manufacturing,...,All household spaces - with no residents - second residence/holiday accommodation,All household spaces - with no residents - vacant,All household spaces - with residents,Caravan or other mobile or temporary structure,"Flat, maisonette or apartment - in a commercial building","Flat, maisonette or apartment - in a purpose built block of flats or tenement","Flat, maisonette or apartment - part of a converted or shared house (includes bed-sit)",Whole house or bungalow - detatched,Whole house or bungalow - semi-detatched,Whole house or bungalow - terraced
0,E02000001,E02000001,City of London 001,J01000055,London,5.0,4290.0,0.0,14.0,202.0,...,483.0,203.0,4338.0,3.0,278.0,4496.0,147.0,15.0,13.0,74.0
1,E02000101,E02000101,Brent 009,J01000055,London,8.0,3041.0,0.0,9.0,227.0,...,3.0,175.0,2857.0,0.0,48.0,1447.0,201.0,464.0,609.0,264.0
2,E02000002,E02000002,Barking and Dagenham 001,J01000055,London,8.0,2170.0,0.0,3.0,193.0,...,5.0,65.0,2734.0,3.0,17.0,975.0,32.0,89.0,759.0,930.0
3,E02000003,E02000003,Barking and Dagenham 002,J01000055,London,15.0,4276.0,0.0,3.0,391.0,...,0.0,66.0,3784.0,3.0,74.0,819.0,144.0,127.0,896.0,1790.0
4,E02000102,E02000102,Brent 010,J01000055,London,6.0,3239.0,0.0,4.0,292.0,...,0.0,76.0,2727.0,7.0,15.0,664.0,270.0,122.0,531.0,1195.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3271,W02000410,W02000410,Cardiff 044,J01000020,Cardiff,9.0,2652.0,3.0,0.0,301.0,...,3.0,97.0,2753.0,3.0,73.0,163.0,228.0,73.0,180.0,2132.0
3272,W02000411,W02000411,Cardiff 045,J01000020,Cardiff,4.0,2207.0,0.0,0.0,232.0,...,3.0,86.0,2420.0,0.0,15.0,530.0,15.0,211.0,1126.0,609.0
3273,W02000412,W02000412,Cardiff 046,J01000020,Cardiff,6.0,2440.0,0.0,3.0,287.0,...,3.0,58.0,2489.0,0.0,11.0,340.0,151.0,134.0,578.0,1334.0
3274,W02000413,W02000422,Cardiff 048,J01000020,Cardiff,6.0,2529.0,0.0,3.0,277.0,...,37.0,191.0,2957.0,3.0,38.0,1680.0,114.0,79.0,387.0,884.0


In [42]:
housing2001_df = pd.read_excel(r'.\data\raw\ons\hpssamedianpricebymsoa.xlsx', sheet_name='1a', header=2)
housing2001_df = housing2001_df[['MSOA code', 'Year ending Dec 2001']]
housing2001_df = housing2001_df.rename(columns={'Year ending Dec 2001': 'Median house price'})
housing2001_df

c2001_housing_df = c2001_census_df.copy()
c2001_housing_df = c2001_housing_df.merge(housing2001_df, how='left', left_on='MSOA01CD', right_on='MSOA code')
c2001_housing_df = c2001_housing_df.drop(columns='MSOA code')
c2001_housing_df

Unnamed: 0,MSOA01CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,"A Agriculture, hunting, forestry",All categories: Industry,B Fishing,C Mining and quarrying,D Manufacturing,...,All household spaces - with no residents - vacant,All household spaces - with residents,Caravan or other mobile or temporary structure,"Flat, maisonette or apartment - in a commercial building","Flat, maisonette or apartment - in a purpose built block of flats or tenement","Flat, maisonette or apartment - part of a converted or shared house (includes bed-sit)",Whole house or bungalow - detatched,Whole house or bungalow - semi-detatched,Whole house or bungalow - terraced,Median house price
0,E02000001,E02000001,City of London 001,J01000055,London,5.0,4290.0,0.0,14.0,202.0,...,203.0,4338.0,3.0,278.0,4496.0,147.0,15.0,13.0,74.0,237500
1,E02000101,E02000101,Brent 009,J01000055,London,8.0,3041.0,0.0,9.0,227.0,...,175.0,2857.0,0.0,48.0,1447.0,201.0,464.0,609.0,264.0,130000
2,E02000002,E02000002,Barking and Dagenham 001,J01000055,London,8.0,2170.0,0.0,3.0,193.0,...,65.0,2734.0,3.0,17.0,975.0,32.0,89.0,759.0,930.0,95000
3,E02000003,E02000003,Barking and Dagenham 002,J01000055,London,15.0,4276.0,0.0,3.0,391.0,...,66.0,3784.0,3.0,74.0,819.0,144.0,127.0,896.0,1790.0,115000
4,E02000102,E02000102,Brent 010,J01000055,London,6.0,3239.0,0.0,4.0,292.0,...,76.0,2727.0,7.0,15.0,664.0,270.0,122.0,531.0,1195.0,115500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3271,W02000410,W02000410,Cardiff 044,J01000020,Cardiff,9.0,2652.0,3.0,0.0,301.0,...,97.0,2753.0,3.0,73.0,163.0,228.0,73.0,180.0,2132.0,65000
3272,W02000411,W02000411,Cardiff 045,J01000020,Cardiff,4.0,2207.0,0.0,0.0,232.0,...,86.0,2420.0,0.0,15.0,530.0,15.0,211.0,1126.0,609.0,45000
3273,W02000412,W02000412,Cardiff 046,J01000020,Cardiff,6.0,2440.0,0.0,3.0,287.0,...,58.0,2489.0,0.0,11.0,340.0,151.0,134.0,578.0,1334.0,71000
3274,W02000413,W02000422,Cardiff 048,J01000020,Cardiff,6.0,2529.0,0.0,3.0,277.0,...,191.0,2957.0,3.0,38.0,1680.0,114.0,79.0,387.0,884.0,


In [43]:
c2001_df = c2001_housing_df.copy()
c2001_df = c2001_df.dropna(axis=1, how='all')
c2001_df["Provides no unpaid care"] = c2001_df["All usual residents"] - c2001_df["All people who provide unpaid care"]
c2001_df["3 or more cars or vans in household"] = c2001_df["3 cars or vans in household"] + c2001_df["4 or more cars or vans in household"]
c2001_df["Married or in a registered civil partnership"] = c2001_df["Married (first marriage)"] + c2001_df["Re-married"]
c2001_df["Private rented: Other"] = c2001_df["Rented from a housing association/registered social landlord"] + c2001_df["Rented from council(local authority)"]
c2001_df["Age 15 to 19 years"] = c2001_df["Age 15"] + c2001_df["Age 16 to 17"] + c2001_df["Age 18 to 19"]
c2001_df["Aged 5 to 9 years"] = c2001_df["Age 5 to 7"] + c2001_df["Age 8 to 9"]
c2001_df["Aged 85 years and over"] = c2001_df["Age 85 to 89"] + c2001_df["Age 90 and over"]
c2001_df["Age: 16 years and over"] = c2001_df["All usual residents aged 16 to 74"] + c2001_df["Age 75 to 84"] + c2001_df["Aged 85 years and over"]
c2001_df.to_csv(r'.\data\clean\c2001.csv', index=False)

In [44]:
base2021_df = pd.read_csv(r'.\data\raw\geoportal\MSOA_(2011)_to_MSOA_(2021)_to_Local_Authority_District_(2022)_Lookup_for_England_and_Wales_(Version_2).csv')
base2021_df = base2021_df[['MSOA21CD', 'MSOA11CD']]
base2021_df = base2021_df.merge(base2011_df, on='MSOA11CD')
base2021_df

Unnamed: 0,MSOA21CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM
0,E02000308,E02000308,Enfield 032,J01000055,London
1,E02000309,E02000309,Enfield 033,J01000055,London
2,E02000311,E02000311,Enfield 035,J01000055,London
3,E02000312,E02000312,Enfield 036,J01000055,London
4,E02000313,E02000313,Greenwich 001,J01000055,London
...,...,...,...,...,...
3255,W02000193,W02000193,Swansea 026,J01000098,Swansea
3256,W02000194,W02000194,Swansea 027,J01000098,Swansea
3257,W02000196,W02000196,Swansea 029,J01000098,Swansea
3258,W02000198,W02000198,Swansea 031,J01000098,Swansea


In [45]:
metadata_2021 = pd.read_csv(r'.\data\raw\api\msoa\c2021\metadata.csv')
metadata_2021['clean_mnemonic'] = metadata_2021['mnemonic'].str.extract('(c2021ts.*)')
metadata_2021 = metadata_2021[metadata_2021['geoglevel'].str.contains('msoa')]
metadata_2021_mnemonics = list(set(metadata_2021['clean_mnemonic']))
len(metadata_2021_mnemonics)

68

In [46]:
c2021_dfs = []

for mnemonic in metadata_2021_mnemonics:
    root_dir = r'.\data\raw\api\msoa\c2021'
    mnemonic_tables = glob.glob(mnemonic+'*', root_dir=root_dir)
    mnemonic_dfs = []

    for table in mnemonic_tables:
        df = pd.read_csv(rf"{root_dir}\{table}")
        mnemonic_dfs.append(df)

    mnemonic_df = pd.concat(mnemonic_dfs)

    mnemonic_df = mnemonic_df.drop(columns=['Unnamed: 0', 'GEOGRAPHY_TYPE', 'GEOGRAPHY_TYPECODE', 'DATE'])
    mnemonic_df['OBS_VALUE'] = mnemonic_df['OBS_VALUE'].astype(float)
    mnemonic_df = mnemonic_df[mnemonic_df['OBS_VALUE'] > 0]
    cols = list(mnemonic_df.columns)
    var_col = [col for col in cols if col not in ['GEOGRAPHY_CODE', 'OBS_VALUE']][0]
    mnemonic_df = mnemonic_df.pivot_table(index='GEOGRAPHY_CODE', columns=var_col, values='OBS_VALUE', aggfunc='mean', fill_value=0)
    mnemonic_df = mnemonic_df.reset_index(drop=False)
    c2021_dfs.append(mnemonic_df)
c2021_dfs[37]

C2021_CARS_5_NAME,GEOGRAPHY_CODE,1 car or van in household,2 cars or vans in household,3 or more cars or vans in household,No cars or vans in household,Total: All households
0,E02000001,954.0,123.0,43.0,3793.0,4913.0
1,E02000002,1273.0,547.0,136.0,914.0,2870.0
2,E02000003,1841.0,719.0,214.0,1156.0,3930.0
3,E02000004,984.0,573.0,208.0,544.0,2309.0
4,E02000005,1729.0,713.0,170.0,1016.0,3628.0
...,...,...,...,...,...,...
7259,W02000424,1148.0,1076.0,469.0,277.0,2970.0
7260,W02000425,993.0,1154.0,486.0,240.0,2873.0
7261,W02000426,1892.0,1327.0,450.0,725.0,4394.0
7262,W02000427,2162.0,1207.0,463.0,1055.0,4887.0


In [47]:
c2021_census_df = base2021_df.copy()
for df in c2021_dfs:
    c2021_census_df = c2021_census_df.merge(df, how='left', left_on='MSOA21CD', right_on='GEOGRAPHY_CODE', suffixes=('', '_y'))
    c2021_census_df_cols = list(c2021_census_df.columns)
    c2021_census_df_dup_cols = [col for col in c2021_census_df_cols if '_y' in col]
    c2021_census_df_dup_cols.append('GEOGRAPHY_CODE')
    c2021_census_df = c2021_census_df.drop(columns=c2021_census_df_dup_cols)
c2021_census_df = c2021_census_df.dropna(axis=1, how='all')
c2021_census_df

Unnamed: 0,MSOA21CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,Has not previously served in any UK armed forces,Previously served in UK regular armed forces,Previously served in UK reserve armed forces,Previously served in both regular and reserve UK armed forces,Total: All usual residents aged 16 and over,...,922 Elementary Cleaning Occupations,923 Elementary Security Occupations,924 Elementary Sales Occupations,925 Elementary Storage Occupations,926 Other Elementary Services Occupations,Household is deprived in four dimensions,Household is deprived in one dimension,Household is deprived in three dimensions,Household is deprived in two dimensions,Household is not deprived in any dimension
0,E02000308,E02000308,Enfield 032,J01000055,London,6970.0,46.0,26.0,7.0,7049.0,...,179.0,33.0,8.0,61.0,159.0,11.0,1117.0,125.0,496.0,1477.0
1,E02000309,E02000309,Enfield 033,J01000055,London,8720.0,44.0,29.0,4.0,8797.0,...,347.0,81.0,13.0,94.0,178.0,26.0,1599.0,272.0,809.0,1281.0
2,E02000311,E02000311,Enfield 035,J01000055,London,8447.0,36.0,28.0,0.0,8511.0,...,233.0,44.0,13.0,59.0,186.0,11.0,1351.0,222.0,662.0,1389.0
3,E02000312,E02000312,Enfield 036,J01000055,London,9237.0,47.0,29.0,5.0,9318.0,...,262.0,65.0,5.0,40.0,196.0,26.0,1656.0,184.0,647.0,1982.0
4,E02000313,E02000313,Greenwich 001,J01000055,London,4780.0,61.0,19.0,3.0,4863.0,...,135.0,126.0,13.0,75.0,78.0,26.0,1009.0,165.0,517.0,878.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,W02000193,W02000193,Swansea 026,J01000098,Swansea,7909.0,129.0,72.0,13.0,8123.0,...,65.0,11.0,13.0,79.0,233.0,7.0,1181.0,160.0,592.0,1598.0
3256,W02000194,W02000194,Swansea 027,J01000098,Swansea,5500.0,129.0,51.0,7.0,5687.0,...,28.0,8.0,4.0,34.0,70.0,6.0,756.0,33.0,231.0,1394.0
3257,W02000196,W02000196,Swansea 029,J01000098,Swansea,5049.0,194.0,54.0,16.0,5313.0,...,38.0,13.0,5.0,18.0,54.0,8.0,1013.0,101.0,437.0,1450.0
3258,W02000198,W02000198,Swansea 031,J01000098,Swansea,5914.0,241.0,63.0,12.0,6230.0,...,21.0,10.0,2.0,17.0,65.0,4.0,1093.0,37.0,253.0,1996.0


In [48]:
list(c2021_census_df.columns)

['MSOA21CD',
 'MSOA11CD',
 'MSOA11NM',
 'TCITY15CD',
 'TCITY15NM',
 'Has not previously served in any UK armed forces',
 'Previously served in UK regular armed forces',
 'Previously served in UK reserve armed forces',
 'Previously served in both regular and reserve UK armed forces',
 'Total: All usual residents aged 16 and over',
 'Multi-person household',
 'Multi-person household: At least two different religions stated (household may include people with no religion and who did not state their religion)',
 'Multi-person household: No people stated their religion',
 'Multi-person household: No religion (household may include people who did not state their religion)',
 'Multi-person household: Same religion (at least one person has stated a religion but the household may include people who did not state their religion)',
 'Multi-person household: Same religion and no religion (household may include people who did not state their religion)',
 'One-person household',
 'Total: All househol

In [49]:
income2021_df = pd.read_excel(r'.\data\raw\ons\saiefy1920finalqaddownload280923.xlsx', sheet_name='Total annual income', header=4)
income2021_df = income2021_df[['MSOA code', 'Total annual income (£)']]
income2021_df = income2021_df.dropna(axis=0, how='any')
income2021_df

c2021_income_df = c2021_census_df.copy()
c2021_income_df = c2021_income_df.merge(income2021_df, how='left', left_on='MSOA21CD', right_on='MSOA code')
c2021_income_df = c2021_income_df.drop(columns='MSOA code')
c2021_income_df

Unnamed: 0,MSOA21CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,Has not previously served in any UK armed forces,Previously served in UK regular armed forces,Previously served in UK reserve armed forces,Previously served in both regular and reserve UK armed forces,Total: All usual residents aged 16 and over,...,923 Elementary Security Occupations,924 Elementary Sales Occupations,925 Elementary Storage Occupations,926 Other Elementary Services Occupations,Household is deprived in four dimensions,Household is deprived in one dimension,Household is deprived in three dimensions,Household is deprived in two dimensions,Household is not deprived in any dimension,Total annual income (£)
0,E02000308,E02000308,Enfield 032,J01000055,London,6970.0,46.0,26.0,7.0,7049.0,...,33.0,8.0,61.0,159.0,11.0,1117.0,125.0,496.0,1477.0,57000.0
1,E02000309,E02000309,Enfield 033,J01000055,London,8720.0,44.0,29.0,4.0,8797.0,...,81.0,13.0,94.0,178.0,26.0,1599.0,272.0,809.0,1281.0,42500.0
2,E02000311,E02000311,Enfield 035,J01000055,London,8447.0,36.0,28.0,0.0,8511.0,...,44.0,13.0,59.0,186.0,11.0,1351.0,222.0,662.0,1389.0,52800.0
3,E02000312,E02000312,Enfield 036,J01000055,London,9237.0,47.0,29.0,5.0,9318.0,...,65.0,5.0,40.0,196.0,26.0,1656.0,184.0,647.0,1982.0,59400.0
4,E02000313,E02000313,Greenwich 001,J01000055,London,4780.0,61.0,19.0,3.0,4863.0,...,126.0,13.0,75.0,78.0,26.0,1009.0,165.0,517.0,878.0,41400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,W02000193,W02000193,Swansea 026,J01000098,Swansea,7909.0,129.0,72.0,13.0,8123.0,...,11.0,13.0,79.0,233.0,7.0,1181.0,160.0,592.0,1598.0,31900.0
3256,W02000194,W02000194,Swansea 027,J01000098,Swansea,5500.0,129.0,51.0,7.0,5687.0,...,8.0,4.0,34.0,70.0,6.0,756.0,33.0,231.0,1394.0,47600.0
3257,W02000196,W02000196,Swansea 029,J01000098,Swansea,5049.0,194.0,54.0,16.0,5313.0,...,13.0,5.0,18.0,54.0,8.0,1013.0,101.0,437.0,1450.0,39200.0
3258,W02000198,W02000198,Swansea 031,J01000098,Swansea,5914.0,241.0,63.0,12.0,6230.0,...,10.0,2.0,17.0,65.0,4.0,1093.0,37.0,253.0,1996.0,47800.0


In [50]:
housing2021_df = pd.read_excel(r'.\data\raw\ons\hpssamedianpricebymsoa.xlsx', sheet_name='1a', header=2)
housing2021_df = housing2021_df[['MSOA code', 'Year ending Dec 2021']]
housing2021_df = housing2021_df.rename(columns={'Year ending Dec 2021': 'Median house price'})
housing2021_df

c2021_housing_df = c2021_income_df.copy()
c2021_housing_df = c2021_housing_df.merge(housing2021_df, how='left', left_on='MSOA21CD', right_on='MSOA code')
c2021_housing_df = c2021_housing_df.drop(columns='MSOA code')
c2021_housing_df

Unnamed: 0,MSOA21CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,Has not previously served in any UK armed forces,Previously served in UK regular armed forces,Previously served in UK reserve armed forces,Previously served in both regular and reserve UK armed forces,Total: All usual residents aged 16 and over,...,924 Elementary Sales Occupations,925 Elementary Storage Occupations,926 Other Elementary Services Occupations,Household is deprived in four dimensions,Household is deprived in one dimension,Household is deprived in three dimensions,Household is deprived in two dimensions,Household is not deprived in any dimension,Total annual income (£),Median house price
0,E02000308,E02000308,Enfield 032,J01000055,London,6970.0,46.0,26.0,7.0,7049.0,...,8.0,61.0,159.0,11.0,1117.0,125.0,496.0,1477.0,57000.0,468500.0
1,E02000309,E02000309,Enfield 033,J01000055,London,8720.0,44.0,29.0,4.0,8797.0,...,13.0,94.0,178.0,26.0,1599.0,272.0,809.0,1281.0,42500.0,382500.0
2,E02000311,E02000311,Enfield 035,J01000055,London,8447.0,36.0,28.0,0.0,8511.0,...,13.0,59.0,186.0,11.0,1351.0,222.0,662.0,1389.0,52800.0,469975.0
3,E02000312,E02000312,Enfield 036,J01000055,London,9237.0,47.0,29.0,5.0,9318.0,...,5.0,40.0,196.0,26.0,1656.0,184.0,647.0,1982.0,59400.0,577500.0
4,E02000313,E02000313,Greenwich 001,J01000055,London,4780.0,61.0,19.0,3.0,4863.0,...,13.0,75.0,78.0,26.0,1009.0,165.0,517.0,878.0,41400.0,277500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,W02000193,W02000193,Swansea 026,J01000098,Swansea,7909.0,129.0,72.0,13.0,8123.0,...,13.0,79.0,233.0,7.0,1181.0,160.0,592.0,1598.0,31900.0,157750.0
3256,W02000194,W02000194,Swansea 027,J01000098,Swansea,5500.0,129.0,51.0,7.0,5687.0,...,4.0,34.0,70.0,6.0,756.0,33.0,231.0,1394.0,47600.0,324000.0
3257,W02000196,W02000196,Swansea 029,J01000098,Swansea,5049.0,194.0,54.0,16.0,5313.0,...,5.0,18.0,54.0,8.0,1013.0,101.0,437.0,1450.0,39200.0,266000.0
3258,W02000198,W02000198,Swansea 031,J01000098,Swansea,5914.0,241.0,63.0,12.0,6230.0,...,2.0,17.0,65.0,4.0,1093.0,37.0,253.0,1996.0,47800.0,362500.0


In [51]:
c2021_df = c2021_housing_df.copy()
c2021_df = c2021_df.dropna(axis=1, how='all')
c2021_df["Average number of bedrooms per household"] = (c2021_df["1 bedroom"] + c2021_df["2 bedrooms"] * 2 + c2021_df["3 bedrooms"] * 3 + c2021_df["4 or more bedrooms"] * 5) / c2021_df["Number of households"]
c2021_df["Other combination of skills in Welsh"] = c2021_df["Can read and write but cannot speak Welsh"] + c2021_df["Can read but cannot speak or write Welsh"] + c2021_df["Can speak and other combinations of skills in Welsh"] + c2021_df["Can speak, read and write Welsh"] + c2021_df["Can understand spoken Welsh only"] + c2021_df["Can write but cannot speak or read Welsh"]
c2021_df["Economically active: Self-employed"] = c2021_df["Economically active and a full-time student:In employment:Self-employed with employees"] + c2021_df["Economically active and a full-time student:In employment:Self-employed without employees"]
c2021_df["Economically active: Full-time student"] = c2021_df["Economically active and a full-time student"] - c2021_df["Economically active (excluding full-time students)"]
c2021_df["Medical and care establishment: Local Authority: Care home or other home"] = c2021_df["Medical and care establishment: Local Authority: Care home with nursing"] + c2021_df["Medical and care establishment: Local Authority: Care home without nursing"] + c2021_df["Medical and care establishment: Local Authority: Other home"]
c2021_df["Age 30 to 44"] = c2021_df["Aged 30 to 34 years"] + c2021_df["Aged 35 to 39 years"] + c2021_df["Aged 40 to 44 years"]
c2021_df["Age 45 to 59"] = c2021_df["Aged 45 to 49 years"] + c2021_df["Aged 50 to 54 years"] + c2021_df["Aged 55 to 59 years"]
c2021_df["Age 65 to 74"] = c2021_df["Aged 65 to 69 years"] + c2021_df["Aged 70 to 74 years"]
c2021_df["Age 75 to 84"] = c2021_df["Aged 75 to 79 years"] + c2021_df["Aged 80 to 84 years"]

c2021_df.to_csv(r'.\data\clean\c2021.csv', index=False)
c2021_df

Unnamed: 0,MSOA21CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,Has not previously served in any UK armed forces,Previously served in UK regular armed forces,Previously served in UK reserve armed forces,Previously served in both regular and reserve UK armed forces,Total: All usual residents aged 16 and over,...,Median house price,Average number of bedrooms per household,Other combination of skills in Welsh,Economically active: Self-employed,Economically active: Full-time student,Medical and care establishment: Local Authority: Care home or other home,Age 30 to 44,Age 45 to 59,Age 65 to 74,Age 75 to 84
0,E02000308,E02000308,Enfield 032,J01000055,London,6970.0,46.0,26.0,7.0,7049.0,...,468500.0,2.858648,,34.0,-4267.0,0.0,2210.0,1689.0,562.0,364.0
1,E02000309,E02000309,Enfield 033,J01000055,London,8720.0,44.0,29.0,4.0,8797.0,...,382500.0,2.581244,,25.0,-4661.0,0.0,2583.0,2251.0,602.0,301.0
2,E02000311,E02000311,Enfield 035,J01000055,London,8447.0,36.0,28.0,0.0,8511.0,...,469975.0,2.815406,,31.0,-4885.0,0.0,2390.0,2096.0,790.0,415.0
3,E02000312,E02000312,Enfield 036,J01000055,London,9237.0,47.0,29.0,5.0,9318.0,...,577500.0,2.600222,,39.0,-5901.0,0.0,3207.0,2175.0,822.0,457.0
4,E02000313,E02000313,Greenwich 001,J01000055,London,4780.0,61.0,19.0,3.0,4863.0,...,277500.0,2.348998,,17.0,-2793.0,0.0,1349.0,1488.0,397.0,147.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,W02000193,W02000193,Swansea 026,J01000098,Swansea,7909.0,129.0,72.0,13.0,8123.0,...,157750.0,2.897117,1206.0,42.0,-1364.0,0.0,1082.0,1021.0,526.0,354.0
3256,W02000194,W02000194,Swansea 027,J01000098,Swansea,5500.0,129.0,51.0,7.0,5687.0,...,324000.0,3.584056,1133.0,5.0,-2059.0,0.0,899.0,1017.0,773.0,566.0
3257,W02000196,W02000196,Swansea 029,J01000098,Swansea,5049.0,194.0,54.0,16.0,5313.0,...,266000.0,3.030243,956.0,1.0,-2300.0,0.0,977.0,1231.0,927.0,698.0
3258,W02000198,W02000198,Swansea 031,J01000098,Swansea,5914.0,241.0,63.0,12.0,6230.0,...,362500.0,3.428656,1210.0,5.0,-2853.0,0.0,955.0,1554.0,1187.0,787.0


In [52]:
list(c2001_df.columns)

['MSOA01CD',
 'MSOA11CD',
 'MSOA11NM',
 'TCITY15CD',
 'TCITY15NM',
 'A Agriculture, hunting, forestry',
 'All categories: Industry',
 'B Fishing',
 'C Mining and quarrying',
 'D Manufacturing',
 'E Electricity, gas and water supply',
 'F Constructiion',
 'G Wholesale and retail trade, repair of motor vehicles',
 'H Hotels and restaurants',
 'I Transport storage and communications',
 'J Financial Intermediation',
 'K Real estate,renting and business activities',
 'L Public administration and defence, social security',
 'M Education',
 'N Health and social work',
 'O,P,Q Other',
 'All people',
 'Divorced',
 'Married (first marriage)',
 'Re-married',
 'Separated (but still legally married)',
 'Single (never married)',
 'Widowed',
 'All usual residents aged 16 to 74',
 'Economically Inactive',
 'Economically active',
 'Economically active: Employee: Full-time',
 'Economically active: Employee: Part-time',
 'Economically active: Full-time student',
 'Economically active: In employment',
 'E

In [57]:
normalizer_df = pd.read_csv('https://docs.google.com/spreadsheets/d/' + 
                   '1nLWtw-6VwOgKvADQEhwIa0s9zst_-_aY5CVGEoCoKpU' +
                   '/export?gid=508974035&format=csv')
display(normalizer_df)
final_cols = list(normalizer_df['var_final'])
var_types = list(normalizer_df['var_type'])

Unnamed: 0,var_final,var_2001,multiplier_2001,var_2011,multiplier_2011,var_2021,multiplier_2021,var_type
0,MSOA Code,MSOA11CD,,MSOA11CD,,MSOA11CD,,disc
1,MSOA Name,MSOA11NM,,MSOA11NM,,MSOA11NM,,disc
2,City Code,TCITY15CD,,TCITY15CD,,TCITY15CD,,disc
3,City Name,TCITY15NM,,TCITY15NM,,TCITY15NM,,disc
4,Residency: Average number of bedrooms per hous...,,,Average number of bedrooms per household,1.0,Average number of bedrooms per household,1.0,cont
...,...,...,...,...,...,...,...,...
212,Age: 60 to 64,Age 60 to 64,1.0,Age 60 to 64,1.0,Aged 60 to 64 years,1.0,cont
213,Age: 65 to 74,Age 65 to 74,1.0,Age 65 to 74,1.0,Age 65 to 74,1.0,cont
214,Age: 75 to 84,Age 75 to 84,1.0,Age 75 to 84,1.0,Age 75 to 84,1.0,cont
215,Age: 85 years and over,Aged 85 years and over,1.0,Aged 85 years and over,1.0,Aged 85 years and over,1.0,cont


In [58]:
normcols_2001 = list(normalizer_df['var_2001'])
multcols_2001 = list(normalizer_df['multiplier_2001'])

fin2001_df = pd.DataFrame()

for final_col, var_type, normcol, multcol in zip(final_cols, var_types, normcols_2001, multcols_2001):
    if var_type == 'disc':
        fin2001_df[final_col] = c2001_df[normcol]
    else:
        if not math.isnan(multcol):
            fin2001_df[final_col] = c2001_df[normcol] * multcol
        else:
            fin2001_df[final_col] = None

fin2001_df['Year'] = 2001
fin2001_df

Unnamed: 0,MSOA Code,MSOA Name,City Code,City Name,Residency: Average number of bedrooms per household,Sex: Females,Sex: Males,Residency: Lives in a communal establishment,Occupancy rating (rooms) of -1 or less,Passport: Africa,...,Age: 30 to 44,Age: 45 to 59,Age: 0 to 4,Age: 5 to 9,Age: 60 to 64,Age: 65 to 74,Age: 75 to 84,Age: 85 years and over,Age: 16 years and over,Year
0,E02000001,City of London 001,J01000055,London,,3353.0,3832.0,324.0,743.0,,...,1977.0,1622.0,250.0,207.0,306.0,514.0,322.0,126.0,6515.0,2001
1,E02000101,Brent 009,J01000055,London,,3955.0,3476.0,143.0,330.0,,...,1671.0,1261.0,449.0,487.0,305.0,530.0,392.0,176.0,5902.0,2001
2,E02000002,Barking and Dagenham 001,J01000055,London,,3366.0,2871.0,20.0,150.0,,...,1277.0,894.0,489.0,430.0,216.0,665.0,528.0,161.0,4811.0,2001
3,E02000003,Barking and Dagenham 002,J01000055,London,,4788.0,4402.0,6.0,165.5,,...,2227.0,1575.0,614.0,584.0,377.0,730.0,529.0,162.0,7284.0,2001
4,E02000102,Brent 010,J01000055,London,,3924.0,3729.0,0.0,379.5,,...,1993.0,1045.0,575.0,527.0,330.0,443.0,208.0,70.0,5833.0,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3271,W02000410,Cardiff 044,J01000020,Cardiff,,3566.0,3486.0,3.0,105.5,,...,1669.0,948.0,602.0,557.0,262.0,450.0,294.0,73.0,5225.0,2001
3272,W02000411,Cardiff 045,J01000020,Cardiff,,3045.0,2861.0,37.0,101.0,,...,1339.0,1013.0,430.0,486.0,247.0,388.0,297.0,67.0,4362.0,2001
3273,W02000412,Cardiff 046,J01000020,Cardiff,,3011.0,2842.0,29.0,99.0,,...,1429.0,906.0,439.0,440.0,234.0,427.0,265.0,82.0,4480.0,2001
3274,W02000422,Cardiff 048,J01000020,Cardiff,,2836.0,3113.0,180.0,186.5,,...,1415.0,731.0,384.0,355.0,245.0,380.0,261.0,93.0,4815.0,2001


In [59]:
normcols_2011 = list(normalizer_df['var_2011'])
multcols_2011 = list(normalizer_df['multiplier_2011'])

fin2011_df = pd.DataFrame()

for final_col, var_type, normcol, multcol in zip(final_cols, var_types, normcols_2011, multcols_2011):
    if var_type == 'disc':
        fin2011_df[final_col] = c2011_df[normcol]
    else:
        if not math.isnan(multcol):
            fin2011_df[final_col] = c2011_df[normcol] * multcol
        else:
            fin2011_df[final_col] = 0

fin2011_df['Year'] = 2011
fin2011_df

Unnamed: 0,MSOA Code,MSOA Name,City Code,City Name,Residency: Average number of bedrooms per household,Sex: Females,Sex: Males,Residency: Lives in a communal establishment,Occupancy rating (rooms) of -1 or less,Passport: Africa,...,Age: 30 to 44,Age: 45 to 59,Age: 0 to 4,Age: 5 to 9,Age: 60 to 64,Age: 65 to 74,Age: 75 to 84,Age: 85 years and over,Age: 16 years and over,Year
0,E02000053,Barnet 030,J01000055,London,2.2,4277.0,4420.0,447.0,616.5,457.0,...,2203.0,1271.0,670.0,427.0,304.0,398.0,235.0,74.0,3656.0,2011
1,E02000209,Croydon 016,J01000055,London,2.5,4310.0,4104.0,3.0,296.5,302.0,...,2067.0,1495.0,797.0,693.0,259.0,348.0,227.0,60.0,3317.0,2011
2,E02000054,Barnet 031,J01000055,London,2.6,3654.0,3552.0,31.0,341.5,333.0,...,1797.0,1218.0,504.0,375.0,321.0,412.0,262.0,94.0,3183.0,2011
3,E02000055,Barnet 032,J01000055,London,2.6,5616.0,5514.0,237.0,553.0,421.0,...,2488.0,1517.0,798.0,608.0,423.0,628.0,530.0,317.0,4977.0,2011
4,E02000056,Barnet 033,J01000055,London,3.4,4893.0,4319.0,170.0,133.5,234.0,...,2039.0,1623.0,669.0,648.0,559.0,709.0,456.0,366.0,4229.0,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,E02003999,Carlisle 013,J01000021,Carlisle,3.0,4795.0,4585.0,181.0,41.5,8.0,...,1685.0,2111.0,413.0,476.0,775.0,1100.0,683.0,274.0,4447.0,2011
3256,E02005068,Maidstone 001,J01000022,Chatham,3.2,3552.0,3490.0,75.0,44.5,35.0,...,1536.0,1627.0,401.0,438.0,480.0,538.0,256.0,104.0,3011.0,2011
3257,E02004497,Chelmsford 013,J01000023,Chelmsford,3.0,4007.0,3841.0,69.0,61.5,46.0,...,1433.0,1648.0,354.0,435.0,577.0,937.0,623.0,262.0,3767.0,2011
3258,E02004493,Chelmsford 009,J01000023,Chelmsford,2.7,5105.0,5022.0,144.0,156.0,138.0,...,2380.0,1791.0,760.0,600.0,458.0,590.0,475.0,241.0,4411.0,2011


In [60]:
normcols_2021 = list(normalizer_df['var_2021'])
multcols_2021 = list(normalizer_df['multiplier_2021'])

fin2021_df = pd.DataFrame()

for final_col, var_type, normcol, multcol in zip(final_cols, var_types, normcols_2021, multcols_2021):
    if var_type == 'disc':
        fin2021_df[final_col] = c2021_df[normcol]
    else:
        if not math.isnan(multcol):
            fin2021_df[final_col] = c2021_df[normcol] * multcol
        else:
            fin2021_df[final_col] = None

fin2021_df['Year'] = 2021
fin2021_df

Unnamed: 0,MSOA Code,MSOA Name,City Code,City Name,Residency: Average number of bedrooms per household,Sex: Females,Sex: Males,Residency: Lives in a communal establishment,Occupancy rating (rooms) of -1 or less,Passport: Africa,...,Age: 30 to 44,Age: 45 to 59,Age: 0 to 4,Age: 5 to 9,Age: 60 to 64,Age: 65 to 74,Age: 75 to 84,Age: 85 years and over,Age: 16 years and over,Year
0,E02000308,Enfield 032,J01000055,London,2.858648,4433.0,4326.0,16.0,129.0,622.0,...,2210.0,1689.0,532.0,494.0,429.0,562.0,364.0,138.0,7048.0,2021
1,E02000309,Enfield 033,J01000055,London,2.581244,6050.0,5744.0,51.0,270.0,1876.0,...,2583.0,2251.0,875.0,1045.0,513.0,602.0,301.0,186.0,8797.0,2021
2,E02000311,Enfield 035,J01000055,London,2.815406,5378.0,5166.0,9.0,183.0,743.0,...,2390.0,2096.0,647.0,598.0,602.0,790.0,415.0,182.0,8511.0,2021
3,E02000312,Enfield 036,J01000055,London,2.600222,5754.0,5549.0,43.0,193.0,679.0,...,3207.0,2175.0,780.0,606.0,562.0,822.0,457.0,198.0,9317.0,2021
4,E02000313,Greenwich 001,J01000055,London,2.348998,3337.0,3152.0,10.0,111.0,1562.0,...,1349.0,1488.0,434.0,475.0,333.0,397.0,147.0,40.0,4863.0,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,W02000193,Swansea 026,J01000098,Swansea,2.897117,4046.0,4771.0,181.0,44.0,247.0,...,1082.0,1021.0,205.0,205.0,299.0,526.0,354.0,186.0,8121.0,2021
3256,W02000194,Swansea 027,J01000098,Swansea,3.584056,3442.0,3095.0,1082.0,1.0,130.0,...,899.0,1017.0,221.0,287.0,387.0,773.0,566.0,308.0,5688.0,2021
3257,W02000196,Swansea 029,J01000098,Swansea,3.030243,3303.0,2975.0,72.0,9.0,48.0,...,977.0,1231.0,260.0,313.0,438.0,927.0,698.0,287.0,5313.0,2021
3258,W02000198,Swansea 031,J01000098,Swansea,3.428656,3782.0,3474.0,120.0,2.0,67.0,...,955.0,1554.0,243.0,305.0,551.0,1187.0,787.0,417.0,6233.0,2021


In [61]:
fin_df = pd.concat([fin2001_df, fin2011_df, fin2021_df])
id_cols = ["MSOA Code", "MSOA Name", "City Code", "City Name", "Year"]
val_cols = [col for col in list(fin_df.columns) if col not in id_cols]
val_cols = sorted(val_cols)
fin_cols = id_cols + val_cols
fin_df = fin_df[fin_cols]
fin_df = fin_df.groupby(id_cols).sum().reset_index()
fin_df.to_csv(r".\data\clean\normalized_all.csv", index=False)
fin_df

  fin_df = pd.concat([fin2001_df, fin2011_df, fin2021_df])


Unnamed: 0,MSOA Code,MSOA Name,City Code,City Name,Year,Age: 0 to 4,Age: 10 to 14,Age: 15 to 19,Age: 16 years and over,Age: 20 to 24,...,Unpaid Care: Provides no unpaid care,Vehicles: 1 car or van in household,Vehicles: 2 cars or vans in household,Vehicles: 3 or more cars or vans in household,Vehicles: No cars or vans in household,Welsh: Can speak Welsh,Welsh: Can speak and read but cannot write Welsh,Welsh: Can speak but cannot read or write Welsh,Welsh: No skills in Welsh,Welsh: Other combination of skills in Welsh
0,E02000001,City of London 001,J01000055,London,2001,250.0,186.0,206.0,6515.0,529.0,...,6587.0,1417.0,184.0,46.0,2691.0,0.0,0.0,0.0,0.0,0.0
1,E02000001,City of London 001,J01000055,London,2011,236.0,169.0,197.0,3159.0,545.0,...,6799.0,1100.0,173.0,69.0,3043.0,0.0,0.0,0.0,0.0,0.0
2,E02000001,City of London 001,J01000055,London,2021,213.0,174.0,216.0,8004.0,965.0,...,7876.0,954.0,123.0,43.0,3793.0,0.0,0.0,0.0,0.0,0.0
3,E02000002,Barking and Dagenham 001,J01000055,London,2001,489.0,423.0,382.0,4811.0,363.0,...,5624.0,1189.0,376.0,85.0,1084.0,0.0,0.0,0.0,0.0,0.0
4,E02000002,Barking and Dagenham 001,J01000055,London,2011,628.0,446.0,434.0,3026.0,424.0,...,6157.0,1186.0,424.0,83.0,1020.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9774,W02000422,Cardiff 048,J01000020,Cardiff,2011,294.0,66.0,110.0,2493.0,940.0,...,5747.0,2299.0,637.0,76.0,642.0,632.0,57.0,62.0,4857.0,107.0
9775,W02000422,Cardiff 048,J01000020,Cardiff,2021,365.0,133.0,124.0,6659.0,906.0,...,6684.0,2462.0,630.0,60.0,1063.0,746.0,31.0,68.0,6012.0,1050.0
9776,W02000423,Cardiff 049,J01000020,Cardiff,2001,384.0,334.0,330.0,4815.0,612.0,...,5504.0,1263.0,376.0,55.0,1263.0,0.0,39.0,95.0,4920.0,110.0
9777,W02000423,Cardiff 049,J01000020,Cardiff,2011,543.0,313.0,344.0,3378.0,967.0,...,7662.0,2006.0,515.0,69.0,1542.0,687.0,55.0,99.0,6792.0,186.0
