In [32]:
import pandas as pd
import glob


Create 2011 gentrification prediction dataset:

1. Get list of MSOAs for each city as base table
2. Merge census variables to base table
3. Merge additional tables to base table
4. On a copy of the base table from 1, label gentrification using 2011 & 2021 data (nevermind, doing PCA)

Freeman gentrification labeling method:

a census tract is (1) marked eligible if (a) housing construction is below a metropolitan median, and (b) income is below the median, and (c) the tract is located in a central city. It is then (2) marked as gentrified if, after the measurement period, (a) there is a greater increase in educational attainment compared to the median and (b) an increase in real housing prices


Data sources:

2011 construction:
no direct numbers at the lsoa or msoa level, only regional & country stats
can use c2011ks401ew & c2011ks401uk, which contains number of household dwellings, subtract 2001 numbers, to get new dwellings as a proxy

2011 income:
.\data\raw\govuk\1smallareaincomeestimatesdatatcm77420299.xls

2011 cities:
.\data\raw\geoportal\Middle_Layer_Super_Output_Area_(2011)_to_Major_Towns_and_Cities_(December_2015)_Lookup_in_England_and_Wales.csv

2011 educational attainment:
c2011ks501ew

2021 educational attainment:
c2021ts067

2011 house prices:
data\raw\ons\hpssamedianpricebymsoa.xlsx

2021 house prices:
data\raw\ons\hpssamedianpricebymsoa.xlsx

In [55]:
base2011_df = pd.read_csv(r'.\data\raw\geoportal\Middle_Layer_Super_Output_Area_(2011)_to_Major_Towns_and_Cities_(December_2015)_Lookup_in_England_and_Wales.csv')
base2011_df = base2011_df.dropna(axis=0, how='any', ignore_index=True)
base2011_df = base2011_df.drop(columns='FID')
base2011_df

Unnamed: 0,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM
0,E02000053,Barnet 030,J01000055,London
1,E02000209,Croydon 016,J01000055,London
2,E02000054,Barnet 031,J01000055,London
3,E02000055,Barnet 032,J01000055,London
4,E02000056,Barnet 033,J01000055,London
...,...,...,...,...
3255,E02003999,Carlisle 013,J01000021,Carlisle
3256,E02005068,Maidstone 001,J01000022,Chatham
3257,E02004497,Chelmsford 013,J01000023,Chelmsford
3258,E02004493,Chelmsford 009,J01000023,Chelmsford


In [34]:
metadata_2011 = pd.read_csv(r'.\data\raw\api\msoa\c2011\metadata.csv')
metadata_2011['clean_mnemonic'] = metadata_2011['mnemonic'].str.extract('(c2011ks\d*).*')
metadata_2011_mnemonics = list(set(metadata_2011['clean_mnemonic']))
len(metadata_2011_mnemonics)

35

In [35]:
c2011_dfs = []

for mnemonic in metadata_2011_mnemonics:

    root_dir = r'.\data\raw\api\msoa\c2011'
    mnemonic_tables = glob.glob(mnemonic+'*', root_dir=root_dir)
    mnemonic_dfs = []

    for table in mnemonic_tables:
        df = pd.read_csv(rf"{root_dir}\{table}")
        mnemonic_dfs.append(df)

    mnemonic_df = pd.concat(mnemonic_dfs)

    mnemonic_df = mnemonic_df.drop(columns=['Unnamed: 0', 'GEOGRAPHY_TYPE', 'GEOGRAPHY_TYPECODE', 'DATE'])
    mnemonic_df['OBS_VALUE'] = mnemonic_df['OBS_VALUE'].astype(float)
    mnemonic_df = mnemonic_df[mnemonic_df['OBS_VALUE'] > 0]
    cols = list(mnemonic_df.columns)
    var_col = [col for col in cols if col not in ['GEOGRAPHY_CODE', 'OBS_VALUE']][0]
    mnemonic_df = mnemonic_df.pivot_table(index='GEOGRAPHY_CODE', columns=var_col, values='OBS_VALUE', aggfunc='mean', fill_value=0)
    mnemonic_df = mnemonic_df.reset_index(drop=False)
    c2011_dfs.append(mnemonic_df)
c2011_dfs[0]

CELL_NAME,GEOGRAPHY_CODE,All categories: Type of central heating in household,Average household size,Average number of bedrooms per household,Average number of rooms per household,Does have central heating,Does not have central heating,Occupancy rating (bedrooms) of -1 or less,Occupancy rating (rooms) of -1 or less
0,E02000001,4385.0,1.6,1.6,3.4,4196.0,189.0,258.0,1517.0
1,E02000002,2713.0,2.5,2.4,4.6,2645.0,68.0,255.0,400.0
2,E02000003,3834.0,2.6,2.6,4.9,3698.0,136.0,344.0,539.0
3,E02000004,2318.0,2.6,2.8,5.2,2283.0,35.0,130.0,188.0
4,E02000005,3183.0,2.7,2.5,4.7,3064.0,119.0,380.0,482.0
...,...,...,...,...,...,...,...,...,...
7196,W02000419,4653.0,2.3,2.7,5.4,4556.0,97.0,114.0,150.0
7197,W02000420,5577.0,2.4,3.2,6.5,5503.0,74.0,78.0,101.0
7198,W02000421,5027.0,2.3,3.1,6.3,4744.0,283.0,90.0,139.0
7199,W02000422,3654.0,1.7,1.8,3.4,3503.0,151.0,132.0,1113.0


In [36]:
c2011_dfs[0][c2011_dfs[0]['GEOGRAPHY_CODE'] == 'E02000053']

CELL_NAME,GEOGRAPHY_CODE,All categories: Type of central heating in household,Average household size,Average number of bedrooms per household,Average number of rooms per household,Does have central heating,Does not have central heating,Occupancy rating (bedrooms) of -1 or less,Occupancy rating (rooms) of -1 or less
51,E02000053,3244.0,2.5,2.2,4.1,3109.0,135.0,532.0,1233.0


In [37]:
c2011_census_df = base2011_df.copy()
for df in c2011_dfs:
    c2011_census_df = c2011_census_df.merge(df, how='left', left_on='MSOA11CD', right_on='GEOGRAPHY_CODE')
    c2011_census_df_cols = list(c2011_census_df.columns)
    c2011_census_df_dup_cols = [col for col in c2011_census_df_cols if '_x' in col or '_y' in col]
    c2011_census_df_dup_cols.append('GEOGRAPHY_CODE')
    c2011_census_df = c2011_census_df.drop(columns=c2011_census_df_dup_cols)
c2011_census_df = c2011_census_df.dropna(axis=1, how='all')
c2011_census_df

Unnamed: 0,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,All categories: Type of central heating in household,Average household size,Average number of bedrooms per household,Average number of rooms per household,Does have central heating,Does not have central heating,...,"1. Managers, directors and senior officials",2. Professional occupations,3. Associate professional and technical occupations,4. Administrative and secretarial occupations,5. Skilled trades occupations,"6. Caring, leisure and other service occupations",7. Sales and customer service occupations,8. Process plant and machine operatives,9. Elementary occupations,All categories: Occupation
0,E02000053,Barnet 030,J01000055,London,3244.0,2.5,2.2,4.1,3109.0,135.0,...,130.0,359.0,243.0,300.0,39.0,264.0,239.0,17.0,240.0,1831.0
1,E02000209,Croydon 016,J01000055,London,2919.0,2.9,2.5,4.9,2798.0,121.0,...,89.0,322.0,165.0,373.0,38.0,278.0,264.0,29.0,203.0,1761.0
2,E02000054,Barnet 031,J01000055,London,2774.0,2.6,2.6,4.8,2716.0,58.0,...,151.0,443.0,221.0,294.0,33.0,218.0,125.0,17.0,186.0,1688.0
3,E02000055,Barnet 032,J01000055,London,4179.0,2.6,2.6,4.9,4057.0,122.0,...,213.0,546.0,296.0,408.0,52.0,359.0,221.0,16.0,287.0,2398.0
4,E02000056,Barnet 033,J01000055,London,3390.0,2.7,3.4,6.5,3343.0,47.0,...,293.0,637.0,378.0,293.0,22.0,228.0,101.0,5.0,88.0,2045.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,E02003999,Carlisle 013,J01000021,Carlisle,4058.0,2.3,3.0,6.1,3989.0,69.0,...,187.0,413.0,212.0,482.0,69.0,344.0,304.0,55.0,199.0,2265.0
3256,E02005068,Maidstone 001,J01000022,Chatham,2706.0,2.6,3.2,6.2,2661.0,45.0,...,175.0,314.0,261.0,480.0,33.0,255.0,195.0,24.0,132.0,1869.0
3257,E02004497,Chelmsford 013,J01000023,Chelmsford,3308.0,2.4,3.0,6.1,3229.0,79.0,...,138.0,356.0,246.0,455.0,34.0,229.0,164.0,25.0,146.0,1793.0
3258,E02004493,Chelmsford 009,J01000023,Chelmsford,4209.0,2.4,2.7,5.4,4123.0,86.0,...,186.0,553.0,330.0,484.0,48.0,310.0,219.0,47.0,279.0,2456.0


In [38]:
income2011_df = pd.read_excel(r'.\data\raw\govuk\1smallareaincomeestimatesdatatcm77420299.xls', sheet_name='Total weekly income', header=4)
income2011_df = income2011_df[['MSOA code', 'Total weekly income (£)']]
income2011_df = income2011_df.dropna(axis=0, how='any')
income2011_df

c2011_income_df = c2011_census_df.copy()
c2011_income_df = c2011_income_df.merge(income2011_df, how='left', left_on='MSOA11CD', right_on='MSOA code')
c2011_income_df = c2011_income_df.drop(columns='MSOA code')
c2011_income_df

Unnamed: 0,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,All categories: Type of central heating in household,Average household size,Average number of bedrooms per household,Average number of rooms per household,Does have central heating,Does not have central heating,...,2. Professional occupations,3. Associate professional and technical occupations,4. Administrative and secretarial occupations,5. Skilled trades occupations,"6. Caring, leisure and other service occupations",7. Sales and customer service occupations,8. Process plant and machine operatives,9. Elementary occupations,All categories: Occupation,Total weekly income (£)
0,E02000053,Barnet 030,J01000055,London,3244.0,2.5,2.2,4.1,3109.0,135.0,...,359.0,243.0,300.0,39.0,264.0,239.0,17.0,240.0,1831.0,810.0
1,E02000209,Croydon 016,J01000055,London,2919.0,2.9,2.5,4.9,2798.0,121.0,...,322.0,165.0,373.0,38.0,278.0,264.0,29.0,203.0,1761.0,810.0
2,E02000054,Barnet 031,J01000055,London,2774.0,2.6,2.6,4.8,2716.0,58.0,...,443.0,221.0,294.0,33.0,218.0,125.0,17.0,186.0,1688.0,950.0
3,E02000055,Barnet 032,J01000055,London,4179.0,2.6,2.6,4.9,4057.0,122.0,...,546.0,296.0,408.0,52.0,359.0,221.0,16.0,287.0,2398.0,870.0
4,E02000056,Barnet 033,J01000055,London,3390.0,2.7,3.4,6.5,3343.0,47.0,...,637.0,378.0,293.0,22.0,228.0,101.0,5.0,88.0,2045.0,1300.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,E02003999,Carlisle 013,J01000021,Carlisle,4058.0,2.3,3.0,6.1,3989.0,69.0,...,413.0,212.0,482.0,69.0,344.0,304.0,55.0,199.0,2265.0,690.0
3256,E02005068,Maidstone 001,J01000022,Chatham,2706.0,2.6,3.2,6.2,2661.0,45.0,...,314.0,261.0,480.0,33.0,255.0,195.0,24.0,132.0,1869.0,1020.0
3257,E02004497,Chelmsford 013,J01000023,Chelmsford,3308.0,2.4,3.0,6.1,3229.0,79.0,...,356.0,246.0,455.0,34.0,229.0,164.0,25.0,146.0,1793.0,960.0
3258,E02004493,Chelmsford 009,J01000023,Chelmsford,4209.0,2.4,2.7,5.4,4123.0,86.0,...,553.0,330.0,484.0,48.0,310.0,219.0,47.0,279.0,2456.0,940.0


In [48]:
housing2011_df = pd.read_excel(r'.\data\raw\ons\hpssamedianpricebymsoa.xlsx', sheet_name='1a', header=2)
housing2011_df = housing2011_df[['MSOA code', 'Year ending Dec 2011']]
housing2011_df = housing2011_df.rename(columns={'Year ending Dec 2011': 'Median house price'})
housing2011_df

c2011_housing_df = c2011_income_df.copy()
c2011_housing_df = c2011_housing_df.merge(housing2011_df, how='left', left_on='MSOA11CD', right_on='MSOA code')
c2011_housing_df = c2011_housing_df.drop(columns='MSOA code')
c2011_housing_df

Unnamed: 0,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,All categories: Type of central heating in household,Average household size,Average number of bedrooms per household,Average number of rooms per household,Does have central heating,Does not have central heating,...,3. Associate professional and technical occupations,4. Administrative and secretarial occupations,5. Skilled trades occupations,"6. Caring, leisure and other service occupations",7. Sales and customer service occupations,8. Process plant and machine operatives,9. Elementary occupations,All categories: Occupation,Total weekly income (£),Median house price
0,E02000053,Barnet 030,J01000055,London,3244.0,2.5,2.2,4.1,3109.0,135.0,...,243.0,300.0,39.0,264.0,239.0,17.0,240.0,1831.0,810.0,
1,E02000209,Croydon 016,J01000055,London,2919.0,2.9,2.5,4.9,2798.0,121.0,...,165.0,373.0,38.0,278.0,264.0,29.0,203.0,1761.0,810.0,191750
2,E02000054,Barnet 031,J01000055,London,2774.0,2.6,2.6,4.8,2716.0,58.0,...,221.0,294.0,33.0,218.0,125.0,17.0,186.0,1688.0,950.0,330000
3,E02000055,Barnet 032,J01000055,London,4179.0,2.6,2.6,4.9,4057.0,122.0,...,296.0,408.0,52.0,359.0,221.0,16.0,287.0,2398.0,870.0,325000
4,E02000056,Barnet 033,J01000055,London,3390.0,2.7,3.4,6.5,3343.0,47.0,...,378.0,293.0,22.0,228.0,101.0,5.0,88.0,2045.0,1300.0,890000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,E02003999,Carlisle 013,J01000021,Carlisle,4058.0,2.3,3.0,6.1,3989.0,69.0,...,212.0,482.0,69.0,344.0,304.0,55.0,199.0,2265.0,690.0,175000
3256,E02005068,Maidstone 001,J01000022,Chatham,2706.0,2.6,3.2,6.2,2661.0,45.0,...,261.0,480.0,33.0,255.0,195.0,24.0,132.0,1869.0,1020.0,196250
3257,E02004497,Chelmsford 013,J01000023,Chelmsford,3308.0,2.4,3.0,6.1,3229.0,79.0,...,246.0,455.0,34.0,229.0,164.0,25.0,146.0,1793.0,960.0,242000
3258,E02004493,Chelmsford 009,J01000023,Chelmsford,4209.0,2.4,2.7,5.4,4123.0,86.0,...,330.0,484.0,48.0,310.0,219.0,47.0,279.0,2456.0,940.0,202750


In [78]:
c2011_df = c2011_housing_df.copy()
c2011_df = c2011_df.dropna(axis=1, how='all')
c2011_df.to_csv(r'.\data\clean\c2011.csv', index=False)

In [56]:
base2001_df = pd.read_csv(r'.\data\raw\geoportal\Middle_Layer_Super_Output_Area_(2001)_to_Middle_Layer_Super_Output_Area_(2011)_to_Local_Authority_District_(2011)_Lookup_in_England_and_Wales.csv')
base2001_df = base2001_df[['MSOA01CD', 'MSOA11CD']]
base2001_df = base2001_df.merge(base2011_df, on='MSOA11CD')
base2001_df

Unnamed: 0,MSOA01CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM
0,E02000001,E02000001,City of London 001,J01000055,London
1,E02000101,E02000101,Brent 009,J01000055,London
2,E02000002,E02000002,Barking and Dagenham 001,J01000055,London
3,E02000003,E02000003,Barking and Dagenham 002,J01000055,London
4,E02000102,E02000102,Brent 010,J01000055,London
...,...,...,...,...,...
3271,W02000410,W02000410,Cardiff 044,J01000020,Cardiff
3272,W02000411,W02000411,Cardiff 045,J01000020,Cardiff
3273,W02000412,W02000412,Cardiff 046,J01000020,Cardiff
3274,W02000413,W02000422,Cardiff 048,J01000020,Cardiff


In [70]:
metadata_2001 = pd.read_csv(r'.\data\raw\api\msoa\c2001\metadata.csv')
metadata_2001['clean_mnemonic'] = metadata_2001['mnemonic'].str.extract('(ks.*)')
metadata_2001 = metadata_2001[metadata_2001['geoglevel'].str.contains('msoa')]
metadata_2001_mnemonics = list(set(metadata_2001['clean_mnemonic']))
len(metadata_2001_mnemonics)

32

In [72]:
c2001_dfs = []

for mnemonic in metadata_2001_mnemonics:

    root_dir = r'.\data\raw\api\msoa\c2001'
    mnemonic_tables = glob.glob(mnemonic+'*', root_dir=root_dir)
    mnemonic_dfs = []

    for table in mnemonic_tables:
        df = pd.read_csv(rf"{root_dir}\{table}")
        mnemonic_dfs.append(df)

    mnemonic_df = pd.concat(mnemonic_dfs)

    mnemonic_df = mnemonic_df.drop(columns=['Unnamed: 0', 'GEOGRAPHY_TYPE', 'GEOGRAPHY_TYPECODE', 'DATE'])
    mnemonic_df['OBS_VALUE'] = mnemonic_df['OBS_VALUE'].astype(float)
    mnemonic_df = mnemonic_df[mnemonic_df['OBS_VALUE'] > 0]
    cols = list(mnemonic_df.columns)
    var_col = [col for col in cols if col not in ['GEOGRAPHY_CODE', 'OBS_VALUE']][0]
    mnemonic_df = mnemonic_df.pivot_table(index='GEOGRAPHY_CODE', columns=var_col, values='OBS_VALUE', aggfunc='mean', fill_value=0)
    mnemonic_df = mnemonic_df.reset_index(drop=False)
    c2001_dfs.append(mnemonic_df)
c2001_dfs[0]

CELL_NAME,GEOGRAPHY_CODE,All people aged 16-74,Full-time students: Age 18 to 74: Economically active: In employment,Full-time students: Age 18 to 74: Economically active: Unemployed,Full-time students: Age 18 to 74: Economically inactive,Highest level of qualification: Level 1 qualifications,Highest level of qualification: Level 2 qualifications,Highest level of qualification: Level 3 qualifications,Highest level of qualification: Level 4 qualifications and above,Highest level of qualification: Other qualifications,No qualifications,Schoolchildren and full-time students: Age 16 to 17,Schoolchildren and full-time students: Age 18 to 74
0,E02000001,6067.0,84.0,21.0,292.0,359.0,634.0,665.0,3647.0,155.0,607.0,65.0,397.0
1,E02000002,4122.0,39.0,8.0,55.0,859.0,700.0,186.0,371.0,297.0,1709.0,109.0,102.0
2,E02000003,6593.0,99.0,17.0,100.0,1381.0,1414.0,446.0,860.0,496.0,1996.0,174.0,216.0
3,E02000004,4310.0,40.0,4.0,45.0,839.0,890.0,269.0,443.0,342.0,1527.0,113.0,89.0
4,E02000005,5486.0,51.0,6.0,80.0,1079.0,1001.0,263.0,407.0,406.0,2330.0,164.0,137.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7189,W02000409,4105.0,37.0,7.0,67.0,674.0,740.0,226.0,578.0,292.0,1595.0,110.0,111.0
7190,W02000410,4858.0,103.0,15.0,221.0,687.0,785.0,365.0,869.0,301.0,1851.0,153.0,339.0
7191,W02000411,3998.0,48.0,4.0,59.0,663.0,678.0,173.0,393.0,304.0,1787.0,106.0,111.0
7192,W02000412,4133.0,62.0,14.0,83.0,671.0,689.0,247.0,698.0,286.0,1542.0,103.0,159.0


In [74]:
c2001_census_df = base2001_df.copy()
for df in c2001_dfs:
    c2001_census_df = c2001_census_df.merge(df, how='left', left_on='MSOA01CD', right_on='GEOGRAPHY_CODE')
    c2001_census_df_cols = list(c2001_census_df.columns)
    c2001_census_df_dup_cols = [col for col in c2001_census_df_cols if '_x' in col or '_y' in col]
    c2001_census_df_dup_cols.append('GEOGRAPHY_CODE')
    c2001_census_df = c2001_census_df.drop(columns=c2001_census_df_dup_cols)
c2001_census_df = c2001_census_df.dropna(axis=1, how='all')
c2001_census_df

Unnamed: 0,MSOA01CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,All people aged 16-74,Full-time students: Age 18 to 74: Economically active: In employment,Full-time students: Age 18 to 74: Economically active: Unemployed,Full-time students: Age 18 to 74: Economically inactive,Highest level of qualification: Level 1 qualifications,...,All categories: Religion,Buddhist,Christian,Hindu,Jewish,Muslim,No religion,Other religion,Religion not stated,Sikh
0,E02000001,E02000001,City of London 001,J01000055,London,6067.0,84.0,21.0,292.0,359.0,...,7185.0,42.0,3950.0,109.0,226.0,403.0,1767.0,52.0,617.0,19.0
1,E02000101,E02000101,Brent 009,J01000055,London,5334.0,135.0,26.0,262.0,622.0,...,7431.0,81.0,3435.0,1068.0,333.0,1203.0,536.0,78.0,646.0,51.0
2,E02000002,E02000002,Barking and Dagenham 001,J01000055,London,4122.0,39.0,8.0,55.0,859.0,...,6237.0,21.0,4411.0,71.0,35.0,173.0,964.0,9.0,516.0,37.0
3,E02000003,E02000003,Barking and Dagenham 002,J01000055,London,6593.0,99.0,17.0,100.0,1381.0,...,9190.0,17.0,6602.0,243.0,66.0,280.0,1223.0,20.0,566.0,173.0
4,E02000102,E02000102,Brent 010,J01000055,London,5555.0,204.0,43.0,272.0,749.0,...,7653.0,92.0,3716.0,1467.0,40.0,1268.0,431.0,50.0,576.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3271,W02000410,W02000410,Cardiff 044,J01000020,Cardiff,4858.0,103.0,15.0,221.0,687.0,...,7052.0,23.0,3720.0,374.0,5.0,1159.0,1120.0,25.0,576.0,50.0
3272,W02000411,W02000411,Cardiff 045,J01000020,Cardiff,3998.0,48.0,4.0,59.0,663.0,...,5904.0,3.0,4050.0,15.0,0.0,89.0,1113.0,12.0,612.0,10.0
3273,W02000412,W02000412,Cardiff 046,J01000020,Cardiff,4133.0,62.0,14.0,83.0,671.0,...,5853.0,28.0,3657.0,111.0,0.0,558.0,957.0,17.0,479.0,46.0
3274,W02000413,W02000422,Cardiff 048,J01000020,Cardiff,4461.0,61.0,10.0,147.0,499.0,...,5949.0,18.0,3116.0,52.0,7.0,928.0,1122.0,20.0,657.0,29.0


In [76]:
housing2001_df = pd.read_excel(r'.\data\raw\ons\hpssamedianpricebymsoa.xlsx', sheet_name='1a', header=2)
housing2001_df = housing2001_df[['MSOA code', 'Year ending Dec 2001']]
housing2001_df = housing2001_df.rename(columns={'Year ending Dec 2001': 'Median house price'})
housing2001_df

c2001_housing_df = c2001_census_df.copy()
c2001_housing_df = c2001_housing_df.merge(housing2001_df, how='left', left_on='MSOA01CD', right_on='MSOA code')
c2001_housing_df = c2001_housing_df.drop(columns='MSOA code')
c2001_housing_df

Unnamed: 0,MSOA01CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,All people aged 16-74,Full-time students: Age 18 to 74: Economically active: In employment,Full-time students: Age 18 to 74: Economically active: Unemployed,Full-time students: Age 18 to 74: Economically inactive,Highest level of qualification: Level 1 qualifications,...,Buddhist,Christian,Hindu,Jewish,Muslim,No religion,Other religion,Religion not stated,Sikh,Median house price
0,E02000001,E02000001,City of London 001,J01000055,London,6067.0,84.0,21.0,292.0,359.0,...,42.0,3950.0,109.0,226.0,403.0,1767.0,52.0,617.0,19.0,237500
1,E02000101,E02000101,Brent 009,J01000055,London,5334.0,135.0,26.0,262.0,622.0,...,81.0,3435.0,1068.0,333.0,1203.0,536.0,78.0,646.0,51.0,130000
2,E02000002,E02000002,Barking and Dagenham 001,J01000055,London,4122.0,39.0,8.0,55.0,859.0,...,21.0,4411.0,71.0,35.0,173.0,964.0,9.0,516.0,37.0,95000
3,E02000003,E02000003,Barking and Dagenham 002,J01000055,London,6593.0,99.0,17.0,100.0,1381.0,...,17.0,6602.0,243.0,66.0,280.0,1223.0,20.0,566.0,173.0,115000
4,E02000102,E02000102,Brent 010,J01000055,London,5555.0,204.0,43.0,272.0,749.0,...,92.0,3716.0,1467.0,40.0,1268.0,431.0,50.0,576.0,13.0,115500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3271,W02000410,W02000410,Cardiff 044,J01000020,Cardiff,4858.0,103.0,15.0,221.0,687.0,...,23.0,3720.0,374.0,5.0,1159.0,1120.0,25.0,576.0,50.0,65000
3272,W02000411,W02000411,Cardiff 045,J01000020,Cardiff,3998.0,48.0,4.0,59.0,663.0,...,3.0,4050.0,15.0,0.0,89.0,1113.0,12.0,612.0,10.0,45000
3273,W02000412,W02000412,Cardiff 046,J01000020,Cardiff,4133.0,62.0,14.0,83.0,671.0,...,28.0,3657.0,111.0,0.0,558.0,957.0,17.0,479.0,46.0,71000
3274,W02000413,W02000422,Cardiff 048,J01000020,Cardiff,4461.0,61.0,10.0,147.0,499.0,...,18.0,3116.0,52.0,7.0,928.0,1122.0,20.0,657.0,29.0,


In [79]:
c2001_df = c2001_housing_df.copy()
c2001_df = c2001_df.dropna(axis=1, how='all')
c2001_df.to_csv(r'.\data\clean\c2001.csv', index=False)

In [82]:
base2021_df = pd.read_csv(r'.\data\raw\geoportal\MSOA_(2011)_to_MSOA_(2021)_to_Local_Authority_District_(2022)_Lookup_for_England_and_Wales_(Version_2).csv')
base2021_df = base2021_df[['MSOA21CD', 'MSOA11CD']]
base2021_df = base2021_df.merge(base2011_df, on='MSOA11CD')
base2021_df

Unnamed: 0,MSOA21CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM
0,E02000308,E02000308,Enfield 032,J01000055,London
1,E02000309,E02000309,Enfield 033,J01000055,London
2,E02000311,E02000311,Enfield 035,J01000055,London
3,E02000312,E02000312,Enfield 036,J01000055,London
4,E02000313,E02000313,Greenwich 001,J01000055,London
...,...,...,...,...,...
3255,W02000193,W02000193,Swansea 026,J01000098,Swansea
3256,W02000194,W02000194,Swansea 027,J01000098,Swansea
3257,W02000196,W02000196,Swansea 029,J01000098,Swansea
3258,W02000198,W02000198,Swansea 031,J01000098,Swansea


In [91]:
metadata_2021 = pd.read_csv(r'.\data\raw\api\msoa\c2021\metadata.csv')
metadata_2021['clean_mnemonic'] = metadata_2021['mnemonic'].str.extract('(c2021ts.*)')
metadata_2021 = metadata_2021[metadata_2021['geoglevel'].str.contains('msoa')]
metadata_2021_mnemonics = list(set(metadata_2021['clean_mnemonic']))
len(metadata_2021_mnemonics)

68

In [92]:
c2021_dfs = []

for mnemonic in metadata_2021_mnemonics:

    root_dir = r'.\data\raw\api\msoa\c2021'
    mnemonic_tables = glob.glob(mnemonic+'*', root_dir=root_dir)
    mnemonic_dfs = []

    for table in mnemonic_tables:
        df = pd.read_csv(rf"{root_dir}\{table}")
        mnemonic_dfs.append(df)

    mnemonic_df = pd.concat(mnemonic_dfs)

    mnemonic_df = mnemonic_df.drop(columns=['Unnamed: 0', 'GEOGRAPHY_TYPE', 'GEOGRAPHY_TYPECODE', 'DATE'])
    mnemonic_df['OBS_VALUE'] = mnemonic_df['OBS_VALUE'].astype(float)
    mnemonic_df = mnemonic_df[mnemonic_df['OBS_VALUE'] > 0]
    cols = list(mnemonic_df.columns)
    var_col = [col for col in cols if col not in ['GEOGRAPHY_CODE', 'OBS_VALUE']][0]
    mnemonic_df = mnemonic_df.pivot_table(index='GEOGRAPHY_CODE', columns=var_col, values='OBS_VALUE', aggfunc='mean', fill_value=0)
    mnemonic_df = mnemonic_df.reset_index(drop=False)
    c2021_dfs.append(mnemonic_df)
c2021_dfs[0]

C2021_AGE_19_NAME,GEOGRAPHY_CODE,Aged 10 to 14 years,Aged 15 to 19 years,Aged 20 to 24 years,Aged 25 to 29 years,Aged 30 to 34 years,Aged 35 to 39 years,Aged 4 years and under,Aged 40 to 44 years,Aged 45 to 49 years,Aged 5 to 9 years,Aged 50 to 54 years,Aged 55 to 59 years,Aged 60 to 64 years,Aged 65 to 69 years,Aged 70 to 74 years,Aged 75 to 79 years,Aged 80 to 84 years,Aged 85 years and over,Total
0,E02000001,174.0,216.0,965.0,1213.0,1002.0,726.0,213.0,547.0,544.0,159.0,640.0,517.0,458.0,393.0,318.0,210.0,164.0,121.0,8580.0
1,E02000002,768.0,618.0,410.0,518.0,667.0,651.0,653.0,590.0,540.0,751.0,463.0,411.0,317.0,234.0,195.0,193.0,149.0,158.0,8286.0
2,E02000003,772.0,714.0,737.0,865.0,984.0,956.0,951.0,848.0,685.0,837.0,724.0,648.0,547.0,359.0,310.0,220.0,217.0,165.0,11539.0
3,E02000004,401.0,391.0,424.0,491.0,515.0,480.0,394.0,433.0,416.0,443.0,455.0,403.0,393.0,260.0,230.0,184.0,152.0,173.0,6638.0
4,E02000005,983.0,734.0,621.0,690.0,863.0,1012.0,975.0,938.0,741.0,977.0,684.0,603.0,378.0,265.0,225.0,153.0,134.0,106.0,11082.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7259,W02000424,311.0,324.0,281.0,289.0,314.0,324.0,242.0,335.0,443.0,285.0,557.0,508.0,492.0,480.0,546.0,430.0,265.0,257.0,6683.0
7260,W02000425,432.0,385.0,457.0,554.0,663.0,587.0,274.0,574.0,664.0,373.0,679.0,656.0,475.0,452.0,418.0,339.0,180.0,184.0,8346.0
7261,W02000426,572.0,488.0,467.0,482.0,492.0,549.0,450.0,607.0,613.0,575.0,677.0,726.0,639.0,636.0,661.0,544.0,382.0,333.0,9893.0
7262,W02000427,717.0,604.0,587.0,709.0,595.0,636.0,507.0,662.0,691.0,633.0,820.0,791.0,779.0,680.0,686.0,464.0,298.0,285.0,11144.0


In [94]:
c2021_census_df = base2021_df.copy()
for df in c2021_dfs:
    c2021_census_df = c2021_census_df.merge(df, how='left', left_on='MSOA21CD', right_on='GEOGRAPHY_CODE')
    c2021_census_df_cols = list(c2021_census_df.columns)
    c2021_census_df_dup_cols = [col for col in c2021_census_df_cols if '_x' in col or '_y' in col]
    c2021_census_df_dup_cols.append('GEOGRAPHY_CODE')
    c2021_census_df = c2021_census_df.drop(columns=c2021_census_df_dup_cols)
c2021_census_df = c2021_census_df.dropna(axis=1, how='all')
c2021_census_df

Unnamed: 0,MSOA21CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,1 bedroom,2 bedrooms,3 bedrooms,4 or more bedrooms,Family member or partner of staff or owner,...,1 person disabled under the Equality Act in household,2 or more people disabled under the Equality Act in household,No people disabled under the Equality Act in household,Total: All households,Usual residents per square kilometre,Address one year ago is student term-time or boarding school address in the UK,Address one year ago is the same as the address of enumeration,Migrant from outside the UK: Address one year ago was outside the UK,Migrant from within the UK: Address one year ago was in the UK,Total: All usual residents
0,E02000308,E02000308,Enfield 032,J01000055,London,488.0,832.0,1230.0,676.0,0.0,...,715.0,205.0,2306.0,3226.0,7634.4,11.0,7620.0,121.0,908.0,8660.0
1,E02000309,E02000309,Enfield 033,J01000055,London,556.0,1498.0,1464.0,470.0,0.0,...,892.0,246.0,2850.0,3988.0,7239.6,16.0,10703.0,139.0,795.0,11653.0
2,E02000311,E02000311,Enfield 035,J01000055,London,530.0,827.0,1670.0,608.0,0.0,...,816.0,246.0,2573.0,3635.0,7441.1,47.0,9397.0,178.0,817.0,10439.0
3,E02000312,E02000312,Enfield 036,J01000055,London,1145.0,1298.0,1159.0,894.0,0.0,...,924.0,197.0,3374.0,4495.0,8626.3,20.0,9764.0,161.0,1205.0,11150.0
4,E02000313,E02000313,Greenwich 001,J01000055,London,818.0,592.0,917.0,269.0,0.0,...,635.0,161.0,1800.0,2596.0,8880.5,26.0,5960.0,49.0,360.0,6395.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,W02000193,W02000193,Swansea 026,J01000098,Swansea,853.0,817.0,781.0,1084.0,0.0,...,1052.0,235.0,2251.0,3538.0,8393.9,1448.0,4930.0,232.0,2171.0,8781.0
3256,W02000194,W02000194,Swansea 027,J01000098,Swansea,154.0,368.0,854.0,1045.0,0.0,...,661.0,151.0,1609.0,2421.0,1668.1,97.0,5000.0,165.0,1240.0,6502.0
3257,W02000196,W02000196,Swansea 029,J01000098,Swansea,272.0,761.0,1278.0,698.0,0.0,...,931.0,213.0,1865.0,3009.0,2529.1,23.0,5742.0,18.0,463.0,6246.0
3258,W02000198,W02000198,Swansea 031,J01000098,Swansea,196.0,755.0,1135.0,1299.0,0.0,...,866.0,170.0,2349.0,3385.0,1644.9,39.0,6496.0,46.0,632.0,7213.0


In [95]:
income2021_df = pd.read_excel(r'.\data\raw\ons\saiefy1920finalqaddownload280923.xlsx', sheet_name='Total annual income', header=4)
income2021_df = income2021_df[['MSOA code', 'Total annual income (£)']]
income2021_df = income2021_df.dropna(axis=0, how='any')
income2021_df

c2021_income_df = c2021_census_df.copy()
c2021_income_df = c2021_income_df.merge(income2021_df, how='left', left_on='MSOA21CD', right_on='MSOA code')
c2021_income_df = c2021_income_df.drop(columns='MSOA code')
c2021_income_df

Unnamed: 0,MSOA21CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,1 bedroom,2 bedrooms,3 bedrooms,4 or more bedrooms,Family member or partner of staff or owner,...,2 or more people disabled under the Equality Act in household,No people disabled under the Equality Act in household,Total: All households,Usual residents per square kilometre,Address one year ago is student term-time or boarding school address in the UK,Address one year ago is the same as the address of enumeration,Migrant from outside the UK: Address one year ago was outside the UK,Migrant from within the UK: Address one year ago was in the UK,Total: All usual residents,Total annual income (£)
0,E02000308,E02000308,Enfield 032,J01000055,London,488.0,832.0,1230.0,676.0,0.0,...,205.0,2306.0,3226.0,7634.4,11.0,7620.0,121.0,908.0,8660.0,57000.0
1,E02000309,E02000309,Enfield 033,J01000055,London,556.0,1498.0,1464.0,470.0,0.0,...,246.0,2850.0,3988.0,7239.6,16.0,10703.0,139.0,795.0,11653.0,42500.0
2,E02000311,E02000311,Enfield 035,J01000055,London,530.0,827.0,1670.0,608.0,0.0,...,246.0,2573.0,3635.0,7441.1,47.0,9397.0,178.0,817.0,10439.0,52800.0
3,E02000312,E02000312,Enfield 036,J01000055,London,1145.0,1298.0,1159.0,894.0,0.0,...,197.0,3374.0,4495.0,8626.3,20.0,9764.0,161.0,1205.0,11150.0,59400.0
4,E02000313,E02000313,Greenwich 001,J01000055,London,818.0,592.0,917.0,269.0,0.0,...,161.0,1800.0,2596.0,8880.5,26.0,5960.0,49.0,360.0,6395.0,41400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,W02000193,W02000193,Swansea 026,J01000098,Swansea,853.0,817.0,781.0,1084.0,0.0,...,235.0,2251.0,3538.0,8393.9,1448.0,4930.0,232.0,2171.0,8781.0,31900.0
3256,W02000194,W02000194,Swansea 027,J01000098,Swansea,154.0,368.0,854.0,1045.0,0.0,...,151.0,1609.0,2421.0,1668.1,97.0,5000.0,165.0,1240.0,6502.0,47600.0
3257,W02000196,W02000196,Swansea 029,J01000098,Swansea,272.0,761.0,1278.0,698.0,0.0,...,213.0,1865.0,3009.0,2529.1,23.0,5742.0,18.0,463.0,6246.0,39200.0
3258,W02000198,W02000198,Swansea 031,J01000098,Swansea,196.0,755.0,1135.0,1299.0,0.0,...,170.0,2349.0,3385.0,1644.9,39.0,6496.0,46.0,632.0,7213.0,47800.0


In [96]:
housing2021_df = pd.read_excel(r'.\data\raw\ons\hpssamedianpricebymsoa.xlsx', sheet_name='1a', header=2)
housing2021_df = housing2021_df[['MSOA code', 'Year ending Dec 2021']]
housing2021_df = housing2021_df.rename(columns={'Year ending Dec 2021': 'Median house price'})
housing2021_df

c2021_housing_df = c2021_census_df.copy()
c2021_housing_df = c2021_housing_df.merge(housing2021_df, how='left', left_on='MSOA21CD', right_on='MSOA code')
c2021_housing_df = c2021_housing_df.drop(columns='MSOA code')
c2021_housing_df

Unnamed: 0,MSOA21CD,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,1 bedroom,2 bedrooms,3 bedrooms,4 or more bedrooms,Family member or partner of staff or owner,...,2 or more people disabled under the Equality Act in household,No people disabled under the Equality Act in household,Total: All households,Usual residents per square kilometre,Address one year ago is student term-time or boarding school address in the UK,Address one year ago is the same as the address of enumeration,Migrant from outside the UK: Address one year ago was outside the UK,Migrant from within the UK: Address one year ago was in the UK,Total: All usual residents,Median house price
0,E02000308,E02000308,Enfield 032,J01000055,London,488.0,832.0,1230.0,676.0,0.0,...,205.0,2306.0,3226.0,7634.4,11.0,7620.0,121.0,908.0,8660.0,468500.0
1,E02000309,E02000309,Enfield 033,J01000055,London,556.0,1498.0,1464.0,470.0,0.0,...,246.0,2850.0,3988.0,7239.6,16.0,10703.0,139.0,795.0,11653.0,382500.0
2,E02000311,E02000311,Enfield 035,J01000055,London,530.0,827.0,1670.0,608.0,0.0,...,246.0,2573.0,3635.0,7441.1,47.0,9397.0,178.0,817.0,10439.0,469975.0
3,E02000312,E02000312,Enfield 036,J01000055,London,1145.0,1298.0,1159.0,894.0,0.0,...,197.0,3374.0,4495.0,8626.3,20.0,9764.0,161.0,1205.0,11150.0,577500.0
4,E02000313,E02000313,Greenwich 001,J01000055,London,818.0,592.0,917.0,269.0,0.0,...,161.0,1800.0,2596.0,8880.5,26.0,5960.0,49.0,360.0,6395.0,277500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,W02000193,W02000193,Swansea 026,J01000098,Swansea,853.0,817.0,781.0,1084.0,0.0,...,235.0,2251.0,3538.0,8393.9,1448.0,4930.0,232.0,2171.0,8781.0,157750.0
3256,W02000194,W02000194,Swansea 027,J01000098,Swansea,154.0,368.0,854.0,1045.0,0.0,...,151.0,1609.0,2421.0,1668.1,97.0,5000.0,165.0,1240.0,6502.0,324000.0
3257,W02000196,W02000196,Swansea 029,J01000098,Swansea,272.0,761.0,1278.0,698.0,0.0,...,213.0,1865.0,3009.0,2529.1,23.0,5742.0,18.0,463.0,6246.0,266000.0
3258,W02000198,W02000198,Swansea 031,J01000098,Swansea,196.0,755.0,1135.0,1299.0,0.0,...,170.0,2349.0,3385.0,1644.9,39.0,6496.0,46.0,632.0,7213.0,362500.0


In [97]:
c2021_df = c2021_housing_df.copy()
c2021_df = c2021_df.dropna(axis=1, how='all')
c2021_df.to_csv(r'.\data\clean\c2021.csv', index=False)