In [14]:
import pandas as pd
import glob


Create 2011 gentrification prediction dataset:

1. Get list of MSOAs for each city as base table
2. Merge census variables to base table
3. On a copy of the base table from 1, label gentrification using 2011 & 2021 data

Freeman gentrification labeling method:

a census tract is (1) marked eligible if (a) housing construction is below a metropolitan median, and (b) income is below the median, and (c) the tract is located in a central city. It is then (2) marked as gentrified if, after the measurement period, (a) there is a greater increase in educational attainment compared to the median and (b) an increase in real housing prices


Data sources:

2011 construction:
no direct numbers at the lsoa or msoa level, only regional & country stats
can use c2011ks401ew & c2011ks401uk, which contains number of household dwellings, subtract 2001 numbers, to get new dwellings as a proxy

2011 income:
.\data\raw\govuk\1smallareaincomeestimatesdatatcm77420299.xls

2011 cities:
.\data\raw\geoportal\Middle_Layer_Super_Output_Area_(2011)_to_Major_Towns_and_Cities_(December_2015)_Lookup_in_England_and_Wales.csv

2011 educational attainment:
c2011ks501ew

2021 educational attainment:
c2021ts067

2011 house prices:
data\raw\ons\hpssamedianpricebymsoa.xlsx

2021 house prices:
data\raw\ons\hpssamedianpricebymsoa.xlsx

In [2]:
base_df = pd.read_csv(r'.\data\raw\geoportal\Middle_Layer_Super_Output_Area_(2011)_to_Major_Towns_and_Cities_(December_2015)_Lookup_in_England_and_Wales.csv')
base_df

Unnamed: 0,MSOA11CD,MSOA11NM,TCITY15CD,TCITY15NM,FID
0,E02000053,Barnet 030,J01000055,London,1
1,E02000209,Croydon 016,J01000055,London,2
2,E02000054,Barnet 031,J01000055,London,3
3,E02000055,Barnet 032,J01000055,London,4
4,E02000056,Barnet 033,J01000055,London,5
...,...,...,...,...,...
7196,E02003999,Carlisle 013,J01000021,Carlisle,7197
7197,E02005068,Maidstone 001,J01000022,Chatham,7198
7198,E02004497,Chelmsford 013,J01000023,Chelmsford,7199
7199,E02004493,Chelmsford 009,J01000023,Chelmsford,7200


In [29]:
metadata_2011 = pd.read_csv(r'.\data\raw\api\msoa\c2011\metadata.csv')
metadata_2011['clean_mnemonic'] = metadata_2011['mnemonic'].str.extract('(c2011ks\d*).*')
metadata_2011_mnemonics = list(set(metadata_2011['clean_mnemonic']))
len(metadata_2011_mnemonics)

35

In [60]:
c2011_dfs = []

for mnemonic in metadata_2011_mnemonics:

    root_dir = r'.\data\raw\api\msoa\c2011'
    mnemonic_tables = glob.glob(mnemonic+'*', root_dir=root_dir)
    mnemonic_dfs = []

    for table in mnemonic_tables:
        df = pd.read_csv(rf"{root_dir}\{table}")
        mnemonic_dfs.append(df)

    mnemonic_df = pd.concat(mnemonic_dfs)

    mnemonic_df = mnemonic_df.drop(columns=['Unnamed: 0', 'GEOGRAPHY_TYPE', 'GEOGRAPHY_TYPECODE', 'DATE'])
    mnemonic_df['OBS_VALUE'] = mnemonic_df['OBS_VALUE'].astype(float)
    mnemonic_df = mnemonic_df[mnemonic_df['OBS_VALUE'] > 0]
    cols = list(mnemonic_df.columns)
    var_col = [col for col in cols if col not in ['GEOGRAPHY_CODE', 'OBS_VALUE']][0]
    mnemonic_df = mnemonic_df.pivot_table(index='GEOGRAPHY_CODE', columns=var_col, values='OBS_VALUE', aggfunc='mean', fill_value=0)
    mnemonic_df = mnemonic_df.reset_index(drop=False)
    c2011_dfs.append(mnemonic_df)
c2011_dfs[0]

CELL_NAME,GEOGRAPHY_CODE,All households,Living rent free,Owned,Owned: Owned outright,Owned: Owned with a mortgage or loan,Private rented,Private rented: Other,Private rented: Private landlord or letting agency,Shared ownership (part owned and part rented),Social rented,Social rented: Other,Social rented: Rented from council (Local Authority)
0,E02000001,4385.0,219.0,1855.0,1093.0,762.0,1573.0,122.0,1451.0,13.0,725.0,267.0,458.0
1,E02000002,2713.0,34.0,1259.0,596.0,663.0,269.0,14.0,255.0,18.0,1133.0,239.0,894.0
2,E02000003,3834.0,23.0,2501.0,1028.0,1473.0,830.0,42.0,788.0,34.0,446.0,87.0,359.0
3,E02000004,2318.0,16.0,1687.0,718.0,969.0,228.0,15.0,213.0,16.0,371.0,103.0,268.0
4,E02000005,3183.0,33.0,1857.0,711.0,1146.0,482.0,26.0,456.0,18.0,793.0,91.0,702.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7196,W02000419,4653.0,58.0,3531.0,1796.0,1735.0,588.0,54.0,534.0,56.0,420.0,104.0,316.0
7197,W02000420,5577.0,104.0,4574.0,2365.0,2209.0,530.0,58.0,472.0,15.0,354.0,27.0,327.0
7198,W02000421,5027.0,128.0,3877.0,2496.0,1381.0,780.0,91.0,689.0,13.0,229.0,157.0,72.0
7199,W02000422,3654.0,29.0,1172.0,282.0,890.0,2185.0,99.0,2086.0,54.0,214.0,193.0,21.0


In [57]:
c2011_dfs[0][c2011_dfs[0]['GEOGRAPHY_CODE'] == 'E02000053']

CELL_NAME,GEOGRAPHY_CODE,All households,Living rent free,Owned,Owned: Owned outright,Owned: Owned with a mortgage or loan,Private rented,Private rented: Other,Private rented: Private landlord or letting agency,Shared ownership (part owned and part rented),Social rented,Social rented: Other,Social rented: Rented from council (Local Authority)
51,E02000053,3244.0,43.0,1399.0,615.0,784.0,1206.0,55.0,1151.0,94.0,502.0,307.0,195.0


In [72]:
c2011_df = base_df.copy()
for df in c2011_dfs:
    c2011_df = c2011_df.merge(df, how='left', left_on='MSOA11CD', right_on='GEOGRAPHY_CODE')
    c2011_df_cols = list(c2011_df.columns)
    c2011_df_dup_cols = [col for col in c2011_df_cols if '_x' in col or '_y' in col]
    c2011_df_dup_cols.append('GEOGRAPHY_CODE')
    c2011_df = c2011_df.drop(columns=c2011_df_dup_cols)
c2011_df = c2011_df.dropna(axis=1, how='all')
c2011_df.to_csv(r'.\data\clean\c2011.csv')

In [75]:
len(list(c2011_df.columns))

381