In [1]:
import os
import pandas as pd
import geopandas as gpd

In [2]:
# Maps each district name used downstream (key) to the 2011 census geography
# names (values) whose rows are relabelled to that key and summed together by
# update_census_districts. Every listed name must be a non-empty string that
# matches the census 'geography' column exactly.
district_mapping_2011_census = {
    "Hackney and City of London": ["Hackney", "City of London"],
    "Cornwall and Isles of Scilly": ["Cornwall", "Isles of Scilly"],
    "Bournemouth, Christchurch and Poole": ["Bournemouth", "Christchurch", "Poole"],
    "Dorset": ["Weymouth and Portland", "West Dorset", "North Dorset", "Purbeck", "East Dorset"],
    "East Suffolk": ["Suffolk Coastal", "Waveney"],
    "West Suffolk": ["Forest Heath", "St Edmundsbury"],
    "Somerset West and Taunton": ["West Somerset", "Taunton Deane"],
    "Antrim and Newtownabbey": ["Antrim", "Newtownabbey"],
    "Armagh City, Banbridge and Craigavon": ["Armagh", "Banbridge", "Craigavon"],
    "Causeway Coast and Glens": ["Ballymoney", "Coleraine", "Limavady", "Moyle"],
    "Derry City and Strabane": ["Derry", "Strabane"],
    "Fermanagh and Omagh": ["Fermanagh", "Omagh"],
    "Lisburn and Castlereagh": ["Lisburn", "Castlereagh"],
    "Mid and East Antrim": ["Ballymena", "Larne", "Carrickfergus"],
    # The third predecessor district was previously an empty string, which
    # left Magherafelt's rows out of the Mid Ulster totals.
    "Mid Ulster": ["Cookstown", "Dungannon", "Magherafelt"],
    "Newry, Mourne and Down": ["Newry and Mourne", "Down"],
    "Ards and North Down": ["Ards", "North Down"],
    "Rhondda Cynon Taf": ["Rhondda Cynon Taff"]
}


def update_census_districts(df, mapping=None):
    """Relabel 2011 census rows with their merged district name and sum them.

    Parameters
    ----------
    df : pandas.DataFrame
        A census table with 'date', 'geography' and 'geography code' columns
        plus the value columns to aggregate. The input is not modified.
    mapping : dict[str, list[str]], optional
        ``{new district name: [old geography names]}``. Defaults to the
        module-level ``district_mapping_2011_census``.

    Returns
    -------
    pandas.DataFrame
        One row per 'geography' after relabelling, holding the sum of every
        numeric column. 'date' and 'geography code' are dropped first.
    """
    if mapping is None:
        mapping = district_mapping_2011_census

    # Invert {new: [old, ...]} into {old: new} so the column is relabelled in
    # one pass. Blank old names are ignored: they cannot identify a district.
    new_name_of = {
        old_district: new_district
        for new_district, old_districts in mapping.items()
        for old_district in old_districts
        if old_district
    }

    df = df.copy()
    df['geography'] = df['geography'].map(lambda name: new_name_of.get(name, name))
    df = df.drop(['date', 'geography code'], axis=1)
    # numeric_only=True keeps text columns from being concatenated row by row
    # (the default behaviour of a grouped sum on pandas >= 2).
    return df.groupby('geography').sum(numeric_only=True).reset_index()

In [None]:
# 2011 census age bands that count as "young" (ages 0 to 19).
young_variables = ["Age: Age 0 to 4; Rural Urban: Total; measures: Value",
                   "Age: Age 5 to 7; Rural Urban: Total; measures: Value",
                   "Age: Age 8 to 9; Rural Urban: Total; measures: Value",
                   "Age: Age 10 to 14; Rural Urban: Total; measures: Value",
                   "Age: Age 15; Rural Urban: Total; measures: Value",
                   "Age: Age 16 to 17; Rural Urban: Total; measures: Value",
                   "Age: Age 18 to 19; Rural Urban: Total; measures: Value"
                   ]
# 2011 census age bands that count as "old" (ages 65 and over).
old_variables = ["Age: Age 65 to 74; Rural Urban: Total; measures: Value",
                 "Age: Age 75 to 84; Rural Urban: Total; measures: Value",
                 "Age: Age 85 to 89; Rural Urban: Total; measures: Value",
                 "Age: Age 90 and over; Rural Urban: Total; measures: Value"]

# Load the age table, fold it into the merged districts, then express each age
# group as a share of all usual residents (a plain ratio, not scaled by 100).
age_2011 = (
    update_census_districts(pd.read_csv(os.path.join("census", "2011", "Age (UK).csv")))
    .assign(
        young_percent=lambda counts: counts[young_variables].sum(axis=1).div(
            counts['Age: All usual residents; Rural Urban: Total; measures: Value']),
        old_percent=lambda counts: counts[old_variables].sum(axis=1).div(
            counts['Age: All usual residents; Rural Urban: Total; measures: Value']),
    )
    .loc[:, ['geography', 'young_percent', 'old_percent']]
)
age_2011.head()

In [None]:
# Males per female in each district, from the 2011 census counts summed over
# the merged districts.
sex_2011 = (
    update_census_districts(pd.read_csv(os.path.join("census", "2011", "Sex (UK).csv")))
    .assign(
        male_female_ratio=lambda counts: counts["Sex: Males; measures: Value"].div(
            counts["Sex: Females; measures: Value"]),
    )
    .loc[:, ['geography', 'male_female_ratio']]
)
sex_2011.head()

In [None]:
# Resident count and residents per hectare in each district. The density is
# derived after summing residents and area over the merged districts.
population_density_2011 = (
    update_census_districts(
        pd.read_csv(os.path.join("census", "2011", "Population density (UK).csv"))
    )
    .assign(
        population=lambda totals: totals['Area/Population Density: All usual residents; measures: Value'],
        population_density=lambda totals: totals['Area/Population Density: All usual residents; measures: Value'].div(
            totals['Area/Population Density: Area Hectares; measures: Value']),
    )
    .loc[:, ['geography', 'population', 'population_density']]
)
population_density_2011.head()

In [None]:
# Shares of the Black and White ethnic groups among all residents counted in
# the ethnicity table (plain ratios, not scaled by 100).
ethnicity_2011 = (
    update_census_districts(pd.read_csv(os.path.join("census", "2011", "Ethnicity (UK).csv")))
    .assign(
        black_population_percent=lambda counts: counts['Ethnic Group: Black / African / Caribbean / Black British; measures: Value'].div(
            counts['Ethnic Group: All categories: Ethnic group; measures: Value']),
        white_population_percent=lambda counts: counts['Ethnic Group: White; measures: Value'].div(
            counts['Ethnic Group: All categories: Ethnic group; measures: Value']),
    )
    .loc[:, ['geography', 'black_population_percent', 'white_population_percent']]
)
ethnicity_2011.head()

In [None]:
# Share of people with no qualifications among everyone counted in the
# qualification table (a plain ratio, not scaled by 100).
education_2011 = (
    update_census_districts(
        pd.read_csv(os.path.join("census", "2011", "Education level (UK).csv"))
    )
    .assign(
        low_education_percent=lambda counts: counts['Qualification: No qualifications; measures: Value'].div(
            counts['Qualification: All categories: Highest level of qualification; measures: Value']),
    )
    .loc[:, ['geography', 'low_education_percent']]
)
education_2011.head()

In [None]:
# District list: keep only the id and name columns of the boundary file.
ltlas_gdf = gpd.GeoDataFrame.from_file(
    os.path.join("gis", 'lad19.geojson')
)[['district_id', 'district_name']]

# Distinct ids and names, in order of first appearance.
district_ids = ltlas_gdf['district_id'].unique()
districts = ltlas_gdf['district_name'].unique()
ltlas_gdf.head()

In [None]:
# Start from the district list and attach each 2011 indicator table with a left
# join, so every district keeps its row even where a table has no match.
census_df = pd.merge(ltlas_gdf, age_2011, left_on="district_name", right_on="geography", how="left")
for indicators_2011 in (sex_2011, population_density_2011, ethnicity_2011, education_2011):
    census_df = pd.merge(census_df, indicators_2011, on="geography", how="left")

# Order by district, renumber the rows and drop the join key.
census_df = (
    census_df
    .sort_values(['district_name', 'district_id'])
    .reset_index(drop=True)
    .drop(columns='geography')
)
census_df.head()

In [10]:
# Maps a district name to the area name the 2021 census tables use for it.
# Written grouped by 2021 area for readability, then flattened to
# {district name: 2021 area name} with keys in alphabetical order.
district_mapping_2021_census = dict(sorted(
    (district_name, census_area_name)
    for census_area_name, district_names in {
        "Buckinghamshire": ["Aylesbury Vale", "Chiltern", "South Bucks", "Wycombe"],
        "North Northamptonshire": ["Corby", "East Northamptonshire", "Kettering", "Wellingborough"],
        "West Northamptonshire": ["Daventry", "Northampton", "South Northamptonshire"],
    }.items()
    for district_name in district_names
))


def update_census_data(census_df, new_data, attribute_name):
    """Overwrite one attribute of ``census_df`` with values from a 2021 census table.

    Only districts whose id starts with "E" or "W" are touched; all other
    rows keep their current value. ``census_df`` is modified in place and
    also returned.

    Parameters
    ----------
    census_df : pandas.DataFrame
        Needs 'district_id' and 'district_name' columns.
    new_data : pandas.DataFrame
        Needs an 'Area code [note 2]' column and the ``attribute_name``
        column; 'Area name' is read only for districts matched by name.
    attribute_name : str
        Column read from ``new_data`` and written to ``census_df``.

    Notes
    -----
    * "Hackney and City of London" and "Cornwall and Isles of Scilly" are a
      single row in ``census_df`` but two rows in ``new_data``. Their values
      are combined by ADDING the member rows, which is only meaningful for
      additive attributes such as counts. For ratios, shares or densities
      the caller has to recompute the value from the underlying counts.
    * A district whose id is missing from ``new_data`` is looked up by the
      name given in ``district_mapping_2021_census``; if that name is missing
      as well, the existing value is left untouched.

    Raises
    ------
    AssertionError
        If a district is missing from ``new_data`` by id and has no entry in
        ``district_mapping_2021_census``.
    """
    # census_df id that receives the value -> new_data area codes combined into it.
    merged_districts = {
        "E09000012": ["E09000001", "E09000012"],  # Hackney and City of London
        "E06000052": ["E06000052", "E06000053"],  # Cornwall and Isles of Scilly
    }
    receiving_id_of = {
        code: receiving_id
        for receiving_id, codes in merged_districts.items()
        for code in codes
    }
    area_codes = new_data['Area code [note 2]']

    for district_id in census_df['district_id'].unique():
        # for England and Wales
        if not district_id.startswith(("E", "W")):
            continue

        if district_id in receiving_id_of:
            receiving_id = receiving_id_of[district_id]
            member_values = new_data.loc[
                area_codes.isin(merged_districts[receiving_id]), attribute_name
            ].values
            # Without this guard, sum() over zero rows would write 0 over the
            # existing value when new_data has no row for the merged district.
            if len(member_values):
                census_df.loc[census_df['district_id'] == receiving_id,
                              attribute_name] = sum(member_values)
            continue

        target_rows = census_df['district_id'] == district_id
        matched_values = new_data.loc[area_codes == district_id, attribute_name].values
        if not len(matched_values):
            # No row with this id: fall back to the 2021 area name.
            district_name = census_df.loc[target_rows, 'district_name'].values[0]
            assert district_name in district_mapping_2021_census, (
                f"{district_name!r} ({district_id}) is missing from the new data "
                "and from district_mapping_2021_census"
            )
            new_district_name = district_mapping_2021_census[district_name]
            matched_values = new_data.loc[
                new_data['Area name'] == new_district_name, attribute_name
            ].values
        if len(matched_values):
            # The first matching row wins if new_data lists an area twice.
            census_df.loc[target_rows, attribute_name] = matched_values[0]
    return census_df

In [11]:
# 2021 census, sheet P01: resident counts by sex for England and Wales.
sex_2021 = pd.read_excel(os.path.join("census", "2021", "Population density, age, sex (England and Wales).xlsx"), sheet_name="P01", skiprows=6)
sex_2021['male_female_ratio'] = sex_2021['Males'] / sex_2021['Females']
sex_2021['population'] = sex_2021['All persons']
census_df = update_census_data(census_df, sex_2021, 'male_female_ratio')
census_df = update_census_data(census_df, sex_2021, 'population')

# update_census_data fills the two merged districts by adding up the values of
# their member areas. That is right for the population count but not for a
# ratio (it yields ratio A + ratio B), so recompute it from the pooled counts.
for merged_id, member_ids in {
    "E09000012": ["E09000001", "E09000012"],  # Hackney and City of London
    "E06000052": ["E06000052", "E06000053"],  # Cornwall and Isles of Scilly
}.items():
    members = sex_2021[sex_2021['Area code [note 2]'].isin(member_ids)]
    if len(members):
        census_df.loc[census_df['district_id'] == merged_id, 'male_female_ratio'] = (
            members['Males'].sum() / members['Females'].sum()
        )

In [12]:
# 2021 census five-year age bands that count as "young" (ages 0 to 19).
young_variables = ["Aged 4 years and under\n[note 12]",
                   "Aged 5 to 9 years\n[note 12]",
                   "Aged 10 to 14 years\n[note 12]",
                   "Aged 15 to 19 years\n[note 12]"]
# 2021 census five-year age bands that count as "old" (ages 65 and over).
old_variables = ["Aged 65 to 69 years\n[note 12]",
                 "Aged 70 to 74 years\n[note 12]",
                 "Aged 75 to 79 years\n[note 12]",
                 "Aged 80 to 84 years\n[note 12]",
                 "Aged 85 to 89 years\n[note 12]",
                 "Aged 90 years and over\n[note 12]"]
age_2021 = pd.read_excel(os.path.join("census", "2021", "Population density, age, sex (England and Wales).xlsx"), sheet_name="P02", skiprows=7)
age_2021["young_percent"] = age_2021[young_variables].sum(axis=1) / age_2021['All persons']
age_2021["old_percent"] = age_2021[old_variables].sum(axis=1) / age_2021['All persons']

census_df = update_census_data(census_df, age_2021, 'young_percent')
census_df = update_census_data(census_df, age_2021, 'old_percent')

# update_census_data fills the two merged districts by adding up the values of
# their member areas, which for a share yields share A + share B. Recompute
# both shares from the pooled counts instead.
for merged_id, member_ids in {
    "E09000012": ["E09000001", "E09000012"],  # Hackney and City of London
    "E06000052": ["E06000052", "E06000053"],  # Cornwall and Isles of Scilly
}.items():
    members = age_2021[age_2021['Area code [note 2]'].isin(member_ids)]
    if len(members):
        merged_rows = census_df['district_id'] == merged_id
        all_persons = members['All persons'].sum()
        census_df.loc[merged_rows, 'young_percent'] = members[young_variables].sum().sum() / all_persons
        census_df.loc[merged_rows, 'old_percent'] = members[old_variables].sum().sum() / all_persons

In [13]:
population_density_2021 = pd.read_excel(os.path.join("census", "2021", "Population density, age, sex (England and Wales).xlsx"), sheet_name="P04", skiprows=6)
# 1 square kilometre = 100 hectares, so residents per km2 / 100 = residents per hectare
population_density_2021['population_density'] = population_density_2021['Population density (number of usual residents per square kilometre) \n[note 13]'] / 100
census_df = update_census_data(census_df, population_density_2021, 'population_density')

# update_census_data fills the two merged districts by adding up the values of
# their member areas, which for a density yields density A + density B.
# Recompute it as pooled residents / pooled area. The area of each member is
# derived as residents / density, with resident counts taken from sex_2021
# (sheet P01, loaded in an earlier cell).
persons_by_code = sex_2021.drop_duplicates('Area code [note 2]').set_index('Area code [note 2]')['All persons']
density_by_code = population_density_2021.drop_duplicates('Area code [note 2]').set_index('Area code [note 2]')['population_density']
for merged_id, member_ids in {
    "E09000012": ["E09000001", "E09000012"],  # Hackney and City of London
    "E06000052": ["E06000052", "E06000053"],  # Cornwall and Isles of Scilly
}.items():
    # Only correct the value when both tables have every member area.
    if set(member_ids) <= set(persons_by_code.index) & set(density_by_code.index):
        member_persons = persons_by_code.loc[member_ids]
        member_hectares = member_persons / density_by_code.loc[member_ids]
        census_df.loc[census_df['district_id'] == merged_id, 'population_density'] = (
            member_persons.sum() / member_hectares.sum()
        )

In [14]:
# Sanity check: census_df must hold as many distinct district ids as the
# district list it was built from.
assert ltlas_gdf['district_id'].nunique(dropna=False) == census_df['district_id'].nunique(dropna=False)

In [15]:
# Write the combined table one level above the working directory. Floats are
# written with three decimals, missing values as the text "N/A", and the row
# index is left out.
census_df.to_csv(
    "../census.csv",
    index=False,
    na_rep="N/A",
    float_format="%.3f",
)