In [1]:
import os 
import pandas as pd
import re
import numpy as np

from sklearn import preprocessing
from sklearn import decomposition 

In [2]:
# Build the path with os.path.join so the notebook also runs on non-Windows
# machines (the backslash form only resolves as a relative path on Windows).
census_path = os.path.join('data', 'clean', 'normalized_all.csv')

try:
    census_df = pd.read_csv(census_path)
except FileNotFoundError:
    # The file is not reachable from the current working directory: step up one
    # level and retry. Only a missing file triggers the retry, so parse errors
    # and similar problems are no longer masked by a silent directory change.
    os.chdir('..')
    census_df = pd.read_csv(census_path)

# Keep only the columns that have a value in every row.
census_df = census_df.dropna(axis=1, how='any')
census_df

Unnamed: 0,MSOA Code,MSOA Name,City Code,City Name,Year,Age: 0 to 4,Age: 10 to 14,Age: 15 to 19,Age: 16 years and over,Age: 20 to 24,...,Unpaid Care: Provides no unpaid care,Vehicles: 1 car or van in household,Vehicles: 2 cars or vans in household,Vehicles: 3 or more cars or vans in household,Vehicles: No cars or vans in household,Welsh: Can speak Welsh,Welsh: Can speak and read but cannot write Welsh,Welsh: Can speak but cannot read or write Welsh,Welsh: No skills in Welsh,Welsh: Other combination of skills in Welsh
0,E02000001,City of London 001,J01000055,London,2001,250.0,186.0,206.0,6515.0,529.0,...,6587.0,1417.0,184.0,46.0,2691.0,0.0,0.0,0.0,0.0,0.0
1,E02000001,City of London 001,J01000055,London,2011,236.0,169.0,197.0,3159.0,545.0,...,6799.0,1100.0,173.0,69.0,3043.0,0.0,0.0,0.0,0.0,0.0
2,E02000001,City of London 001,J01000055,London,2021,213.0,174.0,216.0,8004.0,965.0,...,7876.0,954.0,123.0,43.0,3793.0,0.0,0.0,0.0,0.0,0.0
3,E02000002,Barking and Dagenham 001,J01000055,London,2001,489.0,423.0,382.0,4811.0,363.0,...,5624.0,1189.0,376.0,85.0,1084.0,0.0,0.0,0.0,0.0,0.0
4,E02000002,Barking and Dagenham 001,J01000055,London,2011,628.0,446.0,434.0,3026.0,424.0,...,6157.0,1186.0,424.0,83.0,1020.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9774,W02000422,Cardiff 048,J01000020,Cardiff,2011,294.0,66.0,110.0,2493.0,940.0,...,5747.0,2299.0,637.0,76.0,642.0,632.0,57.0,62.0,4857.0,107.0
9775,W02000422,Cardiff 048,J01000020,Cardiff,2021,365.0,133.0,124.0,6659.0,906.0,...,6684.0,2462.0,630.0,60.0,1063.0,746.0,31.0,68.0,6012.0,1050.0
9776,W02000423,Cardiff 049,J01000020,Cardiff,2001,384.0,334.0,330.0,4815.0,612.0,...,5504.0,1263.0,376.0,55.0,1263.0,0.0,39.0,95.0,4920.0,110.0
9777,W02000423,Cardiff 049,J01000020,Cardiff,2011,543.0,313.0,344.0,3378.0,967.0,...,7662.0,2006.0,515.0,69.0,1542.0,687.0,55.0,99.0,6792.0,186.0


In [3]:
# One frame per census year, selected from the combined table.
year_col = census_df['Year']
census_raw_01 = census_df.loc[year_col.eq(2001)]
census_raw_11 = census_df.loc[year_col.eq(2011)]
census_raw_21 = census_df.loc[year_col.eq(2021)]

# Report how many rows each census year contributes.
frames_by_year = ((2001, census_raw_01), (2011, census_raw_11), (2021, census_raw_21))
row_summary = ''.join(
    f'Num rows in {year}: {len(frame)} \n' for year, frame in frames_by_year
)
print(row_summary)

Num rows in 2001: 3259 
Num rows in 2011: 3260 
Num rows in 2021: 3260 



In [4]:
def _single_match(frame, regex):
    """Return, as a Series, the one column of `frame` whose name matches `regex`.

    Raises ValueError when zero or several columns match, so an ambiguous or
    missing source column is reported by name.
    """
    hits = frame.filter(regex=regex)
    if hits.shape[1] != 1:
        raise ValueError(
            f'Expected exactly one column matching {regex!r}, '
            f'found {hits.shape[1]}: {list(hits.columns)}'
        )
    return hits.iloc[:, 0]


# subset to the columns used for scoring
def extract(df, verbose=False):
    """Reduce a census frame to the four variables used for scoring.

    Parameters
    ----------
    df : pandas.DataFrame
        Census rows. Must contain 'MSOA Code', the 'Occupation: ...' and
        'Highest Level of Qualification: ...' columns, and exactly one column
        each whose name contains 'level 4', 'house price' and 'income'
        (matching is case-insensitive).
    verbose : bool, default False
        Print the dtypes of the intermediate and final frames.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as `df`, with columns
        'msoa', 'occupation', 'qualification', 'log_med_house_price',
        'log_med_income'.

    Raises
    ------
    ValueError
        If the 'level 4', 'house price' or 'income' column is missing or
        ambiguous.
    """
    pattern = "(MSOA Code)|(.*income.*)|(.*In employment.*)|(^Occupation:.*)|(Age: 16 years and over)|(^Highest Level of Qualification:.*)|(.*house price.*)"
    matcher = re.compile(pattern, re.I)
    keep = [c for c in df.columns if matcher.match(c)]

    # Explicit copy: the columns added below must never touch the caller's frame
    # (and this avoids SettingWithCopyWarning on the sliced frame).
    d = df[keep].copy()

    # change df to numeric
    d.columns = [c.lower() for c in d.columns]
    num_cols = [c for c in d.columns if c != 'msoa code']
    d[num_cols] = d[num_cols].apply(pd.to_numeric)

    if verbose:
        print(f'd:\n{d.dtypes}\n')

    # Numerators and denominators of the two share variables.
    d['sum_occ'] = d.filter(regex='(.*manager.*)|(.*professional.*)').sum(axis=1)
    d['total_occ'] = d.filter(regex='^occupation:.*').sum(axis=1)
    d['qual'] = _single_match(d, '.*level 4.*')
    d['total_qual'] = d.filter(regex='.*qualification:.*').sum(axis=1)

    d['occupation'] = d['sum_occ'] / d['total_occ']
    d['qualification'] = d['qual'] / d['total_qual']
    # +1 keeps the log finite for zero values.
    d['log_med_house_price'] = np.log(_single_match(d, '.*house price.*') + 1)
    d['log_med_income'] = np.log(_single_match(d, '.*income.*') + 1)

    d['msoa'] = d['msoa code']

    cols = ['msoa', 'occupation', 'qualification', 'log_med_house_price', 'log_med_income']

    d = d.filter(cols)

    if verbose:
        print(f'Dataframe after processing:\n{d.dtypes}')

    assert len(d) == len(df), 'Out df has wrong num rows...'

    return d

In [5]:
# scale and pca
def scale_pca(df1, df2, verbose=False):
    """Score two census years on one shared first principal component.

    Both frames are reduced with `extract`, restricted to the MSOAs present in
    both, stacked, robust-scaled, and projected onto the first principal
    component fitted on the stacked data, so the scores of the two years are
    directly comparable.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Raw census rows of the earlier and the later year. Their columns carry
        the suffixes '_11' and '_21' respectively in the result.
    verbose : bool, default False
        Print the intermediate frames and the dtypes of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'msoa'. Holds the `extract` variables and 'score' of each
        year (suffixed '_11' / '_21') plus 'rank_11', 'rank_21', 'ses_asc',
        'score_pr_11', 'score_pr_21' and 'score_pr_asc'.
    """
    # turn msoa into index
    c11 = extract(df1).set_index('msoa')
    c21 = extract(df2).set_index('msoa')

    # keep only the MSOAs present in both years, in the same order
    common_index = c11.index.intersection(c21.index)
    c11 = c11.loc[common_index]
    c21 = c21.loc[common_index]

    # scale data: one scaler fitted on both years together
    stacked = np.concatenate((c11.values, c21.values), axis=0)
    scaler = preprocessing.RobustScaler().fit(stacked)
    data_scaled = scaler.transform(stacked)

    # pca: project the *scaled* data, i.e. the same feature space the component
    # was fitted in (projecting the unscaled array lets the largest-magnitude
    # variable dominate the score).
    pca = decomposition.PCA(n_components=1)
    pca.fit(data_scaled)
    scores = pca.transform(data_scaled)[:, 0]

    # attach scores back to original dfs: first block of rows is df1, rest df2
    n_first = len(c11)
    c11 = c11.assign(score=scores[:n_first])
    c21 = c21.assign(score=scores[n_first:])

    if verbose:
        print(f'c11 num rows: {len(c11)}\nc11:\n{c11.head(5)}\n')
        print(f'c21 num rows: {len(c21)}\nc21:\n{c21.head(5)}\n')

    # merge and calculate additional statistics from the scores
    combined = c11.merge(c21, how='inner', on='msoa', suffixes=('_11', '_21'))

    # MSOA rank per year (1 = highest score)
    combined['rank_11'] = combined['score_11'].rank(ascending=False)
    combined['rank_21'] = combined['score_21'].rank(ascending=False)

    # score increase between the two years
    combined['ses_asc'] = combined['score_21'] - combined['score_11']

    # MSOA percentile score per year. Both years are derived from the rank in
    # the same way, so a higher score always means a higher percentile and the
    # difference below is meaningful.
    combined['score_pr_11'] = combined['rank_11'].rank(ascending=False, pct=True) * 100
    combined['score_pr_21'] = combined['rank_21'].rank(ascending=False, pct=True) * 100

    # percentile change
    combined['score_pr_asc'] = combined['score_pr_21'] - combined['score_pr_11']

    if verbose:
        print(f'combined:\n{combined.dtypes}\n')

    return combined

# write to csv
# Path is built with os.path.join so it resolves on any OS (the backslash form
# is only a valid relative path on Windows).
test = scale_pca(census_raw_11, census_raw_21)
test.to_csv(os.path.join('data', 'clean', 'scores.csv'))
    