In [6]:
import os 
import pandas as pd
import re
import numpy as np

from sklearn import preprocessing
from sklearn import decomposition 

In [7]:
# Location of the cleaned census table, relative to the project root.
# os.path.join uses the host separator (gives '.\data\clean\...' on Windows).
census_csv = os.path.join('.', 'data', 'clean', 'normalized_percents.csv')

try:
    census_df = pd.read_csv(census_csv)
except FileNotFoundError:
    # The file is not reachable from the current working directory: step up
    # one level and retry once. Only a missing file triggers the retry; any
    # other failure (parse error, permissions, interrupt) is propagated.
    os.chdir('..')
    census_df = pd.read_csv(census_csv)

# Keep only the columns that have no missing values in any row.
census_df = census_df.dropna(axis=1, how='any')
census_df

Unnamed: 0,MSOA Code,MSOA Name,City Code,City Name,Year,Age: 0 to 4,Age: 10 to 14,Age: 15 to 19,Age: 16 years and over,Age: 20 to 24,...,Unpaid Care: Provides no unpaid care,Vehicles: 1 car or van in household,Vehicles: 2 cars or vans in household,Vehicles: 3 or more cars or vans in household,Vehicles: No cars or vans in household,Welsh: Can speak Welsh,Welsh: Can speak and read but cannot write Welsh,Welsh: Can speak but cannot read or write Welsh,Welsh: No skills in Welsh,Welsh: Other combination of skills in Welsh
0,E02000001,City of London 001,J01000055,London,2001,0.034795,0.025887,0.028671,4044.729019,0.073626,...,0.000000,0.326648,0.042416,0.010604,0.620332,0.000000,0.000000,0.000000,0.000000,0.000000
1,E02000001,City of London 001,J01000055,London,2011,0.032000,0.022915,0.026712,6307.060746,0.073898,...,0.921898,0.250855,0.039453,0.015735,0.693957,0.000000,0.000000,0.000000,0.000000,0.000000
2,E02000001,City of London 001,J01000055,London,2021,0.024825,0.020280,0.025175,8004.000000,0.112471,...,0.940755,0.194179,0.025036,0.008752,0.772033,0.000000,0.000000,0.000000,0.000000,0.000000
3,E02000002,Barking and Dagenham 001,J01000055,London,2001,0.078403,0.067821,0.061247,2748.110470,0.058201,...,0.000000,0.434894,0.137527,0.031090,0.396489,0.000000,0.000000,0.000000,0.000000,0.000000
4,E02000002,Barking and Dagenham 001,J01000055,London,2011,0.092694,0.065830,0.064059,4360.098007,0.062583,...,0.908782,0.437154,0.156285,0.030593,0.375968,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9774,W02000422,Cardiff 048,J01000020,Cardiff,2011,0.048651,0.010922,0.018203,5537.006950,0.155552,...,0.951018,0.629174,0.174330,0.020799,0.175698,0.108703,0.009804,0.010664,0.835397,0.018404
9775,W02000422,Cardiff 048,J01000020,Cardiff,2021,0.049398,0.018000,0.016782,6659.000000,0.122615,...,0.951730,0.584104,0.149466,0.014235,0.252195,0.104175,0.004329,0.009496,0.839548,0.146628
9776,W02000423,Cardiff 049,J01000020,Cardiff,2001,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9777,W02000423,Cardiff 049,J01000020,Cardiff,2011,0.065994,0.038041,0.041808,6579.047642,0.117526,...,0.931211,0.485479,0.124637,0.016699,0.373185,0.087194,0.006981,0.012565,0.862038,0.023607


In [8]:
# Split the cleaned census table into one frame per census wave,
# selecting rows on the 'Year' column.
census_raw_01 = census_df.loc[census_df['Year'].eq(2001)]
census_raw_11 = census_df.loc[census_df['Year'].eq(2011)]
census_raw_21 = census_df.loc[census_df['Year'].eq(2021)]

# No per-wave dropna(axis=1, how='all') is applied here: every column that
# contained a NaN was already removed from census_df when it was loaded.

print(f'Num rows in 2001: {len(census_raw_01)} \nNum rows in 2011: {len(census_raw_11)} \nNum rows in 2021: {len(census_raw_21)} \n')

Num rows in 2001: 3259 
Num rows in 2011: 3260 
Num rows in 2021: 3260 



In [9]:
# subset to the columns used for scoring
def extract(df, verbose=False):
    """Reduce a census frame to the four indicators used for scoring.

    Columns are selected by name (case-insensitive), lower-cased, converted
    to numeric, and combined into:

    * ``occupation``          - row-wise sum of every selected column whose
                                name contains "manager" or "professional"
    * ``qualification``       - the single column whose name contains "level 4"
    * ``log_med_house_price`` - log(x + 1) of the single "house price" column
    * ``log_med_income``      - log(x + 1) of the single "income" column

    Parameters
    ----------
    df : pandas.DataFrame
        Census rows; must contain an "MSOA Code" column plus the indicator
        columns described above. The frame is not modified.
    verbose : bool, default False
        Print the dtypes of the intermediate and final frames.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index), with columns
        ``msoa, occupation, qualification, log_med_house_price, log_med_income``.

    Raises
    ------
    ValueError
        If "level 4", "house price" or "income" does not match exactly one
        selected column (the derived column would be ambiguous or empty).
    """
    pattern = "(MSOA Code)|(.*income.*)|(.*In employment.*)|(^Occupation:.*)|(Age: 16 years and over)|(^Highest Level of Qualification:.*)|(.*house price.*)"
    wanted = re.compile(pattern, re.I)
    keep = [c for c in df.columns if wanted.match(c)]

    # Copy only the selected columns: the caller's frame stays untouched and
    # the (potentially very wide) remainder of the table is never duplicated.
    d = df.loc[:, keep].copy()

    # change df to numeric
    d.columns = [c.lower() for c in d.columns]
    num_cols = [c for c in d.columns if c != 'msoa code']
    if num_cols:
        d[num_cols] = d[num_cols].apply(pd.to_numeric)

    if verbose:
        print(f'd:\n{d.dtypes}\n')

    def _only_column(regex, label):
        # Return the one column matching `regex` as a Series; fail loudly when
        # the match is empty or ambiguous instead of letting pandas raise an
        # opaque error on DataFrame-to-column assignment.
        matched = d.filter(regex=regex)
        if matched.shape[1] != 1:
            raise ValueError(
                f"Expected exactly one column matching {regex!r} for "
                f"'{label}', found {matched.shape[1]}: {list(matched.columns)}"
            )
        return matched.iloc[:, 0]

    # NOTE(review): the occupation regex is applied to every selected column,
    # not only the "occupation:" ones - confirm no other selected column name
    # contains "manager" or "professional".
    d['occupation'] = d.filter(regex='(.*manager.*)|(.*professional.*)').sum(axis=1)
    d['qualification'] = _only_column('.*level 4.*', 'qualification')
    # +1 keeps the log finite for rows whose value is 0
    d['log_med_house_price'] = np.log(_only_column('.*house price.*', 'log_med_house_price') + 1)
    d['log_med_income'] = np.log(_only_column('.*income.*', 'log_med_income') + 1)

    d['msoa'] = d['msoa code']

    # The other selected columns (e.g. "in employment", "age: 16 years and
    # over") are not part of the output.
    cols = ['msoa', 'occupation', 'qualification', 'log_med_house_price', 'log_med_income']
    d = d.loc[:, cols]

    if verbose:
        print(f'Dataframe after processing:\n{d.dtypes}')

    assert len(d) == len(df), 'Out df has wrong num rows...'

    return d

In [10]:
# scale and pca
def scale_pca(df1, df2, verbose=False):
    """Score two census waves on a shared first principal component.

    Both frames are reduced with ``extract``, restricted to the MSOAs present
    in both, stacked, robust-scaled and projected onto the first principal
    component fitted on the stacked, scaled data. The per-wave scores are then
    merged side by side and ranks, percentiles and changes are derived.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Census rows for the earlier and the later wave (suffixes ``_11`` and
        ``_21`` in the output). Each must be accepted by ``extract``.
    verbose : bool, default False
        Print the head of the per-wave frames and the dtypes of the result.

    Returns
    -------
    pandas.DataFrame
        Keyed by MSOA code, with the four indicators and ``score`` for each
        wave (``*_11`` / ``*_21``) followed by ``rank_11``, ``rank_21``,
        ``ses_asc``, ``score_pr_11``, ``score_pr_21`` and ``score_pr_asc``.

    Notes
    -----
    NOTE(review): the sign of a principal component is not fixed, so confirm
    that a higher score corresponds to higher status before reading the ranks.
    Assumes MSOA codes are unique within each wave - TODO confirm.
    """
    # turn msoa into index
    c11 = extract(df1).set_index('msoa')
    c21 = extract(df2).set_index('msoa')

    # keep only the MSOAs present in both waves, in the same row order
    common_index = c11.merge(c21, how='inner', on='msoa').index
    c11 = c11.loc[common_index]
    c21 = c21.loc[common_index]

    # scale data: both waves are stacked so they share one scaler and one
    # component, which makes the scores comparable across waves
    n_first = len(c11)
    stacked = np.concatenate((c11.values, c21.values), axis=0)
    scaler = preprocessing.RobustScaler().fit(stacked)
    data_scaled = scaler.transform(stacked)

    # pca
    pca = decomposition.PCA(n_components=1)
    pca.fit(data_scaled)
    # Project the *scaled* data: the component was fitted in scaled space, so
    # projecting the raw values would produce scores in the wrong space.
    scores = pca.transform(data_scaled)[:, 0]

    # attach scores back to original dfs (first n_first rows belong to wave 1)
    c11 = c11.assign(score=scores[:n_first])
    c21 = c21.assign(score=scores[n_first:])

    if verbose:
        print(f'c11 num rows: {len(c11)}\nc11:\n{c11.head(5)}\n')
        print(f'c21 num rows: {len(c21)}\nc21:\n{c21.head(5)}\n')

    # merge and calculate additional statistics from the scores
    combined = c11.merge(c21, how='inner', on='msoa', suffixes=('_11', '_21'))

    # 2011 MSOA rank (1 = highest score)
    combined['rank_11'] = combined['score_11'].rank(ascending=False)

    # 2021 MSOA rank (1 = highest score)
    combined['rank_21'] = combined['score_21'].rank(ascending=False)

    # 2011 to 2021 score increase
    combined['ses_asc'] = combined['score_21'] - combined['score_11']

    # Percentile scores are derived from the ranks in the same way for both
    # waves, so the highest score gets 100 in each and their difference is
    # meaningful.
    # 2011 MSOA percentile score
    combined['score_pr_11'] = combined['rank_11'].rank(ascending=False, pct=True) * 100

    # 2021 MSOA percentile score
    combined['score_pr_21'] = combined['rank_21'].rank(ascending=False, pct=True) * 100

    # Calculate percentile change
    combined['score_pr_asc'] = combined['score_pr_21'] - combined['score_pr_11']

    if verbose:
        print(f'combined:\n{combined.dtypes}\n')

    return combined

# Score the 2011 and 2021 waves and write the result to csv.
# The output path is built with os.path.join so the separator matches the
# host OS (identical to '.\data\clean\scores.csv' on Windows); a literal
# backslash path would create a single oddly named file on POSIX systems.
test = scale_pca(census_raw_11, census_raw_21)
test.to_csv(os.path.join('.', 'data', 'clean', 'scores.csv'))
    