In [1]:
import os 
import pandas as pd
import re

import random
from sklearn import preprocessing

  from pandas.core import (


In [2]:
#os.listdir()
PATH = os.getcwd()

# Exploring Raw Census Data

In [3]:
# load 2001 and 2011 data
census_raw_01 = pd.read_csv('Census2001.csv')
census_raw_11 = pd.read_csv('Census2011.csv')

# drop nan rows
census_raw_01.dropna(inplace=True)
census_raw_11.dropna(inplace=True)

#display(census_raw_01)
print(f'Num rows in 2001: {len(census_raw_01)} \nNum rows in 2011: {len(census_raw_11)} \n')

Num rows in 2001: 1609 
Num rows in 2011: 1673 



Cell below explores joining on Pincode and LSOA columns. <br>
- Inner joining on LSOA reduces dataset size to 1596.
- There are 1673 rows in 2011 and 1607 in 2001
<br>

Should join on LSOA
- Gov merges or splits LSOAs based on population; not sure which ones are split or merged. Can only using matching 

In [4]:
# lets try a left join with 2011 as left table and check how many went from gentrifiable to ungentrifiable
# also look at how many went from ungentrifiable to gentrifiable

# joining on Pincode returns correct rows, but on LSOA leaves out 13=1609-1596 rows. 
# PINCODE IS NOT POSTCODE
# LSOAs may have been renamed

# since we don't know how the gov merged or split LSOAs can only train and predict on LSOAs that were not changed
# and that were present in both census years.
# SO merge on LSOA column

status_01 = census_raw_01[['LSOA', 'Pincode', 'Is gentrifiable?']]
status_11 = census_raw_11[['LSOA', 'Pincode', 'Is gentrifiable?']]

status_merge_lsoa = status_01.merge(status_11, how='inner', on='LSOA', suffixes=['_01', '_11'])
status_merge_pincode = status_01.merge(status_11, how='inner', on='Pincode', suffixes=['_01', '_11'])

print(f'Inner Join on LSOA column w/ 2001 as left table leaves this many rows: {len(status_merge_lsoa)}\n')
print(f'Inner Join on Pincode column w/ 2001 as left table leaves this many rows: {len(status_merge_pincode)}\n')

Inner Join on LSOA column w/ 2001 as left table leaves this many rows: 1596

Inner Join on Pincode column w/ 2001 as left table leaves this many rows: 1609



Cell below checks if there are non-unique values in LSOA or Pincode columns for merging purposes. <br>
- Same num distinct LSOA and Pincodes as rows for both years. 

Note: Pincode IS NOT Postcode (based on how UK Postcodes are formatted)

In [5]:
# check for where pincode matches but LSOA does not
# so the higher level area did not change, but the numeric part of code and the letter did
# from UK gov, LSOAs can be split or merged between census years - to maintain population in each LSOA
# the num rows matches unique LSOAs and Pincodes for both census years
# Postcode is suppose to be most granular

lsoa_mask = status_merge_pincode['LSOA_01'] != status_merge_pincode['LSOA_11']
lsoa_diff = status_merge_pincode[lsoa_mask]
#print(status_merge_pincode.columns.unique())

#print(lsoa_diff)

# checking for num unique pincodes and lsoa codes in each

num_lsoa_01 = status_01.LSOA.unique()
num_lsoa_11 = status_11.LSOA.unique()
num_pin_01 = status_01.Pincode.unique()
num_pin_11 = status_11.Pincode.unique()

print(f'Unique LSOA 01: {len(num_lsoa_01)}; Unique LSOA 11 {len(num_lsoa_11)}\nDistinct Pincodes 01: {len(num_pin_01)}; Distinct Pincodes 11: {len(num_pin_11)}')

Unique LSOA 01: 1609; Unique LSOA 11 1673
Distinct Pincodes 01: 1609; Distinct Pincodes 11: 1673


In [6]:
# checking the change gentrifiable > ungentrifiable and vice versa

gent_ungent_mask = ((status_merge_lsoa['Is gentrifiable?_01'] == 'Gentrifiable')
                  & (status_merge_lsoa['Is gentrifiable?_11'] == 'Ungentrifiable'))
ungent_gent_mask = ((status_merge_lsoa['Is gentrifiable?_01'] == 'Ungentrifiable')
                  & (status_merge_lsoa['Is gentrifiable?_11'] == 'Gentrifiable'))

count_gent_ungent = len(status_merge_lsoa[gent_ungent_mask])
count_ungent_gent = len(status_merge_lsoa[ungent_gent_mask])

print(f'Num Gentrified to Ungentrified: {count_gent_ungent}\nNum Ungentrified to Gentrified: {count_ungent_gent}')

Num Gentrified to Ungentrified: 128
Num Ungentrified to Gentrified: 177


# Processing Census Data

In [7]:
def household_structure_count_to_perc(df, drop_count=True, verbose=False):
    
    # filter to household structure cols
    pattern = "(LSOA)|(Pincode)|(All households)|(No adults\s.*)|(With dependent children\s.*)"
    
    m = re.compile(pattern)
    cols = [c for c in df.columns.tolist() if m.match(c)]
    household = df[cols]
    #household = df.filter(regex=) #works for 2001
    
    household.columns = map(str.lower, household.columns)
    
    if verbose:
        print(f'household data:\n {household.dtypes}')
    
    assert len(household.columns.tolist()) == len(cols), 'Wrong subset of columns'
    
    # rename cols for ease
    household.rename(columns=lambda x: re.sub('all households','total',x), inplace=True)
    household.rename(columns=lambda x: re.sub('no adults in employment - with\s.*','no_emply_kids',x), inplace=True)
    household.rename(columns=lambda x: re.sub('no adults in employment - with.*','no_emply_no_kids',x), inplace=True)
    household.rename(columns=lambda x: re.sub('.*all ages','kids_all_ages',x), inplace=True)
    household.rename(columns=lambda x: re.sub('.*- aged.*','kids_0_4',x), inplace=True)
    
    # change col dtype to numeric
    num_cols = [c for c in household.columns if c not in ['pincode', 'lsoa']]
    household[num_cols] = household[num_cols].apply(pd.to_numeric)
    
    # create other col
    household['house_struct_other'] = (household['total'] - 
                          household.loc[:, ~household.columns.isin(['total', 'pincode', 'lsoa'])].sum(axis=1))
    
    if verbose:
        print(f'After creating other col:\n {household.head(5)}')
    
    # get percentages
    pct_cols = [c for c in household.columns if c not in ['total', 'pincode', 'lsoa']]
    for c in pct_cols:
        household[c+'_pct'] = household.loc[:, c] / household.loc[:, 'total']
        # drop og count column
        if drop_count:
            household.drop(c, axis=1, inplace=True)
    
    if verbose:
        print(f'After changing to percentages:\n {household.head(5)}')

    # drop total col
    if drop_count:
        household.drop('total', axis=1, inplace=True)
    
    assert len(household) == len(df), 'Household and input df have different number of rows'
    
    return household

In [8]:
def age_count_to_perc(df, drop_count=True, verbose=False):
    
    # subset df to age columns
    pattern = "(LSOA)|(Pincode)|(.*\sresidents$)|(^Age\s.*)|(.*Age$)"
    m = re.compile(pattern)
    cols = [c for c in df.columns if m.match(c)]
    age_df = df[cols]
    
    assert len(age_df.columns) == len(cols), 'Wrong subset of columns...'
    
    age_df.columns = map(str.lower, age_df.columns)
    
    if verbose:
        print(f'age_df:\n {age_df.dtypes}')
    
    # change col dtype to numeric
    num_cols = [c for c in age_df.columns if c not in ['lsoa', 'pincode']]
    age_df[num_cols] = age_df[num_cols].apply(pd.to_numeric)
    
    if verbose:
        print(f'After changing to numeric:\n {age_df.dtypes}')
    
    # rename to total
    age_df.rename(columns=lambda x: re.sub(".*\sresidents$", 'total', x), inplace=True)
    
    # create other col
    age_df['age_other'] = (age_df['total'] - age_df.filter(regex="^age.*").sum(axis=1))
    
    # group ages
    age_df['young_children'] = age_df[['age 0 to 4', 'age 5 to 7', 'age 8 to 9']].sum(axis=1)
    age_df['teens'] = age_df[['age 10 to 14', 'age 15', 'age 16 to 17']].sum(axis=1)
    age_df['young_adults'] = age_df[['age 18 to 19', 'age 20 to 24', 'age 25 to 29']].sum(axis=1)
    age_df['adults'] = age_df[['age 30 to 44', 'age 45 to 59', 'age 60 to 64']].sum(axis=1)
    age_df['retired'] = age_df[['age 65 to 74', 'age 75 to 84']].sum(axis=1)
    age_df['elderly'] = age_df[['age 85 to 89', 'age 90 and over']].sum(axis=1)
    
    # drop original age columns
    age_df.drop(list(age_df.filter(regex="^age.*")), axis=1, inplace=True)
    
    if verbose:
        print(f'After changing to numeric:\n {age_df.dtypes}')
    
    # change from count to percent
    pct_cols = [c for c in age_df.columns if c not in ['total', 'lsoa', 'mean age', 'median age', 'pincode']]
    
    for c in pct_cols:
        age_df[c+'_pct'] = age_df[c] / age_df['total']
        if drop_count:
            age_df.drop(labels=c, axis=1, inplace=True)
    
    if drop_count:
        age_df.drop(labels='total', axis=1, inplace=True)
    
    if verbose:
        print(f'After changing to percent:\n {age_df.dtypes}')
    
    assert len(age_df) == len(df), 'Age and input df have different number of rows'
    
    return age_df
    

In [9]:
def vehicles_to_pct(df, drop_counts=True, verbose=False):
    
    # subset to cars or vans col
    pattern = "(LSOA)|(Pincode)|(^[a-zA-Z\s:]*car[a-zA-Z\s]*van[a-zA-Z\s]*)|(^[0-9].*car[a-zA-Z\s]*van[a-zA-Z\s]*household$)"
    m = re.compile(pattern, re.I)
    cols = [c for c in census_raw_01.columns if m.match(c)]
    vehicles_df = df[cols]
    
    assert len(vehicles_df.columns) == len(cols), 'Incorrect Subset of Columns'
    
    vehicles_df.drop(list(vehicles_df.filter(regex="^sum.*")), axis=1, inplace=True)
    
    vehicles_df.columns = map(str.lower, vehicles_df.columns)
    
    if verbose:
        print(f'vehicles_df:\n {vehicles_df.dtypes}')
    
    # rename col to total
    vehicles_df.rename(columns=lambda x: re.sub(".*availability$", 'total', x), inplace=True)
    
    # change cols to numeric
    num_cols = [c for c in vehicles_df.columns if c not in ['lsoa', 'pincode']]
    vehicles_df[num_cols] = vehicles_df[num_cols].apply(pd.to_numeric)
    
    if verbose:
        print(f'After changing cols to numeric:\n {vehicles_df.dtypes}')
    
    # rename cols
    rename_cols = list(vehicles_df.filter(regex=".*household$"))
    
    for c in rename_cols:
        num = c.split(' ')[0]
        vehicles_df.rename(columns={c: num+'_cars_vans'}, inplace=True)
    
    if verbose:
        print(f'After changing names:\n {vehicles_df.dtypes}')
    
    # count to percent
    pct_cols = list(vehicles_df.filter(regex=".*vans$"))
    
    for c in pct_cols:
        vehicles_df[c+'_pct'] = vehicles_df[c] / vehicles_df['total']
    
    if drop_counts:
        vehicles_df.drop(labels=pct_cols, axis=1, inplace=True)
        vehicles_df.drop('total', axis=1, inplace=True)
    
    if verbose:
        print(f'After changing to percent:\n {vehicles_df.dtypes}')
        
    assert len(vehicles_df) == len(df), 'Vehicle and input df have different number of rows'

    return vehicles_df

In [10]:
def economic_count_pct(df, drop_count=True, verbose=False):
    
    #subset cols
    pattern = "(LSOA)|(Pincode)|(.*16[a-z\s]*74$)|(^economically.*:.*)|(^unemployed.*)|(^long.*unemployed$)"
    m = re.compile(pattern, re.I)
    cols = [c for c in df.columns if m.match(c)]
    econ_df = df[cols]
    
    if verbose:
        print(f'After first regex:\n{econ_df.dtypes}')
    
    pattern2 = "(LSOA)|(Pincode)|(.*16[a-z\s]*74$)|(.*employe.*)|(.*unemployed)|(.*student$)|(.*inactive.*)|(^unemploy.*)"
    m2 = re.compile(pattern2, re.I)
    cols2 = [c for c in econ_df.columns if m2.match(c)]
    econ_df = econ_df[cols2]
    
    econ_df.columns = map(str.lower, econ_df.columns)
    
    if verbose:
        print(f'After second regex:\n{econ_df.dtypes}')
    
    # group by econ active emply; econ active unemply; econ active student; econ inactive retired;
    #econ inactive student; econ inactive other; unemply
    # change to numeric
    num_cols = [c for c in econ_df.columns if c not in ['lsoa', 'pincode']]
    econ_df[num_cols] = econ_df[num_cols].apply(pd.to_numeric)
    
    
    # sum inactive cols
    econ_df['inactive'] = econ_df.filter(regex='.*inactive.*').sum(axis=1)
    econ_df.drop(list(econ_df.filter(regex='.*inactive.+')), axis=1, inplace=True)
    
    # sum not econ active unemployed cols
    econ_df['unemply'] = econ_df.filter(regex='(^unemployed)|^long.*unemployed$').sum(axis=1)
    econ_df.drop(list(econ_df.filter(regex='(^unemployed)|^long.*unemployed$')), axis=1, inplace=True)
    
    # rename to total
    econ_df.rename(columns=lambda x: re.sub('.*16[a-z\s]*74$', 'total', x), inplace=True)
    
    if verbose:
        print(f'After grouping:\n{econ_df.head(5)}')
    
    # rename cols 
    econ_df.rename(columns=lambda x: re.sub('.*part.*', 'pt', x), inplace=True)
    econ_df.rename(columns=lambda x: re.sub('.*employ.*full.*', 'ft', x), inplace=True)
    econ_df.rename(columns=lambda x: re.sub('.*self.*', 'self_emply', x), inplace=True)
    econ_df.rename(columns=lambda x: re.sub('.*student.*', 'active_student', x), inplace=True)
    econ_df.rename(columns=lambda x: re.sub('.*active.*unemployed.*', 'active_unemply', x), inplace=True)
    
    # count to pct
    pct_cols = [c for c in econ_df.columns if c not in ['lsoa', 'pincode', 'total']]
    for c in pct_cols:
        econ_df[c+'_pct'] = econ_df[c] / econ_df['total']
    
    if drop_count:
        econ_df.drop(pct_cols, axis=1, inplace=True)
        econ_df.drop('total', axis=1, inplace=True)
    
    
    if verbose:
        print(f'After changing to percent:\n{econ_df.head(5)}')
    
    assert len(econ_df) == len(df), 'Econ df and input df have different number of rows'

    return econ_df

In [11]:
def ethnicity_count_pct(df, drop_count=True, verbose=False):
    
    #subset cols
    pattern = "(LSOA)|(Pincode)|(^[A-Za-z:\s]*ethnic*.[A-Za-z\s]*)|(white$)|(^mixed$)|(^[A-Za-z/]*asian\sbritish$)|(^[A-Za-z/]*black\sbritish$)"
    m = re.compile(pattern, re.I)
    cols = [c for c in df.columns if m.match(c)]
    
    ethnic_df = df[cols]
    
    assert len(ethnic_df.columns) == len(cols), 'Wrong subset of columns'
    
    ethnic_df.columns = map(str.lower, ethnic_df.columns)
    
    if verbose:
        print(f'ethnic_df:\n{ethnic_df.dtypes}')
    
    # change col name to total
    ethnic_df.rename(columns=lambda x: re.sub('^[A-Za-z:\s]*ethnic*.[A-Za-z\s]*', 'total', x), inplace=True)
    
    # change to numeric
    num_cols = [c for c in ethnic_df.columns if c not in ['lsoa', 'pincode']]
    ethnic_df[num_cols] = ethnic_df[num_cols].apply(pd.to_numeric)
    
    if verbose:
        print(f'After changing to numeric:\n{ethnic_df.dtypes}')
    
    # change to percent
    pct_cols = [c for c in num_cols if c != 'total']
    for c in pct_cols:
        ethnic_df[c+'_pct'] = ethnic_df[c] / ethnic_df['total']
    
    if drop_count:
        ethnic_df.drop(pct_cols, axis=1, inplace=True)
        ethnic_df.drop('total', axis=1, inplace=True)
    
    if verbose:
        print(f'After converting to percent:\n{ethnic_df.head(5)}')
    
    assert len(ethnic_df) == len(df), 'Ethnic df and input df have different number of rows'

    
    return ethnic_df
    

In [12]:
def industry_count_pct(df, drop_count=True, verbose=False):
    
    # subset cols
    pattern = "(LSOA)|(Pincode)|([A-Za-z\s:]*industry$)|(^[A-Z]?\s.+)"
    m = re.compile(pattern, re.I)
    cols = [c for c in df.columns if m.match(c)]
    
    ind_df = df[cols]
    
    assert len(ind_df.columns) == len(cols), 'Wrong subset of columns'
    
    if verbose:
        print(f'ind_df:\n{ind_df.dtypes}')
    
    # rename cols 
    ind_df.rename(columns=lambda x: re.sub("([A-Za-z\s:]*Industry$)", 'total', x), inplace=True)
    ind_df.rename(columns=lambda x: re.sub("^[A-Z]?\sMining.+", "mining", x), inplace=True)
    ind_df.rename(columns=lambda x: re.sub("^[A-Z]?\sManufacturing*", "manufacturing", x), inplace=True)
    ind_df.rename(columns=lambda x: re.sub("^[A-Z]?\sConst.+", "construction", x), inplace=True)
    ind_df.rename(columns=lambda x: re.sub("^[A-Z]?\sWholesale.+", "wholsale_retail", x), inplace=True)
    ind_df.rename(columns=lambda x: re.sub("^[A-Z]?\sHotels.+", "hotels_restaurants", x), inplace=True)
    ind_df.rename(columns=lambda x: re.sub("^[A-Z]?\sFinancial.+", "finance", x), inplace=True)
    ind_df.rename(columns=lambda x: re.sub("^[A-Z]?\sPublic.+", "public_admin", x), inplace=True)
    ind_df.rename(columns=lambda x: re.sub("^[A-Z]?\sEducation.*", "education", x), inplace=True)
    ind_df.rename(columns=lambda x: re.sub("^[A-Z]?\sHealth.+", "health_social_work", x), inplace=True)
    
    if verbose:
        print(f'After renaming:\n{ind_df.dtypes}')
    
    # change to numeric 
    num_cols = [c for c in ind_df.columns if c not in ['LSOA', 'Pincode']]
    ind_df[num_cols] = ind_df[num_cols].apply(pd.to_numeric)
    
    # convert to percent
    pct_cols = [c for c in num_cols if c != 'total']
    for c in pct_cols:
        ind_df[c+'_pct'] = ind_df[c] / ind_df['total']
    
    if drop_count:
        ind_df.drop(pct_cols, axis=1, inplace=True)
        ind_df.drop('total', axis=1, inplace=True)
    
    ind_df.columns = map(str.lower, ind_df.columns)
    
    if verbose:
        print(f'After conversion:\n{ind_df.head(5)}')
        
    assert len(ind_df) == len(df), 'Ind df and input df have different number of rows'

    
    return ind_df
    

- All categories: NS-SeC	
- Large employers and higher managerial and administrative occupations	
- Higher professional occupations	
- Lower managerial, administrative and professional occupations	Intermediate occupations	
- Small employers and own account workers	
- Lower supervisory and technical occupations	
- Semi-routine occupations	
- Routine occupations

In [13]:
def ns_sec_count_pct(df, drop_count=True, verbose=False):
    
    # subset to cols of interest
    pattern = "(LSOA)|(Pincode)|(.+ns-sec$)|([A-Za-z\s]+manager[A-Za-z\s]+)|([A-Za-z\s]+profession[A-Za-z\s]+)|(.*small.*)|([A-Za-z\s-]*routine[A-Za-z\s]+)|([A-Za-z\s]+supervisory[A-Za-z\s]+)"
    m = re.compile(pattern, re.I)
    cols = [c for c in df.columns if m.match(c)]
    
    ns_df = df[cols]
    
    assert len(ns_df.columns) == len(cols), 'Incorrect columns'
    
    if verbose:
        print(f'ns_df:\n{ns_df.dtypes}')
    
    # rename columns
    ns_df.columns = map(str.lower, ns_df.columns)
    
    ns_df.rename(columns=lambda x: re.sub('.+ns-sec$', 'total', x), inplace=True)
    ns_df.rename(columns=lambda x: re.sub('^large.*', 'higher_managerial', x), inplace=True)
    ns_df.rename(columns=lambda x: re.sub('^higher\sprof.*', 'higher_professional', x), inplace=True)
    ns_df.rename(columns=lambda x: re.sub('^lower\smanager.*', 'low_managerial', x), inplace=True)
    ns_df.rename(columns=lambda x: re.sub('^small.*', 'small_emplyer', x), inplace=True)
    ns_df.rename(columns=lambda x: re.sub('^lower.*', 'lower_super_technical', x), inplace=True)
    ns_df.rename(columns=lambda x: re.sub('^semi.*', 'semi_routine', x), inplace=True)
    ns_df.rename(columns=lambda x: re.sub('^routine.*', 'routine', x), inplace=True)
    
    # change cols to numeric
    num_cols = [c for c in ns_df.columns if c not in ['lsoa', 'pincode']]
    ns_df[num_cols] = ns_df[num_cols].apply(pd.to_numeric)
    
    if verbose:
        print(f'After renaming and changing to numeric:\n{ns_df.dtypes}')
    
    # convert to percent
    pct_cols = [c for c in num_cols if c != 'total']
    for c in pct_cols:
        ns_df[c+'_pct'] = ns_df[c] / ns_df['total']
    
    if drop_count:
        ns_df.drop(pct_cols, axis=1, inplace=True)
    ns_df.drop('total', axis=1, inplace=True)
    
    if verbose:
        print(f'After changing to percent:\n{ns_df.head(5)}')

    assert len(ns_df) == len(df), 'NS df and input df have different number of rows'

    return ns_df
    
    

In [14]:
def tenure_count_pct(df, drop_count=True, verbose=False):
    
    # subset to tenure cols
    pattern = "(LSOA)|(Pincode)|(^[A-Za-z\s:]+tenure$)|(^owned.*)|(.+ownership$)|(.*rent.*)"
    m = re.compile(pattern, re.I)
    cols = [c for c in df.columns if m.match(c)]
    
    home_df = df[cols]
    
    assert len(home_df.columns) == len(cols), 'Wrong subset of columns'
    
    home_df.columns = map(str.lower, home_df.columns)
    
    if verbose:
        print(f'home_df:\n{home_df.dtypes}')
    
    # rename 
    home_df.rename(columns=lambda x: re.sub('^[A-Za-z\s:]+tenure$', 'total', x), inplace=True)
    home_df.rename(columns=lambda x: re.sub('.*outright.*', 'own', x), inplace=True)
    home_df.rename(columns=lambda x: re.sub('.*mortgage.*', 'mortgage', x), inplace=True)
    home_df.rename(columns=lambda x: re.sub('.*share.*', 'own_shared', x), inplace=True)
    home_df.rename(columns=lambda x: re.sub('^rent.*council.*', 'rent_council', x), inplace=True)
    home_df.rename(columns=lambda x: re.sub('^rent.*association.*', 'rent_association_socail_landlord', x), inplace=True)
    home_df.rename(columns=lambda x: re.sub('^rent.*private.*landlord.*', 'rent_private_landlord', x), inplace=True)
    
    # convert to numeric 
    num_cols = [c for c in home_df.columns if c not in ['lsoa', 'pincode']]
    home_df[num_cols] = home_df[num_cols].apply(pd.to_numeric)
    
    
    if verbose:
        print(f'After name change and to numeric:\n{home_df.dtypes}')
    
    # change to percent
    pct_cols = [c for c in num_cols if c != 'total']
    
    # adding other col since cols don't add up to total
    home_df['home_other'] = (home_df['total'] - home_df[pct_cols].sum(axis=1))
    
    pct_cols.append('home_other')
    
    for c in pct_cols:
        home_df[c+'_pct'] = home_df[c] / home_df['total']
    
    if drop_count:
        home_df.drop(pct_cols, axis=1, inplace=True)
        home_df.drop('total', axis=1, inplace=True)
    
    if verbose:
        print(f'After changing to percent:\n{home_df.head(5)}')
        
    assert len(home_df) == len(df), 'Home df and input df have different number of rows'

    return home_df
    

For base cols
- leave out mean/median age first run
- People living in households
- People living in communal establishments
- Area (hectares)
- 2001 Density(number of people per hectare)

In [15]:
def base(df, verbose=False):
    
    # take all cols that did not need to be converted to perc
    pattern = "(LSOA)|(Pincode)|(^People\sliving.+)|(^Area.+)|(.*Density.*)"
    m = re.compile(pattern, re.I)
    cols = [c for c in df.columns if m.match(c)]
    
    base_df = df[cols]
    base_df.columns = map(str.lower, base_df.columns)
    
    if verbose:
        print(f'base_df:\n{base_df.dtypes}')
    
    assert len(base_df.columns) == len(cols), 'Wrong subset of columns...'
    assert len(base_df) == len(df), 'Incorrect number of rows...'
    
    # rename cols
    base_df.rename(columns=lambda x: re.sub('.+households$', 'living_in_households', x), inplace=True)
    base_df.rename(columns=lambda x: re.sub('.+communal.+', 'living_in_communal', x), inplace=True)
    base_df.rename(columns=lambda x: re.sub('.+density.+', 'density', x), inplace=True)
    
    # run above fn to convert cols to percent
    hh = household_structure_count_to_perc(df)
    hh.drop('lsoa', axis=1, inplace=True)
    age = age_count_to_perc(df)
    age.drop('lsoa', axis=1, inplace=True)
    veh = vehicles_to_pct(df)
    veh.drop('lsoa', axis=1, inplace=True)
    econ = economic_count_pct(df)
    econ.drop('lsoa', axis=1, inplace=True)
    ethn = ethnicity_count_pct(df)
    ethn.drop('lsoa', axis=1, inplace=True)
    ind = industry_count_pct(df)
    ind.drop('lsoa', axis=1, inplace=True)
    ns = ns_sec_count_pct(df)
    ns.drop('lsoa', axis=1, inplace=True)
    tenure = tenure_count_pct(df)
    tenure.drop('lsoa', axis=1, inplace=True)
    
    print(hh.dtypes)
    
    # join fn outputs to base using join on pincode
    base_df = base_df.merge(hh, how='inner', on='pincode').merge(age, how='inner', on='pincode')\
    .merge(veh, how='inner',on='pincode').merge(econ, how='inner', on='pincode').\
    merge(ethn, how='inner', on='pincode').merge(ind, how='inner', on='pincode').\
    merge(ns, how='inner', on='pincode').merge(tenure, how='inner', on='pincode')
    
    assert len(base_df) == len(df), 'Missing rows after joins'
    
    return base_df
    
    # return base_df

In [16]:
def scale_predictors(df1, df2, verbose=False):

    c01 = base(df1)
    c11 = base(df2)
    
    c01 = c01.iloc[:, 1:]
    c01.set_index('pincode', inplace=True)
    c11 = c11.iloc[:, 1:]
    c11.set_index('pincode', inplace=True)
    
    rss = preprocessing.RobustScaler(with_centering=False, quantile_range=(25.0,75.0))

    #  Train on 2001 data set
    rss.fit(c01)

    # Apply the same unit variance scaling to both years
    c01_scale = pd.DataFrame(data=rss.transform(c01), index=c01.index, columns=c01.columns)
    c11_scale = pd.DataFrame(data=rss.transform(c11), index=c11.index, columns=c11.columns)

    # Create new robust scaler for centering 
    # _without_ common scaling.
    rsc = preprocessing.RobustScaler(with_scaling=False)  

    # Centre independently
    c01_center = pd.DataFrame(data=rsc.fit_transform(c01_scale), index=c01.index, columns=c01.columns)  
    c11_center = pd.DataFrame(data=rsc.fit_transform(c11_scale), index=c11.index, columns=c11.columns)

    return c01_center, c11_center
    


In [17]:
# write to csv

out01, out11 = scale_predictors(census_raw_01, census_raw_11)

out01.to_csv('predictors_processed_01.csv')
out11.to_csv('predictors_processed_11.csv')


pincode                    object
no_emply_kids_pct         float64
no_emply_no_kids_pct      float64
kids_all_ages_pct         float64
kids_0_4_pct              float64
house_struct_other_pct    float64
dtype: object
pincode                    object
no_emply_kids_pct         float64
no_emply_no_kids_pct      float64
kids_all_ages_pct         float64
kids_0_4_pct              float64
house_struct_other_pct    float64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_df.rename(columns=lambda x: re.sub('.+households$', 'living_in_households', x), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_df.rename(columns=lambda x: re.sub('.+communal.+', 'living_in_communal', x), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_df.rename(columns=lambda x: re.sub('.+density.+', 'density', x), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documen