In [14]:
import pandas as pd
import numpy as np

In [2]:
data_demo = pd.read_csv("data/demographics_1.csv")

In [3]:
data_demo.head()

Unnamed: 0,PostCode,Ethnicity,Religion,Language,Immigration,Commute,Marital Status,Family Size,Dwelling,Ownership,Education,Profession,Income,Family Income,Age,Gender
0,E1V6V5,North American,Christian,English,Non-immigrants,"Car, truck, van - as a driver",Married,1 person,Single-detached house,Owned,Postsecondary,62 Health care and social assistance,34560,150658.0,25 to 29 years,Female
1,E1V6V5,British,Christian,French,Non-immigrants,"Car, truck, van - as a driver",Married,2 persons,Single-detached house,Owned,Postsecondary,"56 Administrative and support, waste managemen...",57785,78950.0,45 to 49 years,Male
2,A0A0B7,British,Christian,English and non-official language,Non-immigrants,"Car, truck, van - as a driver",Married,1 person,Single-detached house,Owned,College,31-33 Manufacturing,44055,111262.0,30 to 34 years,Female
3,A0A0B7,North American,Christian,English and non-official language,Non-immigrants,"Car, truck, van - as a driver",Married,2 persons,Single-detached house,Owned,Postsecondary,44-45 Retail trade,35954,64925.0,35 to 39 years,Male
4,E4T0C1,North American,Christian,French,Non-immigrants,"Car, truck, van - as a driver",Married,2 persons,Single-detached house,Owned,Secondary school,"21 Mining, quarrying, and oil and gas extraction",59829,154926.0,60 to 64 years,Female


In [5]:
data_demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29115181 entries, 0 to 29115180
Data columns (total 16 columns):
PostCode          object
Ethnicity         object
Religion          object
Language          object
Immigration       object
Commute           object
Marital Status    object
Family Size       object
Dwelling          object
Ownership         object
Education         object
Profession        object
Income            int64
Family Income     float64
Age               object
Gender            object
dtypes: float64(1), int64(1), object(14)
memory usage: 3.5+ GB


## Select features that are potentially correlated with the feature (number of trades) from the aggregated data, and are easily aggregated as well (nominal features that only have two categories).
The candadites are:
1. Immigration -- nominal
2. Gender -- nominal
3. Ownership -- nominal
4. Dwelling -- nominal


5. Education -- ordianl
6. Age -- ordinal


7. Income -- numeric
8. Family Size -- numeric
9. Family Income -- numeric

In [6]:
removed_col = ["Ethnicity", "Religion", "Language", "Commute", "Marital Status", "Profession"]
data_demo.drop(removed_col, inplace=True, axis=1)

## Aggregate norminal data

In [9]:
def aggregate_group(x):
    d = {}
    d["Immigrants Percentage"] = x["Immigration"].value_counts()["Immigrants"] / x["Immigration"].count()\
        if "Immigrants" in x["Immigration"].value_counts() else 0
    d["Owner Percentage"] = x["Ownership"].value_counts()["Owned"] / x["Ownership"].count()\
        if "Owned" in x["Ownership"].value_counts() else 0
    d["House Percentage"] = x["Dwelling"].value_counts()["Single-detached house"] / x["Dwelling"].count()\
        if "Single-detached house" in x["Dwelling"].value_counts() else 0
    d["Male Percentage"] = x["Gender"].value_counts()["Male"] / x["Gender"].count()\
        if "Male" in x["Gender"].value_counts() else 0
    return pd.Series(d)

%time
agg_df_nominal = data_demo.groupby("PostCode").apply(aggregate_group)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.53 µs


In [10]:
agg_df_nominal.head()

Unnamed: 0_level_0,Immigrants Percentage,Owner Percentage,House Percentage,Male Percentage
PostCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A0A0B7,0.0,1.0,1.0,0.5
A0A0C2,0.0,1.0,1.0,0.5
A0A1A0,0.04918,0.885246,1.0,0.52459
A0A1B0,0.010417,0.795833,0.947917,0.497917
A0A1C0,0.009166,0.905591,0.963336,0.488543


In [12]:
# Save the dataframe to a file to avoid re-processing.
agg_df_nominal.to_csv("data/demographics_2.csv", index=True)

## Preprocess and aggregate numeric data.

In [7]:
data_demo["Family Size"] = data_demo["Family Size"].str.split(" ", expand=True)[0]
data_demo["Family Size"] = pd.to_numeric(data_demo["Family Size"], errors="coerce")

In [8]:
data_demo.head()

Unnamed: 0,PostCode,Immigration,Family Size,Dwelling,Ownership,Education,Income,Family Income,Age,Gender
0,E1V6V5,Non-immigrants,1,Single-detached house,Owned,Postsecondary,34560,150658.0,25 to 29 years,Female
1,E1V6V5,Non-immigrants,2,Single-detached house,Owned,Postsecondary,57785,78950.0,45 to 49 years,Male
2,A0A0B7,Non-immigrants,1,Single-detached house,Owned,College,44055,111262.0,30 to 34 years,Female
3,A0A0B7,Non-immigrants,2,Single-detached house,Owned,Postsecondary,35954,64925.0,35 to 39 years,Male
4,E4T0C1,Non-immigrants,2,Single-detached house,Owned,Secondary school,59829,154926.0,60 to 64 years,Female


In [20]:
agg_df_numeric = data_demo.groupby("PostCode").agg(
    {"Family Size": np.median, 
     "Income": np.mean,
     "Family Income": np.mean,
    }
)

In [21]:
agg_df_numeric.head()

Unnamed: 0_level_0,Family Size,Income,Family Income
PostCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A0A0B7,1.5,40004.5,88093.5
A0A0C2,1.5,42080.0,104456.0
A0A1A0,1.0,40310.278689,93974.262295
A0A1B0,1.0,63745.333333,131847.975
A0A1C0,1.0,55183.269478,123971.666361


## Preprocess and aggregate ordinal columns

In [25]:
ordered_education = {
    "Education": {
        'No certificate, diploma or degree': 0,
        'Secondary school': 1,
        'Postsecondary': 2,
        'College': 3,
        "Bachelor's degree": 4,
        "Master's degree": 5,
        'Degree in medicine & related': 6,
        'Doctorate': 7,
    }
}
    
data_demo.replace(ordered_education, inplace=True)

In [30]:
ordered_age = {
    "Age": {
        '20 to 24 years': 1,
        '25 to 29 years': 2, 
        '30 to 34 years': 3, 
        '35 to 39 years': 4,
        '40 to 44 years': 5,
        '45 to 49 years': 6,
        '50 to 54 years': 7, 
        '55 to 59 years': 8,
        '60 to 64 years': 9,
        '65 years and over:': 10, 
    }
}
data_demo.replace(ordered_age, inplace=True)

In [31]:
data_demo.head()

Unnamed: 0,PostCode,Immigration,Family Size,Dwelling,Ownership,Education,Income,Family Income,Age,Gender
0,E1V6V5,Non-immigrants,1,Single-detached house,Owned,2,34560,150658.0,2,Female
1,E1V6V5,Non-immigrants,2,Single-detached house,Owned,2,57785,78950.0,6,Male
2,A0A0B7,Non-immigrants,1,Single-detached house,Owned,3,44055,111262.0,3,Female
3,A0A0B7,Non-immigrants,2,Single-detached house,Owned,2,35954,64925.0,4,Male
4,E4T0C1,Non-immigrants,2,Single-detached house,Owned,1,59829,154926.0,9,Female


In [33]:
data_demo["Age"] = pd.to_numeric(data_demo["Age"], errors="coerce")
data_demo["Education"] = pd.to_numeric(data_demo["Education"], errors="coerce")
agg_df_ordinal = data_demo.groupby("PostCode").agg(
    {"Age": np.mean, 
     "Education": np.mean,
    }
)

In [34]:
agg_df_ordinal.head()

Unnamed: 0_level_0,Age,Education
PostCode,Unnamed: 1_level_1,Unnamed: 2_level_1
A0A0B7,3.5,2.5
A0A0C2,7.0,1.5
A0A1A0,6.615385,1.52459
A0A1B0,5.632597,2.441667
A0A1C0,5.065398,1.95967


## Join all dataframes

In [35]:
agg_df_nominal = pd.read_csv("data/demographics_2.csv")

In [37]:
agg_df = agg_df_nominal.merge(agg_df_numeric,on='PostCode').merge(agg_df_ordinal,on='PostCode')

In [38]:
agg_df.head()

Unnamed: 0,PostCode,Immigrants Percentage,Owner Percentage,House Percentage,Male Percentage,Family Size,Income,Family Income,Age,Education
0,A0A0B7,0.0,1.0,1.0,0.5,1.5,40004.5,88093.5,3.5,2.5
1,A0A0C2,0.0,1.0,1.0,0.5,1.5,42080.0,104456.0,7.0,1.5
2,A0A1A0,0.04918,0.885246,1.0,0.52459,1.0,40310.278689,93974.262295,6.615385,1.52459
3,A0A1B0,0.010417,0.795833,0.947917,0.497917,1.0,63745.333333,131847.975,5.632597,2.441667
4,A0A1C0,0.009166,0.905591,0.963336,0.488543,1.0,55183.269478,123971.666361,5.065398,1.95967


In [39]:
agg_df.to_csv("data/demographics_3.csv")