In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/demographics_1.csv")
df.dropna(how="any", axis=0, inplace=True)
# The following code is used for sampling and testing before on run the actual sample
# df = df.sample(frac=0.005, replace=False, random_state=1)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29110932 entries, 0 to 29115180
Data columns (total 16 columns):
PostCode          object
Ethnicity         object
Religion          object
Language          object
Immigration       object
Commute           object
Marital Status    object
Family Size       object
Dwelling          object
Ownership         object
Education         object
Profession        object
Income            int64
Family Income     float64
Age               object
Gender            object
dtypes: float64(1), int64(1), object(14)
memory usage: 3.7+ GB


## Select features that are potentially correlated with the feature (number of trades) from the aggregated data, and are easily aggregated as well (nominal features that only have two categories).
The candadites are:
1. Immigration -- nominal
2. Gender -- nominal
3. Ownership -- nominal
4. Dwelling -- nominal
5. Education -- nominal
6. ethnicity -- nominal
7. Commute -- nominal
8. Marital Status


9. Age -- ordinal


10. Income -- numeric
11. Family Size -- numeric
12. Family Income -- numeric

**Remove the following columns.**

In [4]:
removed_cols = ["Religion", "Language", "Profession"]
df.drop(removed_cols, inplace=True, axis=1)

In [5]:
index = ["PostCode"]

nominal_col = [
    "Immigration", "Gender", "Ownership", "Dwelling", 
    "Education", "Ethnicity", "Commute", "Marital Status",
]

ordinal_col = ["Age"]

numeric_col = ["Income", "Family Size", "Family Income"]

## Aggregate norminal data

In [6]:
df_nominal = df[nominal_col + index]
df_nominal = pd.get_dummies(df_nominal, columns=nominal_col)

In [7]:
df_nominal.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29110932 entries, 0 to 29115180
Data columns (total 53 columns):
PostCode                                       object
Immigration_Immigrants                         uint8
Immigration_Non-immigrants                     uint8
Gender_Female                                  uint8
Gender_Male                                    uint8
Ownership_Band housing                         uint8
Ownership_Owned                                uint8
Ownership_Rented                               uint8
Dwelling_Apartment                             uint8
Dwelling_Other                                 uint8
Dwelling_Single-detached house                 uint8
Education_Bachelor's degree                    uint8
Education_College                              uint8
Education_Degree in medicine & related         uint8
Education_Doctorate                            uint8
Education_Master's degree                      uint8
Education_No certificate, diploma or

In [8]:
df_nominal_agg = df_nominal.groupby("PostCode").apply(lambda x: x.sum() / x.count())

In [9]:
df_nominal_agg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 793411 entries, A0A0B7 to Y1A7A4
Data columns (total 52 columns):
Immigration_Immigrants                         793411 non-null float64
Immigration_Non-immigrants                     793411 non-null float64
Gender_Female                                  793411 non-null float64
Gender_Male                                    793411 non-null float64
Ownership_Band housing                         793411 non-null float64
Ownership_Owned                                793411 non-null float64
Ownership_Rented                               793411 non-null float64
Dwelling_Apartment                             793411 non-null float64
Dwelling_Other                                 793411 non-null float64
Dwelling_Single-detached house                 793411 non-null float64
Education_Bachelor's degree                    793411 non-null float64
Education_College                              793411 non-null float64
Education_Degree in medicine & relate

## Preprocess and aggregate numeric data

In [10]:
df_numeric = df[numeric_col + index]

In [11]:
df_numeric.head()

Unnamed: 0,Income,Family Size,Family Income,PostCode
0,34560,1 person,150658.0,E1V6V5
1,57785,2 persons,78950.0,E1V6V5
2,44055,1 person,111262.0,A0A0B7
3,35954,2 persons,64925.0,A0A0B7
4,59829,2 persons,154926.0,E4T0C1


In [12]:
df_numeric["Family Size"] = df_numeric["Family Size"].str.split(" ", expand=True)[0]
df_numeric["Family Size"] = pd.to_numeric(df_numeric["Family Size"], errors="coerce")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
df_numeric_agg = df_numeric.groupby("PostCode").agg(
    {"Family Size": np.median, 
     "Income": np.mean,
     "Family Income": np.mean,
    }
)

In [14]:
df_numeric_agg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 793411 entries, A0A0B7 to Y1A7A4
Data columns (total 3 columns):
Family Size      793411 non-null float64
Income           793411 non-null float64
Family Income    793411 non-null float64
dtypes: float64(3)
memory usage: 24.2+ MB


## Preprocess and aggregate ordinal columns

In [15]:
df_ordinal = df[ordinal_col + index]

In [16]:
df_ordinal.head()

Unnamed: 0,Age,PostCode
0,25 to 29 years,E1V6V5
1,45 to 49 years,E1V6V5
2,30 to 34 years,A0A0B7
3,35 to 39 years,A0A0B7
4,60 to 64 years,E4T0C1


In [17]:
ordered_age = {
    "Age": {
        '20 to 24 years': 1,
        '25 to 29 years': 2, 
        '30 to 34 years': 3, 
        '35 to 39 years': 4,
        '40 to 44 years': 5,
        '45 to 49 years': 6,
        '50 to 54 years': 7, 
        '55 to 59 years': 8,
        '60 to 64 years': 9,
        '65 years and over': 10, 
    }
}
df_ordinal.replace(ordered_age, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regex=regex,


In [18]:
df_ordinal["Age"] = pd.to_numeric(df_ordinal["Age"], errors="coerce")
df_ordinal_agg = df_ordinal.groupby("PostCode").agg(
    {"Age": np.mean}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
df_ordinal_agg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 793411 entries, A0A0B7 to Y1A7A4
Data columns (total 1 columns):
Age    793411 non-null float64
dtypes: float64(1)
memory usage: 12.1+ MB


In [20]:
df_agg = df_ordinal_agg.merge(df_numeric_agg, on='PostCode') \
                       .merge(df_nominal_agg, on='PostCode') \
                       .reset_index()

In [21]:
df_agg.head()

Unnamed: 0,PostCode,Age,Family Size,Income,Family Income,Immigration_Immigrants,Immigration_Non-immigrants,Gender_Female,Gender_Male,Ownership_Band housing,...,"Commute_Car, truck, van - as a passenger",Commute_Other method,Commute_Public transit,Commute_Walked,Marital Status_Divorced,Marital Status_Living common law,Marital Status_Married,Marital Status_Never married,Marital Status_Separated,Marital Status_Widowed
0,A0A0B7,3.5,1.5,40004.5,88093.5,0.0,1.0,0.5,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,A0A0C2,7.0,1.5,42080.0,104456.0,0.0,1.0,0.5,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0
2,A0A1A0,7.836066,1.0,40310.278689,93974.262295,0.04918,0.95082,0.47541,0.52459,0.0,...,0.04918,0.196721,0.032787,0.016393,0.04918,0.065574,0.590164,0.196721,0.032787,0.065574
3,A0A1B0,6.70625,1.0,63745.333333,131847.975,0.010417,0.989583,0.502083,0.497917,0.0,...,0.097917,0.0375,0.0375,0.039583,0.039583,0.064583,0.71875,0.127083,0.010417,0.039583
4,A0A1C0,6.19615,1.0,55183.269478,123971.666361,0.009166,0.990834,0.511457,0.488543,0.0,...,0.059578,0.028414,0.0,0.011916,0.019248,0.060495,0.778185,0.109074,0.013749,0.019248


In [23]:
df_agg.to_csv("data/demographics_6.csv", index=False)