In [1]:
import pandas as pd
import numpy as np

# Load data. 
Previously, loading the data raised a mixed-types warning for column 15 (Family Income). To remove this warning, explicitly read the Family Income column as object type.

In [2]:
# Read "Family Income" as object so pandas does not have to guess a dtype for
# a column whose raw values are of mixed types.
demographics_dtypes = {"Family Income": "object"}
data_demo = pd.read_csv("data/demographics", dtype=demographics_dtypes)

## Explore demographics data

In [5]:
# Preview the first five rows of the demographics frame.
data_demo.head(n=5)

Unnamed: 0,ID,PostCode,Demographics,Ethnicity,Religion,Language,Immigration,Commute,Marital Status,Family Size,Dwelling,Ownership,Education,Profession,Income,Family Income
0,E1V6V5-0,E1V6V5,25 to 29 years-Female,North American,Christian,English,Non-immigrants,"Car, truck, van - as a driver",Married,1 person,Single-detached house,Owned,Postsecondary,62 Health care and social assistance,34560,150658
1,E1V6V5-1,E1V6V5,45 to 49 years-Male,British,Christian,French,Non-immigrants,"Car, truck, van - as a driver",Married,2 persons,Single-detached house,Owned,Postsecondary,"56 Administrative and support, waste managemen...",57785,78950
2,A0A0B7-0,A0A0B7,30 to 34 years-Female,British,Christian,English and non-official language,Non-immigrants,"Car, truck, van - as a driver",Married,1 person,Single-detached house,Owned,College,31-33 Manufacturing,44055,111262
3,A0A0B7-1,A0A0B7,35 to 39 years-Male,North American,Christian,English and non-official language,Non-immigrants,"Car, truck, van - as a driver",Married,2 persons,Single-detached house,Owned,Postsecondary,44-45 Retail trade,35954,64925
4,E4T0C1-0,E4T0C1,60 to 64 years-Female,North American,Christian,French,Non-immigrants,"Car, truck, van - as a driver",Married,2 persons,Single-detached house,Owned,Secondary school,"21 Mining, quarrying, and oil and gas extraction",59829,154926


In [4]:
# Summarize the frame: row count, per-column dtypes and memory usage.
data_demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29115181 entries, 0 to 29115180
Data columns (total 16 columns):
ID                object
PostCode          object
Demographics      object
Ethnicity         object
Religion          object
Language          object
Immigration       object
Commute           object
Marital Status    object
Family Size       object
Dwelling          object
Ownership         object
Education         object
Profession        object
Income            int64
Family Income     object
dtypes: int64(1), object(15)
memory usage: 3.5+ GB


In [5]:
# Missing-value check: evaluates to True if any cell in any column is null.
data_demo.isna().any().any()

False

In [6]:
# Distinct values of the "Religion" column.
data_demo["Religion"].unique()

array(['Christian', 'No religious affiliation', 'Muslim', 'Buddhist',
       'Traditional (Aboriginal) Spirituality', 'Hindu', 'Jewish',
       'Other religions', 'Sikh'], dtype=object)

In [7]:
# Distinct values of the "Ethnicity" column.
data_demo["Ethnicity"].unique()

array(['North American', 'British', 'French', 'Western European',
       'Latin, Central and South American', 'Eastern European',
       'Aboriginal', 'Other European origins', 'Southern European',
       'Middle Eastern', 'African', 'Chinese', 'Caribbean', 'Oceania',
       'Northern European', 'Vietnamese', 'South Asian', 'Malaysian',
       'Korean', 'Japanese', 'Filipino', 'Indonesian'], dtype=object)

In [8]:
# Distinct values of the combined age-band/gender "Demographics" column.
data_demo["Demographics"].unique()

array(['25 to 29 years-Female', '45 to 49 years-Male',
       '30 to 34 years-Female', '35 to 39 years-Male',
       '60 to 64 years-Female', '65 years and over-Female',
       '20 to 24 years-Female', '35 to 39 years-Female',
       '40 to 44 years-Female', '45 to 49 years-Female',
       '50 to 54 years-Female', '55 to 59 years-Female',
       '20 to 24 years-Male', '25 to 29 years-Male',
       '30 to 34 years-Male', '40 to 44 years-Male',
       '50 to 54 years-Male', '55 to 59 years-Male',
       '60 to 64 years-Male', '65 years and over-Male'], dtype=object)

In [9]:
# Distinct values of the "Language" column.
data_demo["Language"].unique()

array(['English', 'French', 'English and non-official language',
       'Non-official language',
       'English, French and non-official language', 'English and French',
       'French and non-official language'], dtype=object)

In [10]:
# Distinct values of the "Immigration" column.
data_demo["Immigration"].unique()

array(['Non-immigrants', 'Immigrants'], dtype=object)

In [11]:
# Distinct values of the "Commute" column.
data_demo["Commute"].unique()

array(['Car, truck, van - as a driver',
       'Car, truck, van - as a passenger', 'Public transit',
       'Other method', 'Walked', 'Bicycle'], dtype=object)

In [12]:
# Distinct values of the "Marital Status" column.
data_demo.loc[:, "Marital Status"].unique()

array(['Married', 'Living common law', 'Never married', 'Separated',
       'Divorced', 'Widowed'], dtype=object)

In [13]:
# Distinct values of the "Family Size" column.
data_demo.loc[:, "Family Size"].unique()

array(['1 person', '2 persons', '4 persons', '5 or more persons',
       '3 persons'], dtype=object)

In [14]:
# Distinct values of the "Dwelling" column.
data_demo["Dwelling"].unique()

array(['Single-detached house', 'Other', 'Apartment'], dtype=object)

In [15]:
# Distinct values of the "Ownership" column.
data_demo["Ownership"].unique()

array(['Owned', 'Rented', 'Band housing'], dtype=object)

In [16]:
# Distinct values of the "Education" column.
data_demo["Education"].unique()

array(['Postsecondary', 'College', 'Secondary school', "Master's degree",
       "Bachelor's degree", 'No certificate, diploma or degree',
       'Doctorate', 'Degree in medicine & related'], dtype=object)

In [17]:
# Distinct values of the "Profession" column.
data_demo["Profession"].unique()

array(['62 Health care and social assistance',
       '56 Administrative and support, waste management and remediation services',
       '31-33 Manufacturing', '44-45 Retail trade',
       '21 Mining, quarrying, and oil and gas extraction',
       '61 Educational services',
       '81 Other services (except public administration)',
       '72 Accommodation and food services',
       '53 Real estate and rental and leasing',
       '54 Professional, scientific and technical services',
       '11 Agriculture, forestry, fishing and hunting',
       '51 Information and cultural industries', '41 Wholesale trade',
       '22 Utilities', '52 Finance and insurance', '23 Construction',
       '91 Public administration', '48-49 Transportation and warehousing',
       '71 Arts, entertainment and recreation',
       '55 Management of companies and enterprises'], dtype=object)

# There are four types of data:
1. Ordinal data: Family Size, Education.
2. Nominal data: Ethnicity, Religion, Language, Immigration, Commute, Marital Status, Dwelling, Ownership, Profession
3. Numerical data: Income, Family Income
4. Mixed type: Demographics -- it can be split into the ordinal feature "Age" and the nominal feature "Gender".

**First, split the Demographics feature into its ordinal part (Age) and its categorical part (Gender).**

In [3]:
# "Demographics" labels have the form "<age band>-<gender>"; split them on the
# hyphen into two new columns, "Age" and "Gender".
data_demo[["Age", "Gender"]] = data_demo["Demographics"].str.split(pat="-", expand=True)
# Remove the now-redundant "Demographics" column together with the "ID" column.
data_demo.drop(["ID", "Demographics"], axis="columns", inplace=True)

**Convert "Family Income" from object to a numeric type (values that cannot be parsed become NaN, in which case the column is float rather than int)**

In [5]:
# Parse "Family Income" (loaded as object) into a numeric dtype.
# errors="coerce" replaces unparseable entries with NaN instead of raising, so
# count the values that were coerced and report them rather than letting them
# slip silently into the saved data.
family_income_raw = data_demo["Family Income"]
family_income_num = pd.to_numeric(family_income_raw, errors="coerce")
# Only count NaNs introduced by the conversion, not values that were already null.
n_coerced = int((family_income_num.isna() & family_income_raw.notna()).sum())
if n_coerced:
    print('"Family Income": {} unparseable value(s) were set to NaN.'.format(n_coerced))
data_demo["Family Income"] = family_income_num
# Drop the temporary references so they do not keep extra data alive.
del family_income_raw, family_income_num

# Memory usage is now close to the limit (32 GB), so write the DataFrame to a CSV file (demographics_1.csv) and restart from there.

In [7]:
# Checkpoint the processed frame to disk, without writing the row index.
checkpoint_path = "data/demographics_1.csv"
data_demo.to_csv(checkpoint_path, index=False)