In [2]:
import pandas as pd
import numpy as np

# Load data. 
Previously, loading the data raised a mixed-types warning for column 15 (Family Income). To remove this warning, explicitly read the Family Income column as object type.

In [3]:
# Family Income holds mixed value types, so it is read as object to avoid
# the mixed-types warning pandas raised on earlier loads.
DEMOGRAPHICS_CSV = "data/demographics.csv"
FAMILY_INCOME_DTYPE = {"Family Income": "object"}
data_demo = pd.read_csv(DEMOGRAPHICS_CSV, dtype=FAMILY_INCOME_DTYPE)

## Explore demographics data

In [5]:
# Preview the first five rows to get a feel for the columns and their values.
data_demo.head(n=5)

Unnamed: 0,ID,PostCode,Lat,Long,Province,Population Count,Age,Gender,Ethnicity,Religion,...,Immigration,Commute,Marital Status,Family Size,Dwelling,Ownership,Education,Profession,Income,Family Income
0,V8A3P8-0,V8A3P8,49.8627,-124.5191,BC,1,40 to 44 years,Female,British,No religious affiliation,...,Native,Car (driver),Married,4 persons,Single-detached house,Owned,Bachelor's degree or above,Accommodation and food services,"$20,000 to $29,999","$25,000 to $29,999"
1,V8A3P8-1,V8A3P8,49.8627,-124.5191,BC,1,55 to 59 years,Female,Northern European,Christian,...,Native,Car (driver),Divorced,1 person,Single-detached house,Rented,Apprenticeship or trades certificate or diploma,Health care and social assistance,"$10,000 to $19,999","$15,000 to $19,999"
2,V8A3P8-2,V8A3P8,49.8627,-124.5191,BC,1,60 to 64 years,Female,British,No religious affiliation,...,Native,Car (driver),Married,2 persons,Single-detached house,Owned,Secondary school,Retail trade,"$60,000 to $69,999","$90,000 to $99,999"
3,V8A3P8-3,V8A3P8,49.8627,-124.5191,BC,1,65 years and over,Female,French,Christian,...,Native,Car (driver),Widowed,3 persons,Single-detached house,Owned,College,Health care and social assistance,"$30,000 to $39,999","$30,000 to $34,999"
4,V8A3P8-4,V8A3P8,49.8627,-124.5191,BC,1,65 years and over,Female,North American,Christian,...,Native,Car (passenger),Widowed,3 persons,Single-detached house,Rented,"No certificate, diploma or degree",Retail trade,"Under $10,000 (including loss)","$45,000 to $49,999"


In [6]:
# Summarize the frame: row count, per-column dtypes and memory usage.
data_demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29115178 entries, 0 to 29115177
Data columns (total 21 columns):
ID                  object
PostCode            object
Lat                 float64
Long                float64
Province            object
Population Count    int64
Age                 object
Gender              object
Ethnicity           object
Religion            object
Language            object
Immigration         object
Commute             object
Marital Status      object
Family Size         object
Dwelling            object
Ownership           object
Education           object
Profession          object
Income              object
Family Income       object
dtypes: float64(2), int64(1), object(18)
memory usage: 4.6+ GB


In [7]:
# Missing-value check: reduce per column first, then across columns, so the
# result is a single boolean that is True if any cell is null.
data_demo.isnull().any().any()

False

In [8]:
# Distinct values recorded in the Religion column.
data_demo["Religion"].unique()

array(['No religious affiliation', 'Christian', 'Muslim', 'Hindu',
       'Buddhist', 'Sikh', 'Traditional (Aboriginal) Spirituality',
       'Other religions', 'Jewish'], dtype=object)

In [9]:
# Distinct values recorded in the Ethnicity column.
data_demo["Ethnicity"].unique()

array(['British', 'Northern European', 'French', 'North American',
       'Western European', 'Southern European', 'Eastern European',
       'Latin, Central and South American', 'Caribbean', 'African',
       'East and Southeast Asian origins', 'Middle Eastern', 'Aboriginal',
       'South Asian'], dtype=object)

In [11]:
# Distinct province codes present in the data.
data_demo["Province"].unique()

array(['BC', 'QC', 'ON', 'NS', 'AB', 'SK', 'MB', 'NB', 'NL', 'PE', 'NT',
       'YT'], dtype=object)

In [12]:
# Distinct values recorded in the Language column.
data_demo["Language"].unique()

array(['English', 'Non-official languages', 'French'], dtype=object)

In [13]:
# Distinct values recorded in the Immigration column.
data_demo["Immigration"].unique()

array(['Native', 'Immigrants'], dtype=object)

In [14]:
# Distinct values recorded in the Commute column.
data_demo["Commute"].unique()

array(['Car (driver)', 'Car (passenger)', 'Public transit', 'Bicycle',
       'Walked', 'Other method'], dtype=object)

In [15]:
# Distinct marital statuses. Bracket access is needed here because the
# column name contains a space and cannot be used as an attribute.
data_demo["Marital Status"].unique()

array(['Married', 'Divorced', 'Widowed', 'Living common law',
       'Never married', 'Separated'], dtype=object)

In [16]:
# Distinct family-size categories. Bracket access is needed here because the
# column name contains a space and cannot be used as an attribute.
data_demo["Family Size"].unique()

array(['4 persons', '1 person', '2 persons', '3 persons',
       '5 or more persons'], dtype=object)

In [17]:
# Distinct values recorded in the Dwelling column.
data_demo["Dwelling"].unique()

array(['Single-detached house', 'Other', 'Apartment'], dtype=object)

In [18]:
# Distinct values recorded in the Ownership column.
data_demo["Ownership"].unique()

array(['Owned', 'Rented'], dtype=object)

In [19]:
# Distinct values recorded in the Education column.
data_demo["Education"].unique()

array(["Bachelor's degree or above",
       'Apprenticeship or trades certificate or diploma',
       'Secondary school', 'College', 'No certificate, diploma or degree'],
      dtype=object)

In [20]:
# Distinct values recorded in the Profession column.
data_demo["Profession"].unique()

array(['Accommodation and food services',
       'Health care and social assistance', 'Retail trade',
       'Manufacturing',
       'Administrative and support, waste management and remediation services',
       'Construction', 'Other services (except public administration)',
       'Finance and insurance', 'Public administration',
       'Educational services', 'Wholesale trade',
       'Transportation and warehousing',
       'Professional, scientific and technical services',
       'Real estate and rental and leasing',
       'Agriculture, forestry, fishing and hunting',
       'Arts, entertainment and recreation',
       'Mining, quarrying, and oil and gas extraction',
       'Information and cultural industries', 'Utilities',
       'Management of companies and enterprises'], dtype=object)

In [21]:
# Distinct age brackets present in the data.
data_demo["Age"].unique()

array(['40 to 44 years', '55 to 59 years', '60 to 64 years',
       '65 years and over', '25 to 29 years', '20 to 24 years',
       '30 to 34 years', '45 to 49 years', '50 to 54 years',
       '35 to 39 years'], dtype=object)

In [22]:
# Distinct values recorded in the Gender column.
data_demo["Gender"].unique()

array(['Female', 'Male'], dtype=object)

# The columns fall into the following types of data:
1. Ordinal data: Family Size, Education.
2. Nominal data: Ethnicity, Religion, Language, Immigration, Commute, Marital Status, Dwelling, Ownership, Profession.
3. Numerical data (recorded as binned dollar ranges, e.g. "$20,000 to $29,999"): Income, Family Income.
