# Tannis McCartney
## June 27, 2022
## Additional analysis/visuals for Tableau

#### Table of Contents
01 Import Libraries

02 Import Dataset

03 Population of Canada by Province

04 Affordable Housing by Province

05 After-tax income brackets

06 Age group

07 Household size

08 Low income flags

09 Disposable income flag

10 Ownership of dwelling

11 Affordable housing

12 Total household costs flag

# 01 Import libraries

In [1]:
# Import libraries for analysis
import pandas as pd
import numpy as np
import os

In [2]:
# Import visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import os

# 02 Import data

In [3]:
# Turn project folder path into a string
path = r'C:\Users\tmmcc\Google Drive\Data Analytics Bootcamp\6 Advanced Analytics and Dashboard Design\Nova Scotia Housing and Income'  

In [4]:
# Load 2018 Canadian Income Survey
CIS2018 = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'CIS2018_subset.pkl'))
CIS2018.head()

Unnamed: 0,household_id,person_id,sample_weight,province,age_group,gender,marital_status,after_tax_income,household_size,household_composition,...,disposable_income_below_mbm_flag,ownership_of_dwelling,monthly_mortgage_payment,monthly_condo_fee,monthly_rent,rent_subsidy_flag,core_housing_need_indicator,adult_food_security_status,child_food_security_status,household_food_security_status
0,2129,212901,208.9708,Ontario,25 to 29 years,Female,Common-law,44265.0,2,Two or more person household/One economic family,...,No,Owned by a member of the household,2000.0,,,Valid skip,Not in core housing need,Food secure,Valid skip,Food secure
1,2129,212902,208.9708,Ontario,25 to 29 years,Male,Common-law,42985.0,2,Two or more person household/One economic family,...,No,Owned by a member of the household,2000.0,,,Valid skip,Not in core housing need,Food secure,Valid skip,Food secure
2,2130,213001,1101.2217,British Columbia,45 to 49 years,Female,Common-law,82370.0,2,Two or more person household/One economic family,...,No,Not owned by a member of the household,,,2900.0,No,Not in core housing need,Food secure,Valid skip,Food secure
3,2130,213002,1101.2217,British Columbia,55 to 59 years,Female,Common-law,49105.0,2,Two or more person household/One economic family,...,No,Not owned by a member of the household,,,2900.0,No,Not in core housing need,Food secure,Valid skip,Food secure
4,2131,213101,165.3016,Saskatchewan,55 to 59 years,Female,Married,21000.0,2,Two or more person household/One economic family,...,No,Owned by a member of the household,1600.0,,,Valid skip,Not in core housing need,Food secure,Valid skip,Food secure


In [5]:
# Check sum of sample_weight column: it should approximate the population of Canada
CIS2018['sample_weight'].sum()

36329666.0139

# 03 Population of Canada by Province

In [6]:
# Apply weight to population numbers and convert series to dataframe
pop = CIS2018.groupby('province')['sample_weight'].sum()
pop = pop.to_frame()

In [7]:
# Rename columns
pop = pop.rename(columns={'sample_weight':'population'})

In [8]:
# View data
round(pop)

Unnamed: 0_level_0,population
province,Unnamed: 1_level_1
Alberta,4295760.0
British Columbia,4747510.0
Manitoba,1260815.0
New Brunswick,737522.0
Newfoundland and Labrador,514230.0
Nova Scotia,934556.0
Ontario,14262158.0
Prince Edward Island,151906.0
Quebec,8333951.0
Saskatchewan,1091258.0


In [9]:
# Export weighted table 
pop.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'Canada_population.csv'))

# 04 Affordable Housing

In [10]:
df = CIS2018.loc[CIS2018['household_maj_income_flag']=='Yes']
df.shape

(40857, 24)

In [11]:
# Calculate 35% of after-tax income and divide by twelve
df['atinc35'] = df['after_tax_income'].apply(lambda x: x*.02917)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['atinc35'] = df['after_tax_income'].apply(lambda x: x*.02917)


In [12]:
# Calculate total housing costs (mortgage, mortgage + condo fees, or rent)
df['total_household_costs'] = df[['monthly_mortgage_payment', 'monthly_condo_fee', 'monthly_rent']].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_household_costs'] = df[['monthly_mortgage_payment', 'monthly_condo_fee', 'monthly_rent']].sum(axis=1)


In [13]:
# Check data
df.head(50)

Unnamed: 0,household_id,person_id,sample_weight,province,age_group,gender,marital_status,after_tax_income,household_size,household_composition,...,monthly_mortgage_payment,monthly_condo_fee,monthly_rent,rent_subsidy_flag,core_housing_need_indicator,adult_food_security_status,child_food_security_status,household_food_security_status,atinc35,total_household_costs
0,2129,212901,208.9708,Ontario,25 to 29 years,Female,Common-law,44265.0,2,Two or more person household/One economic family,...,2000.0,,,Valid skip,Not in core housing need,Food secure,Valid skip,Food secure,1291.21005,2000.0
2,2130,213001,1101.2217,British Columbia,45 to 49 years,Female,Common-law,82370.0,2,Two or more person household/One economic family,...,,,2900.0,No,Not in core housing need,Food secure,Valid skip,Food secure,2402.7329,2900.0
5,2131,213102,165.3016,Saskatchewan,55 to 59 years,Male,Married,112195.0,2,Two or more person household/One economic family,...,1600.0,,,Valid skip,Not in core housing need,Food secure,Valid skip,Food secure,3272.72815,1600.0
6,2132,213201,126.5388,Manitoba,35 to 39 years,Male,Married,70725.0,4,Two or more person household/One economic family,...,1700.0,,,Valid skip,Not in core housing need,Food secure,Food secure,Food secure,2063.04825,1700.0
11,2133,213302,119.6088,Nova Scotia,35 to 39 years,Female,Married,32160.0,3,Two or more person household/One economic family,...,400.0,,,Valid skip,Not in core housing need,Food secure,Food secure,Food secure,938.1072,400.0
14,2134,213402,152.1021,Nova Scotia,25 to 29 years,Male,Not stated,33325.0,3,Two or more person household/One economic family,...,1100.0,,,Valid skip,Not in core housing need,Food secure,Food secure,Food secure,972.09025,1100.0
16,2135,213501,967.0087,Ontario,65 to 69 years,Female,Not stated,59095.0,2,Two or more person household/One economic family,...,,,,Valid skip,Not in core housing need,Food secure,Valid skip,Food secure,1723.80115,0.0
18,2136,213601,55.8415,New Brunswick,30 to 34 years,Female,Single (never married),64560.0,5,Two or more person household/One economic family,...,,,500.0,Not stated,Valid skip,Severely food insecure,Moderately food insecure,Severely food insecure,1883.2152,500.0
24,2137,213702,1329.0825,Ontario,30 to 34 years,Female,Married,65155.0,2,Two or more person household/One economic family,...,2100.0,,,Valid skip,Not in core housing need,Food secure,Valid skip,Food secure,1900.57135,2100.0
26,2138,213802,340.4773,Ontario,40 to 44 years,Female,Married,69590.0,3,Two or more person household/One economic family,...,3300.0,,,Valid skip,Not in core housing need,Food secure,Food secure,Food secure,2029.9403,3300.0


In [14]:
# Set affordable_housing_flag
df.loc[(df['total_household_costs'] <= df['atinc35']), 'affordable_housing_flag'] = 'Yes'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[(df['total_household_costs'] <= df['atinc35']), 'affordable_housing_flag'] = 'Yes'


In [15]:
# Set non-affordable housing flag
df.loc[(df['total_household_costs'] > df['atinc35']), 'affordable_housing_flag'] = 'No'

In [16]:
# Check frequency table for affordable_housing_flag
df['affordable_housing_flag'].value_counts()

Yes    33356
No      7501
Name: affordable_housing_flag, dtype: int64

In [17]:
# Apply weight to affordable_housing_costs
df1 = df.groupby('affordable_housing_flag')['sample_weight'].sum()
round(df1.head())

affordable_housing_flag
No      3705641.0
Yes    11044703.0
Name: sample_weight, dtype: float64

In [18]:
# Group by province
grouped = df.groupby(df.province)

In [19]:
AB = grouped.get_group("Alberta")

In [20]:
BC = grouped.get_group("British Columbia")

In [21]:
SK = grouped.get_group("Saskatchewan")

In [22]:
MB = grouped.get_group("Manitoba")

In [23]:
ON = grouped.get_group("Ontario")

In [24]:
QE = grouped.get_group("Quebec")

In [25]:
NB = grouped.get_group("New Brunswick")

In [26]:
NS = grouped.get_group("Nova Scotia")

In [27]:
PE = grouped.get_group("Prince Edward Island")

In [28]:
NL = grouped.get_group("Newfoundland and Labrador")

In [29]:
# Calculate weighted totals of affordable housing flag

In [30]:
AB_ahf = AB.groupby('affordable_housing_flag')['sample_weight'].sum()
AB_ahf = AB_ahf.rename('Alberta')
round(AB_ahf.head())

affordable_housing_flag
No      370504.0
Yes    1242276.0
Name: Alberta, dtype: float64

In [31]:
BC_ahf = BC.groupby('affordable_housing_flag')['sample_weight'].sum()
BC_ahf = BC_ahf.rename('British Columbia')
round(BC_ahf.head())

affordable_housing_flag
No      630707.0
Yes    1333746.0
Name: British Columbia, dtype: float64

In [32]:
MB_ahf = MB.groupby('affordable_housing_flag')['sample_weight'].sum()
MB_ahf = MB_ahf.rename('Manitoba')
round(MB_ahf.head())

affordable_housing_flag
No     118262.0
Yes    380639.0
Name: Manitoba, dtype: float64

In [33]:
SK_ahf = SK.groupby('affordable_housing_flag')['sample_weight'].sum()
SK_ahf = SK_ahf.rename('Saskatchewan')
round(SK_ahf.head())

affordable_housing_flag
No      88074.0
Yes    354245.0
Name: Saskatchewan, dtype: float64

In [34]:
ON_ahf = ON.groupby('affordable_housing_flag')['sample_weight'].sum()
ON_ahf = ON_ahf.rename('Ontario')
round(ON_ahf.head())

affordable_housing_flag
No     1738628.0
Yes    3813378.0
Name: Ontario, dtype: float64

In [35]:
QE_ahf = QE.groupby('affordable_housing_flag')['sample_weight'].sum()
QE_ahf = QE_ahf.rename('Quebec')
round(QE_ahf.head())

affordable_housing_flag
No      615417.0
Yes    3049299.0
Name: Quebec, dtype: float64

In [36]:
NB_ahf = NB.groupby('affordable_housing_flag')['sample_weight'].sum()
NB_ahf = NB_ahf.rename('New Brunswick')
round(NB_ahf.head())

affordable_housing_flag
No      33691.0
Yes    289520.0
Name: New Brunswick, dtype: float64

In [37]:
NS_ahf = NS.groupby('affordable_housing_flag')['sample_weight'].sum()
NS_ahf = NS_ahf.rename('Nova Scotia')
round(NS_ahf.head())

affordable_housing_flag
No      72963.0
Yes    337878.0
Name: Nova Scotia, dtype: float64

In [38]:
PE_ahf = PE.groupby('affordable_housing_flag')['sample_weight'].sum()
PE_ahf = PE_ahf.rename('Prince Edward Island')
round(PE_ahf.head())

affordable_housing_flag
No      8874.0
Yes    53418.0
Name: Prince Edward Island, dtype: float64

In [39]:
NL_ahf = NL.groupby('affordable_housing_flag')['sample_weight'].sum()
NL_ahf = NL_ahf.rename('Newfoundland and Labrador')
round(NL_ahf.head())

affordable_housing_flag
No      28520.0
Yes    190305.0
Name: Newfoundland and Labrador, dtype: float64

In [40]:
ahf_pop = pd.concat([AB_ahf, BC_ahf, SK_ahf, MB_ahf, ON_ahf, QE_ahf, NB_ahf, NS_ahf, PE_ahf, NL_ahf], axis=1).reset_index()
round(ahf_pop.head())

Unnamed: 0,affordable_housing_flag,Alberta,British Columbia,Saskatchewan,Manitoba,Ontario,Quebec,New Brunswick,Nova Scotia,Prince Edward Island,Newfoundland and Labrador
0,No,370504.0,630707.0,88074.0,118262.0,1738628.0,615417.0,33691.0,72963.0,8874.0,28520.0
1,Yes,1242276.0,1333746.0,354245.0,380639.0,3813378.0,3049299.0,289520.0,337878.0,53418.0,190305.0


In [41]:
# Transpose dataframe
ahf_pop_T = ahf_pop.transpose().reset_index()
ahf_pop_T.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   11 non-null     object
 1   0       11 non-null     object
 2   1       11 non-null     object
dtypes: object(3)
memory usage: 392.0+ bytes


In [42]:
ahf_pop_T = ahf_pop_T.rename(columns={
    'index': 'province',
    0 : 'not affordable',
    1 : 'affordable'
})

ahf_pop_T

Unnamed: 0,province,not affordable,affordable
0,affordable_housing_flag,No,Yes
1,Alberta,370504.4949,1242275.5083
2,British Columbia,630707.238,1333745.763
3,Saskatchewan,88073.8759,354245.1225
4,Manitoba,118261.8779,380639.1179
5,Ontario,1738627.5505,3813378.4483
6,Quebec,615417.3774,3049298.6221
7,New Brunswick,33691.4132,289519.5898
8,Nova Scotia,72963.0334,337877.9689
9,Prince Edward Island,8873.825,53418.1746


In [43]:
# Drop first row
ahf_pop_T = ahf_pop_T.iloc[1: , :]
ahf_pop_T

Unnamed: 0,province,not affordable,affordable
1,Alberta,370504.4949,1242275.5083
2,British Columbia,630707.238,1333745.763
3,Saskatchewan,88073.8759,354245.1225
4,Manitoba,118261.8779,380639.1179
5,Ontario,1738627.5505,3813378.4483
6,Quebec,615417.3774,3049298.6221
7,New Brunswick,33691.4132,289519.5898
8,Nova Scotia,72963.0334,337877.9689
9,Prince Edward Island,8873.825,53418.1746
10,Newfoundland and Labrador,28520.0973,190304.903


In [44]:
# Export dataframe to Excel
ahf_pop_T.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'canada_affordable_housing.csv'), index=False)

# 05 After-tax income brackets

In [45]:
# Set income brackets
NS.loc[(NS['after_tax_income'] < 40000), 'income_bracket'] = '< 40k'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS.loc[(NS['after_tax_income'] < 40000), 'income_bracket'] = '< 40k'


In [46]:
NS.loc[((NS['after_tax_income'] >= 40000) & (NS['after_tax_income'] < 60000)), 'income_bracket'] = '40k - 60k'

In [47]:
NS.loc[((NS['after_tax_income'] >= 60000) & (NS['after_tax_income'] < 80000)), 'income_bracket'] = '60k - 80k'

In [48]:
NS.loc[((NS['after_tax_income'] >= 80000) & (NS['after_tax_income'] < 100000)), 'income_bracket'] = '80k - 100k'

In [49]:
NS.loc[((NS['after_tax_income'] >= 100000) & (NS['after_tax_income'] < 200000)), 'income_bracket'] = '100k - 200k'

In [50]:
NS.loc[(NS['after_tax_income'] > 200000), 'income_bracket'] = '> 200k'

In [51]:
# View frequency table of income_flag
NS['income_bracket'].value_counts()

< 40k          1134
40k - 60k       655
60k - 80k       335
80k - 100k      144
100k - 200k      89
> 200k            9
Name: income_bracket, dtype: int64

In [52]:
# Calculate weighted totals of income_bracket
NS_wt_income = NS.groupby('income_bracket')['sample_weight'].sum()
round(NS_wt_income.head())

income_bracket
100k - 200k     15320.0
40k - 60k      114366.0
60k - 80k       57288.0
80k - 100k      25808.0
< 40k          196041.0
Name: sample_weight, dtype: float64

In [53]:
# Convert series to dataframe
NS_wt_income = pd.concat([NS_wt_income], axis=1).reset_index()
NS_wt_income = NS_wt_income.rename(columns={'sample_weight':'count'})
round(NS_wt_income.head())

Unnamed: 0,income_bracket,count
0,100k - 200k,15320.0
1,40k - 60k,114366.0
2,60k - 80k,57288.0
3,80k - 100k,25808.0
4,< 40k,196041.0


In [54]:
# Export series
NS_wt_income.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'NS_income_bracket.csv'), index=False)

# 06 Age Group

In [55]:
# Calculate weighted totals of age_group
NS_wt_age = NS.groupby('age_group')['sample_weight'].sum()
round(NS_wt_age.head())

age_group
18 to 24 years    15299.0
25 to 29 years    27645.0
30 to 34 years    36957.0
35 to 39 years    30407.0
40 to 44 years    27864.0
Name: sample_weight, dtype: float64

In [56]:
# Convert series to dataframe
NS_wt_age = pd.concat([NS_wt_age], axis=1).reset_index()
NS_wt_age = NS_wt_age.rename(columns={'sample_weight':'count'})
round(NS_wt_age.head())

Unnamed: 0,age_group,count
0,18 to 24 years,15299.0
1,25 to 29 years,27645.0
2,30 to 34 years,36957.0
3,35 to 39 years,30407.0
4,40 to 44 years,27864.0


In [57]:
# Export series to dataframe
NS_wt_age.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'NS_age_groups.csv'), index=False)

# 07 Household Size

In [58]:
# Calculate weighted totals of household size
NS_wt_hh_size = NS.groupby('household_size')['sample_weight'].sum()
round(NS_wt_hh_size.head())

household_size
1    121870.0
2    162789.0
3     50493.0
4     50871.0
5     17921.0
Name: sample_weight, dtype: float64

In [59]:
# Convert series to dataframe
NS_wt_hh_size = pd.concat([NS_wt_hh_size], axis=1).reset_index()
NS_wt_hh_size = NS_wt_hh_size.rename(columns={'sample_weight':'count'})
round(NS_wt_hh_size.head())

Unnamed: 0,household_size,count
0,1,121870.0
1,2,162789.0
2,3,50493.0
3,4,50871.0
4,5,17921.0


In [60]:
# Export series to dataframe
NS_wt_hh_size.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'NS_household_size.csv'), index=False)

# 08 Before- and After-tax low-income flags

In [61]:
# Calculate weighted totals of low-income flag
NS_wt_atli_flag = NS.groupby('after_tax_low_income_flag')['sample_weight'].sum()
round(NS_wt_atli_flag.head())

after_tax_low_income_flag
No     377539.0
Yes     33302.0
Name: sample_weight, dtype: float64

In [62]:
# Convert series to dataframe
NS_wt_atli_flag = pd.concat([NS_wt_atli_flag], axis=1).reset_index()
NS_wt_atli_flag = NS_wt_atli_flag.rename(columns={'sample_weight':'count'})
round(NS_wt_atli_flag.head())

Unnamed: 0,after_tax_low_income_flag,count
0,No,377539.0
1,Yes,33302.0


In [63]:
# Export series to dataframe
NS_wt_atli_flag.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'NS_atinc_low.csv'), index=False)

In [64]:
# Calculate weighted totals of low-income flag
NS_wt_btli_flag = NS.groupby('before_tax_low_income_flag')['sample_weight'].sum()
round(NS_wt_btli_flag.head())

before_tax_low_income_flag
No     360769.0
Yes     50072.0
Name: sample_weight, dtype: float64

In [65]:
# Convert series to dataframe
NS_wt_btli_flag = pd.concat([NS_wt_btli_flag], axis=1).reset_index()
NS_wt_btli_flag = NS_wt_btli_flag.rename(columns={'sample_weight':'count'})
round(NS_wt_btli_flag.head())

Unnamed: 0,before_tax_low_income_flag,count
0,No,360769.0
1,Yes,50072.0


In [66]:
# Export series to dataframe
NS_wt_btli_flag.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'NS_btinc_low.csv'), index=False)

# 09 Disposable income... flag

In [67]:
# Calculate weighted totals of disposable_income... flag
NS_wt_disposable_flag = NS.groupby('disposable_income_below_mbm_flag')['sample_weight'].sum()
round(NS_wt_disposable_flag.head())

disposable_income_below_mbm_flag
No     347846.0
Yes     62995.0
Name: sample_weight, dtype: float64

In [68]:
# Convert series to dataframe
NS_wt_disposable_flag = pd.concat([NS_wt_disposable_flag], axis=1).reset_index()
NS_wt_disposable_flag = NS_wt_disposable_flag.rename(columns={'sample_weight':'count'})
round(NS_wt_disposable_flag.head())

Unnamed: 0,disposable_income_below_mbm_flag,count
0,No,347846.0
1,Yes,62995.0


In [69]:
# Export series to dataframe
NS_wt_disposable_flag.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'NS_mbm.csv'), index=False)

# 10 Ownership of dwelling

In [70]:
# Calculate weighted totals of ownership of dwelling
NS_wt_ownership = NS.groupby('ownership_of_dwelling')['sample_weight'].sum()
round(NS_wt_ownership.head())

ownership_of_dwelling
Not owned by a member of the household    126539.0
Owned by a member of the household        284302.0
Name: sample_weight, dtype: float64

In [71]:
# Convert series to dataframe
NS_wt_ownership = pd.concat([NS_wt_ownership], axis=1).reset_index()
NS_wt_ownership = NS_wt_ownership.rename(columns={'sample_weight':'count'})
round(NS_wt_ownership.head())

Unnamed: 0,ownership_of_dwelling,count
0,Not owned by a member of the household,126539.0
1,Owned by a member of the household,284302.0


In [72]:
# Export series to dataframe
NS_wt_ownership.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'NS_ownership.csv'), index=False)

# 11 Affordable Housing

In [73]:
# Calculate weighted totals of affordable housing flag
NS_ahf2 = NS.groupby('affordable_housing_flag')['sample_weight'].sum()
round(NS_ahf2.head())

affordable_housing_flag
No      72963.0
Yes    337878.0
Name: sample_weight, dtype: float64

In [74]:
# Convert series to dataframe
NS_ahf2 = pd.concat([NS_ahf2], axis=1).reset_index()
NS_ahf2 = NS_ahf2.rename(columns={'sample_weight':'count'})
round(NS_ahf2.head())

Unnamed: 0,affordable_housing_flag,count
0,No,72963.0
1,Yes,337878.0


In [75]:
# Export series to dataframe
NS_ahf2.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'NS_affordable.csv'), index=False)

# 12 Total household costs flag

In [76]:
# Look at total household costs
NS['total_household_costs'].min()

0.0

In [77]:
# Look at total household costs
NS['total_household_costs'].max()

2900.0

In [78]:
# Set household costs flag brackets
NS.loc[(NS['total_household_costs'] == 0), 'housing_costs_flag'] = 'House paid in full'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NS.loc[(NS['total_household_costs'] == 0), 'housing_costs_flag'] = 'House paid in full'


In [79]:
NS.loc[((NS['total_household_costs'] > 0) & (NS['total_household_costs'] < 800)), 'housing_costs_flag'] = '< $800/month'

In [80]:
NS.loc[((NS['total_household_costs'] >= 800) & (NS['total_household_costs'] < 1600)), 'housing_costs_flag'] = '$800 - $1600 /month'

In [81]:
NS.loc[((NS['total_household_costs'] >= 1600) & (NS['total_household_costs'] < 2400)), 'housing_costs_flag'] = '$1600 - $2400 /month'

In [82]:
NS.loc[(NS['total_household_costs'] > 2400), 'housing_costs_flag'] = '> $2400 /month'

In [83]:
# View frequency table of income_flag
NS['housing_costs_flag'].value_counts()

House paid in full      864
$800 - $1600 /month     712
< $800/month            648
$1600 - $2400 /month    118
> $2400 /month           19
Name: housing_costs_flag, dtype: int64

In [84]:
# Calculate weighted totals of housing_costs_flag
NS_housing_flag = NS.groupby('housing_costs_flag')['sample_weight'].sum()
round(NS_housing_flag.head())

housing_costs_flag
$1600 - $2400 /month     25721.0
$800 - $1600 /month     137426.0
< $800/month            106793.0
> $2400 /month            3920.0
House paid in full      136134.0
Name: sample_weight, dtype: float64

In [85]:
# Convert series to dataframe
NS_housing_flag = pd.concat([NS_housing_flag], axis=1).reset_index()
NS_housing_flag = NS_housing_flag.rename(columns={'sample_weight':'count'})
round(NS_housing_flag.head())

Unnamed: 0,housing_costs_flag,count
0,$1600 - $2400 /month,25721.0
1,$800 - $1600 /month,137426.0
2,< $800/month,106793.0
3,> $2400 /month,3920.0
4,House paid in full,136134.0


In [86]:
# Export series to dataframe
NS_housing_flag.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'NS_housing.csv'), index=False)

In [88]:
# Export NS dataframe
NS.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'NS.csv'), index=False)