# Feature Selection with Pearson Correlation

In [11]:
# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

# Dataset:  2017nat_housing_survey1.csv

Source: American Housing Survey by US Census Bureau

https://www.census.gov/programs-surveys/ahs/data/2017/ahs-2017-public-use-file--puf-/2017-ahs-metropolitan-puf-microdata.html


In [12]:
# Relative path to the housing survey CSV analysed in this notebook
file_to_load = "dataset/2017nat_housing_survey1.csv"

# Load the survey file into a pandas DataFrame and preview the first ten rows
housing_data = pd.read_csv(filepath_or_buffer=file_to_load)
housing_data.head(n=10)

Unnamed: 0,CONTROL,YRBUILT,UNITSIZE,NHQSCHOOL,TOTROOMS,NHQSCRIME,LOTSIZE,RATINGHS,RATINGNH,PORCH,BATHROOMS,BEDROOMS,BLD,MARKETVAL,OMB13CBSA
0,11000001,2000,6.0,1.0,8,2.0,2.0,10.0,10.0,1,4,3,2,307811.0,37980
1,11000002,1970,8.0,1.0,7,2.0,7.0,10.0,10.0,1,4,3,3,1005540.0,99998
2,11000005,1970,6.0,1.0,8,2.0,5.0,8.0,8.0,1,5,4,2,229992.0,99998
3,11000006,1980,4.0,1.0,5,2.0,3.0,10.0,10.0,1,3,3,2,132220.0,99998
4,11000007,1960,7.0,2.0,8,2.0,2.0,7.0,7.0,1,4,4,2,271584.0,37980
5,11000008,1919,3.0,,5,,,7.0,7.0,2,1,1,9,,99998
6,11000009,1970,5.0,,6,,2.0,,,1,3,3,2,877854.0,99998
7,11000010,1970,3.0,1.0,7,1.0,3.0,8.0,8.0,1,2,3,2,,99998
8,11000012,1960,2.0,1.0,3,1.0,,3.0,2.0,1,1,1,7,,99998
9,11000013,1980,8.0,1.0,8,2.0,3.0,8.0,8.0,1,4,3,2,573123.0,37980


In [13]:
# Count the missing entries in each column
housing_data.isna().sum()

CONTROL          0
YRBUILT          0
UNITSIZE      7857
NHQSCHOOL    14956
TOTROOMS         0
NHQSCRIME    11086
LOTSIZE      21459
RATINGHS     10809
RATINGNH     10889
PORCH            0
BATHROOMS        0
BEDROOMS         0
BLD              0
MARKETVAL    26801
OMB13CBSA        0
dtype: int64

In [14]:
# Discard every row that has at least one missing value
housing_data = housing_data.dropna(axis=0, how="any")

In [15]:
# Re-count missing entries per column to confirm none remain
housing_data.isna().sum()

CONTROL      0
YRBUILT      0
UNITSIZE     0
NHQSCHOOL    0
TOTROOMS     0
NHQSCRIME    0
LOTSIZE      0
RATINGHS     0
RATINGNH     0
PORCH        0
BATHROOMS    0
BEDROOMS     0
BLD          0
MARKETVAL    0
OMB13CBSA    0
dtype: int64

In [16]:
# Work on a copy so later filtering does not touch housing_data
housing_df = housing_data.copy()

# MARKETVAL == 9999998 is treated as an outlier value; count the rows carrying it
housing_df_out = housing_df[housing_df['MARKETVAL'].eq(9999998)]
len(housing_df_out)

14

In [19]:
# Exclude the rows whose MARKETVAL equals the outlier value 9999998
housing_df = housing_df[housing_df['MARKETVAL'].ne(9999998)]

# Keep only single family houses, which this analysis identifies by BLD == 2
housing_df = housing_df[housing_df['BLD'].eq(2)]
housing_df.head(5)

Unnamed: 0,CONTROL,YRBUILT,UNITSIZE,NHQSCHOOL,TOTROOMS,NHQSCRIME,LOTSIZE,RATINGHS,RATINGNH,PORCH,BATHROOMS,BEDROOMS,BLD,MARKETVAL,OMB13CBSA
0,11000001,2000,6.0,1.0,8,2.0,2.0,10.0,10.0,1,4,3,2,307811.0,37980
2,11000005,1970,6.0,1.0,8,2.0,5.0,8.0,8.0,1,5,4,2,229992.0,99998
3,11000006,1980,4.0,1.0,5,2.0,3.0,10.0,10.0,1,3,3,2,132220.0,99998
4,11000007,1960,7.0,2.0,8,2.0,2.0,7.0,7.0,1,4,4,2,271584.0,37980
9,11000013,1980,8.0,1.0,8,2.0,3.0,8.0,8.0,1,4,3,2,573123.0,37980


In [20]:
# Number of rows left after filtering
len(housing_df)

24186

In [29]:
# Pearson correlation between UNITSIZE and MARKETVAL over the filtered
# single-family rows; pearsonr yields the coefficient and the two-tailed p-value.
# `stats` (scipy.stats) comes from the notebook's import cell at the top.
a = housing_df['UNITSIZE']
b = housing_df['MARKETVAL']
stats.pearsonr(a, b)

(0.30078455732470966, 0.0)

In [36]:
# Narrow the dataset to a single area: rows whose OMB13CBSA code is 38060
# NOTE(review): 38060 is presumably the Phoenix metro CBSA code — confirm against the AHS codebook
housing_reduced_df = housing_df[housing_df['OMB13CBSA'].eq(38060)]

In [37]:
# Repeat the Pearson test (coefficient and two-tailed p-value) on the
# single-area subset held in housing_reduced_df
a, b = housing_reduced_df['UNITSIZE'], housing_reduced_df['MARKETVAL']
stats.pearsonr(a, b)

(0.5927297754506997, 2.35809998393238e-93)