# Feature Selection with Pearson Correlation

In [1]:
# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

# Dataset:  2017nat_housing_survey1.csv

Source: American Housing Survey by US Census Bureau

https://www.census.gov/programs-surveys/ahs/data/2017/ahs-2017-public-use-file--puf-/2017-ahs-metropolitan-puf-microdata.html


In [2]:
# Relative path to the 2017 housing survey CSV extract
file_to_load = "dataset/2017nat_housing_survey1.csv"

# Parse the CSV into a pandas DataFrame, then preview its first ten rows
housing_data = pd.read_csv(filepath_or_buffer=file_to_load)
housing_data.head(n=10)

Unnamed: 0,CONTROL,YRBUILT,UNITSIZE,NHQSCHOOL,TOTROOMS,NHQSCRIME,LOTSIZE,RATINGHS,RATINGNH,PORCH,BATHROOMS,BEDROOMS,BLD,MARKETVAL,OMB13CBSA
0,11000001,2000,6.0,1.0,8,2.0,2.0,10.0,10.0,1,4,3,2,307811.0,37980
1,11000002,1970,8.0,1.0,7,2.0,7.0,10.0,10.0,1,4,3,3,1005540.0,99998
2,11000005,1970,6.0,1.0,8,2.0,5.0,8.0,8.0,1,5,4,2,229992.0,99998
3,11000006,1980,4.0,1.0,5,2.0,3.0,10.0,10.0,1,3,3,2,132220.0,99998
4,11000007,1960,7.0,2.0,8,2.0,2.0,7.0,7.0,1,4,4,2,271584.0,37980
5,11000008,1919,3.0,,5,,,7.0,7.0,2,1,1,9,,99998
6,11000009,1970,5.0,,6,,2.0,,,1,3,3,2,877854.0,99998
7,11000010,1970,3.0,1.0,7,1.0,3.0,8.0,8.0,1,2,3,2,,99998
8,11000012,1960,2.0,1.0,3,1.0,,3.0,2.0,1,1,1,7,,99998
9,11000013,1980,8.0,1.0,8,2.0,3.0,8.0,8.0,1,4,3,2,573123.0,37980


In [3]:
# Count the missing (NaN) entries in each column of the freshly loaded frame
housing_data.isna().sum()

CONTROL          0
YRBUILT          0
UNITSIZE      7857
NHQSCHOOL    14956
TOTROOMS         0
NHQSCRIME    11086
LOTSIZE      21459
RATINGHS     10809
RATINGNH     10889
PORCH            0
BATHROOMS        0
BEDROOMS         0
BLD              0
MARKETVAL    26801
OMB13CBSA        0
dtype: int64

In [4]:
# Keep only complete rows: discard every row that has at least one missing value
housing_data = housing_data.dropna(axis=0, how="any")

In [5]:
# Re-count missing values per column to confirm none remain after the drop
housing_data.isna().sum()

CONTROL      0
YRBUILT      0
UNITSIZE     0
NHQSCHOOL    0
TOTROOMS     0
NHQSCRIME    0
LOTSIZE      0
RATINGHS     0
RATINGNH     0
PORCH        0
BATHROOMS    0
BEDROOMS     0
BLD          0
MARKETVAL    0
OMB13CBSA    0
dtype: int64

In [6]:
# Work on an independent (deep) copy so later steps leave housing_data untouched
housing_df = housing_data.copy(deep=True)

In [20]:
# Number of rows in the working frame
housing_df.shape[0]

24186

In [9]:
# Pearson correlation coefficient and two-tailed p-value: YRBUILT vs. MARKETVAL
a, b = housing_df['YRBUILT'], housing_df['MARKETVAL']
stats.pearsonr(a, b)

(-0.012041295003353387, 0.04747421946210473)

In [8]:
# Pearson correlation coefficient and two-tailed p-value: UNITSIZE vs. MARKETVAL
a, b = housing_df['UNITSIZE'], housing_df['MARKETVAL']
stats.pearsonr(a, b)

(0.28810427428324364, 0.0)

In [11]:
# Pearson correlation coefficient and two-tailed p-value: TOTROOMS vs. MARKETVAL
a, b = housing_df['TOTROOMS'], housing_df['MARKETVAL']
stats.pearsonr(a, b)

(0.2843738701848939, 0.0)

In [12]:
# Pearson correlation coefficient and two-tailed p-value: LOTSIZE vs. MARKETVAL
a, b = housing_df['LOTSIZE'], housing_df['MARKETVAL']
stats.pearsonr(a, b)

(-0.024237111090246333, 6.608277851596298e-05)

In [13]:
# Pearson correlation coefficient and two-tailed p-value: RATINGNH vs. MARKETVAL
a, b = housing_df['RATINGNH'], housing_df['MARKETVAL']
stats.pearsonr(a, b)

(0.10649384239421161, 3.5884178112492e-69)

In [14]:
# Pearson correlation coefficient and two-tailed p-value: BATHROOMS vs. MARKETVAL
a, b = housing_df['BATHROOMS'], housing_df['MARKETVAL']
stats.pearsonr(a, b)

(0.3010126297073288, 0.0)

In [15]:
# Pearson correlation coefficient and two-tailed p-value: BEDROOMS vs. MARKETVAL
a, b = housing_df['BEDROOMS'], housing_df['MARKETVAL']
stats.pearsonr(a, b)

(0.2336746522473547, 0.0)