In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from random import gauss
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats as stats

%matplotlib inline
# Notebook-wide pandas display settings; they apply to every DataFrame rendered below.
pd.options.display.max_columns = None  # never truncate the column list when showing a frame
pd.options.display.float_format = '{:.2f}'.format  # render floats with two decimals

In [2]:
# Read the raw house-sales CSV; `df` is the unmodified starting frame for the notebook.
df = pd.read_csv(filepath_or_buffer='./data/kc_house_data.csv')

In [3]:
# Peek at the first five rows to see the columns and typical values.
df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,NONE,Average,7 Average,1180,0.0,1955,0.0,98178,47.51,-122.26,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,Average,7 Average,2170,400.0,1951,1991.0,98125,47.72,-122.32,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,NO,NONE,Average,6 Low Average,770,0.0,1933,,98028,47.74,-122.23,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,NO,NONE,Very Good,7 Average,1050,910.0,1965,0.0,98136,47.52,-122.39,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,NO,NONE,Average,8 Good,1680,0.0,1987,0.0,98074,47.62,-122.05,1800,7503


In [4]:
# Column dtypes and non-null counts for the raw frame.
# In the output below, waterfront, view and yr_renovated have fewer non-null values
# than the 21597 rows, and sqft_basement is stored as object rather than a number.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  object 
 9   view           21534 non-null  object 
 10  condition      21597 non-null  object 
 11  grade          21597 non-null  object 
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

In [5]:
# Frequency of each `condition` label.
# It is difficult to base a recommendation on this column, since there are many ways
# to define these terms; use `grade` instead, which is a little less vague.
df.condition.value_counts()

Average      14020
Good          5677
Very Good     1701
Fair           170
Poor            29
Name: condition, dtype: int64

In [6]:
# Frequency of each renovation year (0 is by far the most common value).
# The column is a candidate for dropping based on the number of zeros.
# Potentially compare the zeros to the houses that have been renovated to see
# whether there is a significant difference.
df.yr_renovated.value_counts()

0.00       17011
2014.00       73
2003.00       31
2013.00       31
2007.00       30
           ...  
1946.00        1
1959.00        1
1971.00        1
1951.00        1
1954.00        1
Name: yr_renovated, Length: 70, dtype: int64

In [40]:
# `view` might be an important predictor, so keep every row and label the missing
# values explicitly ('NO RECORD') instead of dropping them for now.
# Work on a copy so the raw `df` stays untouched.
df_copy = df.copy()
df_copy['view'] = df_copy['view'].fillna('NO RECORD')
# Show the filled column (the previous last line referenced an undefined name,
# `df_drop_views`, which raises NameError on a fresh kernel).
df_copy['view']

0        NONE
1        NONE
2        NONE
3        NONE
4        NONE
         ... 
21592    NONE
21593    NONE
21594    NONE
21595    NONE
21596    NONE
Name: view, Length: 21597, dtype: object

In [42]:
# Keep only the numeric columns of the working copy; object-typed columns
# (e.g. date, grade, view) are left out.
numbers = df_copy.select_dtypes(include=np.number)

In [43]:
# Remove the identifier and location columns from the numeric frame.
df_2 = numbers.drop(columns=['id', 'lat', 'long', 'zipcode'])

In [10]:
#picture = sns.pairplot(numbers)

In [11]:
#fig, ax = plt.subplots(figsize=(10,10))
#heat = sns.heatmap(df_2.corr(), annot = True)
#plt.savefig('heat.png', bbox_inches='tight')

In [12]:
#visually, the prices for average, fair and good look about the same, so we would need to run a significance test to check
#excellent and none look potentially significant
#use ANOVA here, followed by post-hoc tests

In [44]:
# Fill missing renovation years with 0, the value already used for most rows.
# Assign the result back rather than calling fillna(inplace=True) on the selected
# column: the in-place form is chained assignment, which newer pandas warns about
# and which does not update `df` when Copy-on-Write is enabled.
# NOTE(review): df_copy was created with df.copy() in an earlier cell, so this fill
# does not reach df_copy or the frames derived from it.
# This might be revisited in a next step; for now the column can be dropped.
df['yr_renovated'] = df['yr_renovated'].fillna(0)

In [45]:
#some of the houses appear more than once, so we are trying to drop the repeated entries
#trial = copy_df.groupby(['id']).date.transform(max)
#trial_cleaned = copy_df[copy_df.id == trial]

In [46]:
#making the categorical variable
# Pull out the categorical columns that will be one-hot encoded.
cate_column = df_copy.loc[:, ['grade', 'view']]

In [47]:
# One-hot encode the two categorical predictors.
# drop='first' leaves out the first category of each feature, so that category
# becomes the implicit baseline for the remaining indicator columns.
column_name = ['grade', 'view']
ohe = OneHotEncoder(drop='first')

# Work on a copy so the encoder input is independent of cate_column.
features = cate_column.copy()
features2 = features.loc[:, column_name]

# Learn the categories, then encode; the result is a sparse matrix.
cate_ohe = ohe.fit(features2).transform(features2)

cate_ohe

<21597x15 sparse matrix of type '<class 'numpy.float64'>'
	with 41103 stored elements in Compressed Sparse Row format>

In [48]:
# Column labels for the one-hot matrix, in the encoder's output order.
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2, so rebuild the same "x<feature index>_<category>" labels from the fitted
# attributes instead (x0 = grade, x1 = view). This keeps the existing column names.
_dropped_idx = ohe.drop_idx_  # per-feature index of the dropped category; None if nothing is dropped
name = np.array(
    [
        f'x{feat_idx}_{category}'
        for feat_idx, categories in enumerate(ohe.categories_)
        for cat_idx, category in enumerate(categories)
        # skip the category the encoder dropped for this feature
        if _dropped_idx is None or _dropped_idx[feat_idx] != cat_idx
    ],
    dtype=object,
)

In [49]:
# Densify the sparse one-hot matrix and label its columns with the encoder's names.
cate_parsed = pd.DataFrame(data=cate_ohe.toarray(), columns=name)

In [50]:
cate_parsed
# Column prefixes are the encoder's positional feature names:
# x0 = grade
# x1 = view

Unnamed: 0,x0_11 Excellent,x0_12 Luxury,x0_13 Mansion,x0_3 Poor,x0_4 Low,x0_5 Fair,x0_6 Low Average,x0_7 Average,x0_8 Good,x0_9 Better,x1_EXCELLENT,x1_FAIR,x1_GOOD,x1_NO RECORD,x1_NONE
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
2,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00
21593,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00
21594,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
21595,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00


In [51]:
# Standardise the numeric modelling columns.
# sqft_lot and sqft_lot15 are dropped here, together with yr_renovated and yr_built.
# NOTE(review): sqft_living15 stays in the frame but is not listed in columns_name1,
# so it keeps its raw scale while the other columns are standardised - confirm intent.
ss = StandardScaler()

columns_name1 = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'floors', 'sqft_above']
numbers_scaling = df_2.drop(columns=['yr_renovated', 'sqft_lot15', 'sqft_lot', 'yr_built'])

features = numbers_scaling[columns_name1]

# Fit on the selected columns, then write the standardised values back in place of the originals.
scale1 = ss.fit(features)
scale2 = scale1.transform(features)
numbers_scaling[columns_name1] = scale2

numbers_scaling

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,sqft_above,sqft_living15
0,-0.87,-0.40,-1.45,-0.98,-0.92,-0.74,1340
1,-0.01,-0.40,0.17,0.53,0.94,0.46,1690
2,-0.98,-1.48,-1.45,-1.43,-0.92,-1.23,2720
3,0.17,0.68,1.15,-0.13,-0.92,-0.89,1360
4,-0.08,-0.40,-0.15,-0.44,-0.92,-0.13,1800
...,...,...,...,...,...,...,...
21592,-0.49,-0.40,0.50,-0.60,2.79,-0.31,1530
21593,-0.38,0.68,0.50,0.25,0.94,0.63,1830
21594,-0.38,-1.48,-1.78,-1.15,0.94,-0.93,1020
21595,-0.38,-0.40,0.50,-0.52,0.94,-0.23,1410


In [52]:
# Place the scaled numeric columns and the one-hot columns side by side (aligned on the index).
scaled_and_parse_df = pd.concat(objs=[numbers_scaling, cate_parsed], axis='columns')

In [53]:
# Check row count, dtypes and non-null counts of the scaled numeric frame before combining.
numbers_scaling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21597 non-null  float64
 1   bedrooms       21597 non-null  float64
 2   bathrooms      21597 non-null  float64
 3   sqft_living    21597 non-null  float64
 4   floors         21597 non-null  float64
 5   sqft_above     21597 non-null  float64
 6   sqft_living15  21597 non-null  int64  
dtypes: float64(6), int64(1)
memory usage: 1.2 MB


In [54]:
# Check row count, dtypes and non-null counts of the one-hot frame before combining.
cate_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   x0_11 Excellent   21597 non-null  float64
 1   x0_12 Luxury      21597 non-null  float64
 2   x0_13 Mansion     21597 non-null  float64
 3   x0_3 Poor         21597 non-null  float64
 4   x0_4 Low          21597 non-null  float64
 5   x0_5 Fair         21597 non-null  float64
 6   x0_6 Low Average  21597 non-null  float64
 7   x0_7 Average      21597 non-null  float64
 8   x0_8 Good         21597 non-null  float64
 9   x0_9 Better       21597 non-null  float64
 10  x1_EXCELLENT      21597 non-null  float64
 11  x1_FAIR           21597 non-null  float64
 12  x1_GOOD           21597 non-null  float64
 13  x1_NO RECORD      21597 non-null  float64
 14  x1_NONE           21597 non-null  float64
dtypes: float64(15)
memory usage: 2.5 MB


In [55]:
# Combined frame (scaled numeric columns + one-hot columns) used for our regression
# and simple linear model.
scaled_and_parse_df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,sqft_above,sqft_living15,x0_11 Excellent,x0_12 Luxury,x0_13 Mansion,x0_3 Poor,x0_4 Low,x0_5 Fair,x0_6 Low Average,x0_7 Average,x0_8 Good,x0_9 Better,x1_EXCELLENT,x1_FAIR,x1_GOOD,x1_NO RECORD,x1_NONE
0,-0.87,-0.40,-1.45,-0.98,-0.92,-0.74,1340,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1,-0.01,-0.40,0.17,0.53,0.94,0.46,1690,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
2,-0.98,-1.48,-1.45,-1.43,-0.92,-1.23,2720,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
3,0.17,0.68,1.15,-0.13,-0.92,-0.89,1360,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
4,-0.08,-0.40,-0.15,-0.44,-0.92,-0.13,1800,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,-0.49,-0.40,0.50,-0.60,2.79,-0.31,1530,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00
21593,-0.38,0.68,0.50,0.25,0.94,0.63,1830,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00
21594,-0.38,-1.48,-1.78,-1.15,0.94,-0.93,1020,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
21595,-0.38,-0.40,0.50,-0.52,0.94,-0.23,1410,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00


In [34]:
# Column dtypes and non-null counts of the combined modelling frame.
# NOTE(review): the output recorded under this cell carries an older execution count
# (In [34]) than its neighbours and reports 21596 entries with 21534 non-null values,
# which disagrees with the null check in the next cell (0). It looks stale; re-run
# the notebook from a fresh kernel to refresh it.
scaled_and_parse_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21596 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             21534 non-null  float64
 1   bedrooms          21534 non-null  float64
 2   bathrooms         21534 non-null  float64
 3   sqft_living       21534 non-null  float64
 4   floors            21534 non-null  float64
 5   sqft_above        21534 non-null  float64
 6   sqft_living15     21534 non-null  float64
 7   x0_11 Excellent   21534 non-null  float64
 8   x0_12 Luxury      21534 non-null  float64
 9   x0_13 Mansion     21534 non-null  float64
 10  x0_3 Poor         21534 non-null  float64
 11  x0_4 Low          21534 non-null  float64
 12  x0_5 Fair         21534 non-null  float64
 13  x0_6 Low Average  21534 non-null  float64
 14  x0_7 Average      21534 non-null  float64
 15  x0_8 Good         21534 non-null  float64
 16  x0_9 Better       21534 non-null  float6

In [56]:
# Total number of missing cells across the whole frame; 0 means it is clean enough
# to use for regression.
scaled_and_parse_df.isna().sum().sum()

0