In [1]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [2]:
# Load the training CSV; the path is resolved relative to the working directory.
DATA_PATH = 'kc_house_data_train.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV -- confirm).
# errors='ignore' keeps this cell re-runnable: without it a second execution raises
# KeyError because the column has already been dropped.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Reduce the `date` column to its calendar year.
# Guard: once the column already holds integer years, pd.to_datetime would interpret
# them as nanoseconds since the epoch (every row would become 1970), so the parse is
# only performed while the column is still in its raw, non-integer form.
if not pd.api.types.is_integer_dtype(df['date']):
    # .dt.year is the vectorised equivalent of .apply(lambda x: x.year)
    df['date'] = pd.to_datetime(df['date']).dt.year

In [55]:
# Collapse yr_renovated into a 0/1 flag: 0 stays 0, every other value becomes 1.
# Vectorised comparison replaces the Python-level loop over the Series.
df['yr_renovated'] = df['yr_renovated'].ne(0).astype('int64')
# Combined size feature: living area plus lot area.
df['total_sqft'] = df['sqft_living'] + df['sqft_lot']

In [56]:
# Preview the first rows to confirm the transformed and newly added columns.
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,total_sqft
0,2591820310,2014,365000.0,4,2.25,2070,8893,2.0,0,0,...,2070,0,1986,0,98058,47.4388,-122.162,2390,7700,10963
1,7974200820,2014,865000.0,5,3.0,2900,6730,1.0,0,0,...,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283,9630
2,7701450110,2014,1038000.0,4,2.5,3770,10893,2.0,0,2,...,3770,0,1997,0,98006,47.5646,-122.129,3710,9685,14663
3,9522300010,2015,1490000.0,3,3.5,4560,14608,2.0,0,2,...,4560,0,1990,0,98034,47.6995,-122.228,4050,14226,19168
4,9510861140,2014,711000.0,3,2.5,2550,5376,2.0,0,0,...,2550,0,2004,0,98052,47.6647,-122.083,2250,4050,7926


In [57]:
# Target is the price column; `id` is excluded from the feature matrix as well.
TARGET = 'price'
X = df.drop(columns=[TARGET, 'id'])
y = df[TARGET]

In [58]:
# Preview the feature matrix (price and id removed).
X.head()

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,total_sqft
0,2014,4,2.25,2070,8893,2.0,0,0,4,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700,10963
1,2014,5,3.0,2900,6730,1.0,0,0,5,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283,9630
2,2014,4,2.5,3770,10893,2.0,0,2,3,11,3770,0,1997,0,98006,47.5646,-122.129,3710,9685,14663
3,2015,3,3.5,4560,14608,2.0,0,2,3,12,4560,0,1990,0,98034,47.6995,-122.228,4050,14226,19168
4,2014,3,2.5,2550,5376,2.0,0,0,3,9,2550,0,2004,0,98052,47.6647,-122.083,2250,4050,7926


In [59]:
# Hold out a test split; test_size=0.25 is train_test_split's default, spelled out here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [60]:
# Baseline that always predicts the training-set mean (DummyRegressor's default strategy).
dumb = DummyRegressor(strategy='mean')
dumb.fit(X_train, y_train)

DummyRegressor(constant=None, quantile=None, strategy='mean')

In [61]:
# Score of the mean-predicting baseline on the held-out split.
dumb.score(X_test, y_test)

-0.00012400941386658815

In [62]:
# Learn scaling statistics from the training split only, then apply them to both splits
# so no information from the test split leaks into the scaler.
ss = StandardScaler()
ss.fit(X_train)
ss_train = ss.transform(X_train)
ss_test = ss.transform(X_test)

In [63]:
# Wrap the scaled arrays back into DataFrames with the original column names.
# The original row index is passed through as well: without it the frames get a fresh
# 0..n-1 index and no longer line up with y_train / y_test in any index-based pandas
# operation (concat, join, boolean masks built from y).
ss_Train_df = pd.DataFrame(ss_train, columns=X_train.columns, index=X_train.index)
ss_Test_df = pd.DataFrame(ss_test, columns=X_test.columns, index=X_test.index)

In [64]:
# NOTE: despite the name `gbr`, this estimator is a random forest, not gradient boosting.
rf_params = dict(n_estimators=1000, max_depth=20, random_state=42)
gbr = RandomForestRegressor(**rf_params)
gbr.fit(ss_Train_df, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [65]:
# Held-out score. The DataFrame is passed (not the bare ndarray) because the model was
# fitted on ss_Train_df; keeping the same input type and column names avoids the
# feature-name mismatch warning in current scikit-learn. The values are identical.
gbr.score(ss_Test_df, y_test)

0.8811334973444642

In [66]:
# Training score, for comparison with the held-out score (overfitting check).
# Uses the same DataFrame the model was fitted on rather than the bare ndarray, so the
# input carries the feature names the estimator saw during fit.
gbr.score(ss_Train_df, y_train)

0.9822030602760982

In [69]:
def score_model(model, ss_test, y_test):
    """Summarise a fitted model's performance on one evaluation set.

    Parameters
    ----------
    model : fitted estimator exposing ``score`` and ``predict``.
    ss_test : feature matrix to evaluate on.
    y_test : true target values matching ``ss_test``.

    Returns
    -------
    dict
        ``'r^2'`` holds ``model.score(ss_test, y_test)`` and ``'rmse'`` holds the
        square root of the mean squared error of ``model.predict(ss_test)``.
    """
    r_squared = model.score(ss_test, y_test)
    predictions = model.predict(ss_test)
    # Square root of MSE puts the error back in the target's own units.
    rmse = mean_squared_error(y_test, predictions) ** .5
    return {'r^2': r_squared, 'rmse': rmse}

In [70]:
# Evaluate on the held-out split. The DataFrame is passed (not the bare ndarray) so the
# input matches what the model was fitted on, feature names included.
score_model(gbr, ss_Test_df, y_test)

{'r^2': 0.8811334973444642, 'rmse': 130510.78124556184}

In [71]:
# (importance, feature) pairs in ascending order of importance.
sorted(zip(gbr.feature_importances_, X_test.columns))

[(0.0013318926557813065, 'yr_renovated'),
 (0.0018566490984503966, 'floors'),
 (0.0020583240788356163, 'date'),
 (0.0033730688123078034, 'bedrooms'),
 (0.0035773574566581126, 'condition'),
 (0.005318376845871813, 'sqft_basement'),
 (0.008084952604190466, 'sqft_lot'),
 (0.009624585072368172, 'total_sqft'),
 (0.011614210633883815, 'sqft_lot15'),
 (0.012515029797943559, 'zipcode'),
 (0.016616671744819542, 'bathrooms'),
 (0.01783117073915456, 'view'),
 (0.018163832786720166, 'sqft_above'),
 (0.020817939442185174, 'yr_built'),
 (0.022816584975177877, 'waterfront'),
 (0.029866637958634815, 'sqft_living15'),
 (0.07072467484990588, 'long'),
 (0.16425904608690522, 'lat'),
 (0.21261941154765873, 'grade'),
 (0.3669295828125469, 'sqft_living')]

In [20]:
# Project the scaled features onto principal components. The PCA is fitted on the
# training split only and then applied unchanged to the test split.
N_COMPONENTS = 16
pc = PCA(n_components=N_COMPONENTS)
X_pc = pc.fit_transform(ss_train)
X_pc_test = pc.transform(ss_test)

In [21]:
# Refits the existing `gbr` estimator on the PCA components. From this point `gbr`
# expects PCA-transformed input, not the scaled feature frame used in the cells above.
gbr.fit(X=X_pc, y=y_train)
gbr.score(X=X_pc, y=y_train)

0.9752812596237954

In [22]:
# Held-out score for the model refit on the PCA components.
gbr.score(X_pc_test, y_test)

0.8210895649981456

In [23]:
# Share of total variance captured by each retained principal component.
pc.explained_variance_ratio_

array([0.32436408, 0.12189877, 0.10628885, 0.07808877, 0.07560026,
       0.05379846, 0.0444968 , 0.04098136, 0.03229548, 0.02813244,
       0.02533441, 0.01923445, 0.01839429, 0.01418555, 0.01238348,
       0.00452254])