In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
%matplotlib inline
# Load the housing data set from housing.csv (path is relative to the
# working directory). Later cells use the MEDV column as the target and the
# remaining columns as features.
data = pd.read_csv('housing.csv')

In [2]:
# Preview the first five rows to check that the columns loaded as expected.
data.head(n=5)

Unnamed: 0,RM,LSTAT,PTRATIO,MEDV
0,6.575,4.98,15.3,504000.0
1,6.421,9.14,17.8,453600.0
2,7.185,4.03,17.8,728700.0
3,6.998,2.94,18.7,701400.0
4,7.147,5.33,18.7,760200.0


In [3]:
# Three client homes to price, one row per client. Each row lists the values
# in feature-column order.
# The column labels are taken from `data` by dropping the MEDV target by name
# (rather than slicing off the last column), so they always match the feature
# columns used for training, regardless of where MEDV sits in the file.
client_data = pd.DataFrame(
    [[5, 17, 15], [4, 32, 22], [8, 3, 12]],
    columns=data.columns.drop('MEDV'),
)
client_data

Unnamed: 0,RM,LSTAT,PTRATIO
0,5,17,15
1,4,32,22
2,8,3,12


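- Use LGBMRegressor: evaluate with shuffled 5-fold cross-validation and average the fold models' predictions for the client data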

In [4]:
# LightGBM model evaluated with shuffled 5-fold cross-validation.
RANDOM = 1293  # single seed shared by the fold split and the model

# MEDV is the target; every other column is a feature. The client rows are
# the set we want price predictions for.
X_train = data.drop('MEDV', axis = 1)
X_test = client_data.copy()
y_train = data['MEDV'].copy()

# oof_pred holds, for each training row, the prediction from the fold model
# that was fitted without that row. y_pred accumulates the average of the
# fold models' predictions for the client rows.
oof_pred = np.zeros(X_train.shape[0])
y_pred = np.zeros(X_test.shape[0])
folds = KFold(n_splits= 5, shuffle=True, random_state=RANDOM)
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    train_x, train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
    valid_x, valid_y = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    # A fresh model per fold. random_state pins the booster to the same seed
    # as the split, so the scores stay reproducible even if a randomized
    # option (e.g. row or feature subsampling) is switched on later.
    reg = LGBMRegressor(n_estimators=50, num_leaves=8, objective='tweedie',
                        random_state=RANDOM)

    reg.fit(train_x, train_y)
    oof_pred[valid_idx] = reg.predict(valid_x)
    # Each fold contributes 1/n_splits of the final client prediction.
    y_pred += reg.predict(X_test) / folds.n_splits
    print('Fold {:2d} R2 score : {:.6f}'.format(n_fold + 1, r2_score(valid_y, oof_pred[valid_idx])))
# Overall score computed on the out-of-fold predictions for every training row.
print('Full R2 score {:.6f}'.format(r2_score(y_train, oof_pred)))

Fold  1 R2 score : 0.787244
Fold  2 R2 score : 0.771560
Fold  3 R2 score : 0.878091
Fold  4 R2 score : 0.821819
Fold  5 R2 score : 0.856832
Full R2 score 0.826891


In [5]:
# Report the fold-averaged prediction for each client, numbering clients from 1.
for client_no, price in enumerate(y_pred, start=1):
    message = "Predicted selling price for Client {}'s home: ${:,.2f}".format(client_no, price)
    print(message)

Predicted selling price for Client 1's home: $390,667.76
Predicted selling price for Client 2's home: $231,726.97
Predicted selling price for Client 3's home: $896,988.71


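- Use DecisionTreeRegressor: same shuffled 5-fold cross-validation, with the fold models' predictions averaged for the client data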

In [6]:
# Decision-tree model evaluated with shuffled 5-fold cross-validation.
RANDOM = 1293  # seed used for both the fold split and the tree

# Separate the MEDV target from the feature columns; the client rows are the
# set to predict.
y_train = data['MEDV'].copy()
X_train = data.drop(columns='MEDV')
X_test = client_data.copy()

# Out-of-fold predictions for the training rows, and the running fold-average
# of the client predictions.
oof_pred = np.zeros(len(X_train))
y_pred = np.zeros(len(X_test))

folds = KFold(n_splits=5, shuffle=True, random_state=RANDOM)
fold_report = 'Fold {:2d} R2 score : {:.6f}'
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    # Rows used for fitting and for validation in this fold.
    train_x = X_train.iloc[train_idx]
    train_y = y_train.iloc[train_idx]
    valid_x = X_train.iloc[valid_idx]
    valid_y = y_train.iloc[valid_idx]

    regressor = DecisionTreeRegressor(max_depth=6, random_state=RANDOM)
    regressor.fit(train_x, train_y)

    # Keep the held-out predictions and add this fold's share of the client
    # estimate.
    valid_pred = regressor.predict(valid_x)
    oof_pred[valid_idx] = valid_pred
    y_pred += regressor.predict(X_test) / folds.n_splits

    fold_r2 = r2_score(valid_y, valid_pred)
    print(fold_report.format(n_fold + 1, fold_r2))

# Overall score on the out-of-fold predictions for every training row.
full_r2 = r2_score(y_train, oof_pred)
print('Full R2 score {:.6f}'.format(full_r2))

Fold  1 R2 score : 0.724077
Fold  2 R2 score : 0.718925
Fold  3 R2 score : 0.784423
Fold  4 R2 score : 0.704583
Fold  5 R2 score : 0.774694
Full R2 score 0.743507


In [7]:
# Report the fold-averaged prediction for each client, numbering clients from 1.
for client_no, price in enumerate(y_pred, start=1):
    message = "Predicted selling price for Client {}'s home: ${:,.2f}".format(client_no, price)
    print(message)

Predicted selling price for Client 1's home: $412,595.33
Predicted selling price for Client 2's home: $241,789.17
Predicted selling price for Client 3's home: $948,270.00
