In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np
import time as time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared

#### Load Preprocessed Datasets

In [None]:
# Directory holding the preprocessed CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = "/Users/ukannika/work/personal/machine-learning/datasets"

# Training features: three files, named after the imputer used to produce them.
# All files are read with header=None, so columns get integer labels.
housing_features_simple_imputer = pd.read_csv(
    DATA_DIR + "/housing_features_simple_imputer.csv", sep=",", header=None)

housing_features_iterative_imputer = pd.read_csv(
    DATA_DIR + "/housing_features_iterative_imputer.csv", sep=",", header=None)

housing_features_knn_imputer = pd.read_csv(
    DATA_DIR + "/housing_features_knn_imputer.csv", sep=",", header=None)

# Training labels.
housing_labels = pd.read_csv(DATA_DIR + "/housing_labels.csv", sep=",", header=None)

# Held-out test features and labels.
test_housing_features = pd.read_csv(DATA_DIR + "/test_housing_features.csv", sep=",", header=None)
test_housing_labels = pd.read_csv(DATA_DIR + "/test_housing_labels.csv", sep=",", header=None)

# Sanity check: print the shape of every loaded frame.
print("housing_features_simple_imputer %s " % (housing_features_simple_imputer.shape,))
print("housing_features_iterative_imputer %s " % (housing_features_iterative_imputer.shape,))
print("housing_features_knn_imputer %s " % (housing_features_knn_imputer.shape,))
print("housing_labels %s " % (housing_labels.shape,))


print("test_housing_features %s " % (test_housing_features.shape,))
print("test_housing_labels %s " % (test_housing_labels.shape,))

#### Linear Regression

In linear regression, the target value is expected to be a linear combination of the features.

$y(W, X) = XW + \epsilon$

Closed form solution for W

$W = (X^TX)^{-1}X^Ty$

**Cost Function** <br>
*MSE(Mean Squared Error)* <br>
*MAE(Mean Absolute Error)*


In [None]:
# Fit ordinary least squares on the simple-imputer training features and
# predict on the held-out test features.
linear_regression = LinearRegression().fit(housing_features_simple_imputer, housing_labels)
housing_predictions = linear_regression.predict(test_housing_features)

# Calculate Error
mse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=True)
rmse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=False)
mae = mean_absolute_error(y_true=test_housing_labels, y_pred=housing_predictions)

print("MSE : %0.3f " % mse)
print("RMSE : %0.3f " % rmse)
# Report the mean absolute error itself (this line previously printed rmse).
print("MAE : %0.3f " % mae)

In [None]:
# Same model as the simple-imputer cell, but trained on the iterative-imputer
# features. Rebinds `linear_regression` and `housing_predictions`.
linear_regression = LinearRegression().fit(housing_features_iterative_imputer, housing_labels)
housing_predictions = linear_regression.predict(test_housing_features)

# Calculate Error
mse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=True)
rmse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=False)
mae = mean_absolute_error(y_true=test_housing_labels, y_pred=housing_predictions)

print("MSE : %0.3f " % mse)
print("RMSE : %0.3f " % rmse)
# Report the mean absolute error itself (this line previously printed rmse).
print("MAE : %0.3f " % mae)

In [None]:
# Display the coefficients of the most recently fitted LinearRegression
# (when cells run in order, the model trained on the iterative-imputer features).
linear_regression.coef_

#### Regularization L1 and L2

L1 => Lasso (Sparsity) <br>
L2 => Ridge (Shrinks weights towards zero)

In [None]:
# For this example, we may not see any improvement from using Lasso/Ridge Regression.
# alpha is the regularization strength; it is tuned by grid search in a later cell.
ridge = Ridge(alpha=0.3, max_iter=10000).fit(housing_features_simple_imputer, housing_labels)
housing_predictions = ridge.predict(test_housing_features)

# Calculate Error
mse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=True)
rmse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=False)
mae = mean_absolute_error(y_true=test_housing_labels, y_pred=housing_predictions)

print("MSE : %0.3f " % mse)
print("RMSE : %0.3f " % rmse)
# Report the mean absolute error itself (this line previously printed rmse).
print("MAE : %0.3f " % mae)

In [None]:
# Fit an L1-regularized linear model on the simple-imputer features and
# predict on the held-out test features.
lasso = Lasso(alpha=0.3, max_iter=50000).fit(housing_features_simple_imputer, housing_labels)
housing_predictions = lasso.predict(test_housing_features)

# Calculate Error
mse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=True)
rmse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=False)
mae = mean_absolute_error(y_true=test_housing_labels, y_pred=housing_predictions)

print("MSE : %0.3f " % mse)
print("RMSE : %0.3f " % rmse)
# Report the mean absolute error itself (this line previously printed rmse).
print("MAE : %0.3f " % mae)

In [None]:
# construct the set of hyperparameters to tune
print("Tuning hyperparameters via grid search")
params = {"alpha": np.arange(0.1, 2.0, 0.2)}
# `ridge` comes from the Ridge cell above and serves as the estimator template.
grid = GridSearchCV(ridge, params)
start = time.time()
grid.fit(housing_features_simple_imputer, housing_labels)

# evaluate the best grid searched model on the testing data
print("Grid search took {:.2f} seconds".format(time.time() - start))
# No `scoring` is passed, so score() falls back to the regressor's own score,
# which is the coefficient of determination (R^2). It is not a classification
# accuracy and not a percentage, so it is reported as a plain R^2 value.
acc = grid.score(test_housing_features, test_housing_labels)

print("Grid search R^2 score: {:.4f}".format(acc))
print("Grid search best parameters: {}".format(grid.best_params_))

In [None]:
# Peek at the first prediction. `housing_predictions` is whatever the last
# executed prediction cell assigned; the grid-search cell above does not rebind it.
housing_predictions[:1]

#### K-NN Regression

In [None]:
# Base estimator; n_neighbors, leaf_size and p are overridden by the grid below.
knn_regressor = KNeighborsRegressor(n_neighbors=5, leaf_size=30, p=2, metric='minkowski',
                                    weights='uniform', algorithm='ball_tree')

print("Tuning hyperparameters via grid search")
params = {"n_neighbors": np.arange(5, 15, 2), "leaf_size": np.arange(10, 30, 2), "p" : [1, 2]}
grid = GridSearchCV(knn_regressor, params)
start = time.time()
grid.fit(housing_features_simple_imputer, housing_labels)

# evaluate the best grid searched model on the testing data
print("Grid search took {:.2f} seconds".format(time.time() - start))
# No `scoring` is passed, so score() falls back to the regressor's own score,
# which is the coefficient of determination (R^2). It is not a classification
# accuracy and not a percentage, so it is reported as a plain R^2 value.
acc = grid.score(test_housing_features, test_housing_labels)

print("Grid search R^2 score: {:.4f}".format(acc))
print("Grid search best parameters: {}".format(grid.best_params_))

In [None]:
# Fit K-NN with fixed hyperparameters.
# NOTE(review): n_neighbors=9 / leaf_size=26 presumably come from the grid search above; confirm.
knn_regressor = KNeighborsRegressor(n_neighbors=9, leaf_size=26, p=2, metric='minkowski',
                                    weights='uniform', algorithm='ball_tree').fit(housing_features_simple_imputer, housing_labels)

housing_predictions = knn_regressor.predict(test_housing_features)

# Calculate Error
mse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=True)
rmse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=False)
mae = mean_absolute_error(y_true=test_housing_labels, y_pred=housing_predictions)

print("MSE : %0.3f " % mse)
print("RMSE : %0.3f " % rmse)
# Report the mean absolute error itself (this line previously printed rmse).
print("MAE : %0.3f " % mae)

#### Stochastic Gradient Descent


In [None]:
# Linear model trained by stochastic gradient descent with an L2 penalty.
# early_stopping=True holds out validation_fraction of the training data and
# stops after n_iter_no_change epochs without improvement.
# NOTE(review): random_state=None, so results vary between runs; the
# 'squared_loss' name belongs to older scikit-learn releases, so confirm it
# against the installed version.
sgd_regressor = SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.01, max_iter=2000, verbose=0, 
                                        random_state=None, learning_rate='invscaling', 
                                        eta0=0.01, power_t=0.25, 
                                        early_stopping=True, validation_fraction=0.1,
                                        n_iter_no_change=5, warm_start=False,
                                        average=False)

# Labels are flattened with ravel() before fitting.
sgd_regressor = sgd_regressor.fit(housing_features_simple_imputer, housing_labels.values.ravel())

housing_predictions = sgd_regressor.predict(test_housing_features)

# Calculate Error
mse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=True)
rmse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=False)
mae = mean_absolute_error(y_true=test_housing_labels, y_pred=housing_predictions)

print("MSE : %0.3f " % mse)
print("RMSE : %0.3f " % rmse)
# Report the mean absolute error itself (this line previously printed rmse).
print("MAE : %0.3f " % mae)

#### Decision Trees

High Depth => High Variance => Bagging <br>
Low Depth  => High Bias     => Boosting <br>



In [None]:
# See how a decision tree grown to full depth (max_depth=None) overfits the training data.
# NOTE(review): some keyword values here (criterion='mse', min_impurity_split,
# presort) belong to older scikit-learn releases; confirm against the installed version.
decision_tree_regressor = DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None, 
                                  min_samples_split=2, min_samples_leaf=1, 
                                  min_weight_fraction_leaf=0.0, max_features=None, random_state=None, 
                                  max_leaf_nodes=None, min_impurity_decrease=0.0,
                                  min_impurity_split=None, presort='deprecated',
                                  ccp_alpha=0.0)

decision_tree_regressor = decision_tree_regressor.fit(housing_features_simple_imputer, housing_labels.values.ravel())


# Predict on the TRAINING features on purpose: the errors below are training
# errors, used to illustrate overfitting.
housing_predictions = decision_tree_regressor.predict(housing_features_simple_imputer)

# Calculate Error
mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions, squared=True)
rmse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions, squared=False)
mae = mean_absolute_error(y_true=housing_labels, y_pred=housing_predictions)

print("MSE : %0.3f " % mse)
print("RMSE : %0.3f " % rmse)
# Report the mean absolute error itself (this line previously printed rmse).
print("MAE : %0.3f " % mae)

#### Better Evaluation Using Cross-Validation

In [None]:
# 10-fold cross-validation of the decision tree on the training data.
# The scorer returns negated MSE, so negate it again before taking the root.
scores = cross_val_score(estimator=decision_tree_regressor, X=housing_features_simple_imputer, y=housing_labels, 
                cv=10, scoring="neg_mean_squared_error")
tree_rmse_scores = np.sqrt(-scores)
# %d truncates the mean RMSE to an integer.
print("Validation Score %d " %tree_rmse_scores.mean())


## Check prediction score
# Uses the tree fitted in the decision-tree cell above, this time on the test features.
housing_predictions = decision_tree_regressor.predict(test_housing_features)

# Calculate Error
mse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=True)
rmse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=False)
mae = mean_absolute_error(y_true=test_housing_labels, y_pred=housing_predictions)

print("MSE : %0.3f " % mse)
print("RMSE : %0.3f " % rmse)
# Report the mean absolute error itself (this line previously printed rmse).
print("MAE : %0.3f " % mae)

#### Bagging

In [None]:
# Bagging: a forest of 100 bootstrapped trees.
# oob_score=True is requested, but the out-of-bag score is not read in this cell.
# NOTE(review): random_state=None, so results vary between runs; some keyword
# values (criterion='mse', max_features='auto', min_impurity_split) belong to
# older scikit-learn releases, so confirm against the installed version.
random_forest_regressor = RandomForestRegressor(n_estimators=100, criterion='mse',
                                             max_depth=None, min_samples_split=5, min_samples_leaf=1, 
                                             min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
                                             min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, 
                                             oob_score=True, 
                                             n_jobs=None, random_state=None, verbose=0, 
                                             warm_start=False, 
                                             ccp_alpha=0.0, max_samples=None)

random_forest_regressor = random_forest_regressor.fit(housing_features_simple_imputer, housing_labels.values.ravel())


housing_predictions = random_forest_regressor.predict(test_housing_features)

# Calculate Error
mse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=True)
rmse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=False)
mae = mean_absolute_error(y_true=test_housing_labels, y_pred=housing_predictions)

print("MSE : %0.3f " % mse)
print("RMSE : %0.3f " % rmse)
# Report the mean absolute error itself (this line previously printed rmse).
print("MAE : %0.3f " % mae)

#### Boosting

In [None]:
# Boosting: 1000 depth-4 trees with learning rate 0.1.
# NOTE(review): random_state=None, so results vary between runs; some keyword
# values (loss='ls', min_impurity_split, presort) belong to older scikit-learn
# releases, so confirm against the installed version.
gradient_boosting_regressor = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=1000, subsample=1.0, 
                                      criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, 
                                      min_weight_fraction_leaf=0.0, max_depth=4, min_impurity_decrease=0.0,
                                      min_impurity_split=None, init=None, random_state=None, 
                                      max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, 
                                      warm_start=False, presort='deprecated', validation_fraction=0.1,
                                      n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

gradient_boosting_regressor = gradient_boosting_regressor.fit(housing_features_simple_imputer, housing_labels.values.ravel())


housing_predictions = gradient_boosting_regressor.predict(test_housing_features)

# Calculate Error
mse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=True)
rmse = mean_squared_error(y_true=test_housing_labels, y_pred=housing_predictions, squared=False)
mae = mean_absolute_error(y_true=test_housing_labels, y_pred=housing_predictions)

print("MSE : %0.3f " % mse)
print("RMSE : %0.3f " % rmse)
# Report the mean absolute error itself (this line previously printed rmse).
print("MAE : %0.3f " % mae)

#### Gaussian Process

In [None]:
# Kernel components. The per-kernel descriptions are the author's own labels.
# long term smooth rising trend
k1 = 66.0**2 * RBF(length_scale=67.0)
# seasonal component
k2 = (
    2.4**2 * RBF(length_scale=90.0)
    * ExpSineSquared(length_scale=1.3, periodicity=1.0)
)
# medium term irregularity
k3 = 0.66**2 * RationalQuadratic(length_scale=1.2, alpha=0.78)
# noise terms
k4 = (
    0.18**2 * RBF(length_scale=0.134)
    + WhiteKernel(noise_level=0.19**2)
)
kernel_gpml = k1 + k2 + k3 + k4

# NOTE(review): the composite `kernel_gpml` is built above, but only `k1` is
# passed to the regressor. Confirm whether that is intentional.
gaussian_process_regression = GaussianProcessRegressor(
    kernel=k1,
    alpha=0.001,
    optimizer='fmin_l_bfgs_b',
    n_restarts_optimizer=0,
    normalize_y=False,
    copy_X_train=True,
    random_state=None,
)

# Fit on the simple-imputer features with flattened labels.
gaussian_process_regression = gaussian_process_regression.fit(
    housing_features_simple_imputer,
    housing_labels.values.ravel(),
)

# Predict on the held-out features and show the result as the cell output.
housing_predictions = gaussian_process_regression.predict(test_housing_features)

housing_predictions

In [None]:
# Recompute test-set predictions with the fitted Gaussian process model.
# This repeats the predict call from the previous cell and displays nothing.
housing_predictions = gaussian_process_regression.predict(test_housing_features)