In [1]:
import os
import pickle
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns



In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
## Loading the dataset

df = pd.read_csv('housing.csv')

 ## # Selecting and Training a model:

In [4]:
## laoding the prepared features and labels

X_prepared = np.loadtxt("X_prepared.txt")
Y = pd.read_csv('Y.csv')

In [5]:
## Going ahead w the LinearRegression first

from sklearn.linear_model import LinearRegression

In [6]:
### Linear Regression model:

mod1 = LinearRegression()
mod1.fit(X_prepared, Y)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

## CombinedAttributesAdder class

## An example of having all other extras attributes as hyperparameters

rooms_idx, bedrooms_idx, population_idx, households_idx = 3, 4, 5, 6

class CombinedAttributesAdder2(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True, add_rooms_per_household=True,
                add_population_per_household=True): # no *args or **kwargs
        
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.add_rooms_per_household = add_rooms_per_household
        self.add_population_per_household = add_population_per_household
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_idx] / X[:, rooms_idx]
            X = np.c_[X, bedrooms_per_room]
            
        if self.add_rooms_per_household:
            rooms_per_household = X[:, rooms_idx] / X[:, households_idx]
            X = np.c_[X, rooms_per_household]
        
        if self.add_population_per_household:
            population_per_household = X[:, bedrooms_idx] / X[:, rooms_idx]
            X = np.c_[X, population_per_household]
            
        return X

In [8]:
## loading our raw training features
X = pd.read_csv('X.csv')

## loading the transformation pipeline
full_pipeline = pickle.load(open('transformation_pipeline.pkl', 'rb'))

In [9]:
## Let's do prediction for some instances

some_data = X.iloc[:5]
some_labels = Y.iloc[:5]

# let's pass it through the transformation pipeline
some_data_prepared = full_pipeline.transform(some_data)

# predictions
mod1.predict(some_data_prepared)

array([[ 85806.2157108 ],
       [305390.62200336],
       [151996.97918871],
       [185908.83490813],
       [244568.86489392]])

In [10]:
some_labels

Unnamed: 0,median_house_value
0,72100.0
1,279600.0
2,82700.0
3,112500.0
4,238300.0


#### Evaluation:

In [11]:
## Evaluating the model via rmse metric

from sklearn.metrics import mean_squared_error

y_pred = mod1.predict(X_prepared)

mse = mean_squared_error(Y, y_pred)
rmse = np.sqrt(mse)
rmse

68635.27064635929

In [12]:
## Evaluating the model using r-squared metric

print("r-squared score (training): ", mod1.score(X_prepared, Y))

r-squared score (training):  0.648079515380494


In [13]:
### creating a dictionary to hold off all the scores of all the models to be trained

report = {"mod1 (Linear Regression)": {"r2": mod1.score(X_prepared, Y), "rmse": rmse}}
report

{'mod1 (Linear Regression)': {'r2': 0.648079515380494,
  'rmse': 68635.27064635929}}

In [14]:
## Saving the transformation pipeline

pickle.dump(full_pipeline, open('transformation_pipeline.pkl', 'wb'))

**=> Not that bad, but not that great!**

In [15]:
df.describe().loc[:, "median_house_value"]

count     20640.000000
mean     206855.816909
std      115395.615874
min       14999.000000
25%      119600.000000
50%      179700.000000
75%      264725.000000
max      500001.000000
Name: median_house_value, dtype: float64

As we can see, most of the districts' median_house_value lie bw **\$120,000** and **\$265,000**. Thus, prediction error of **\$68,635** ain't much satisfying. This concludes that **the features didn't provide enough information to make good predictions** or **it could be that our model ain't powerful enough**.

<br>Possible solutions:
* #### Selecting a more powerful model,
* #### Reduce the constraints (but we can't since we didn't apply the regularization in the first place),
* #### or To feed the training algo w better features.

### # Trying w more powerful model -- `Decision Tree Regressor`

In [16]:
## Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor

mod2 = DecisionTreeRegressor()
mod2.fit(X_prepared, Y)

In [17]:
## Evaluating the TreeRegressor on the training set

y_pred_tree = mod2.predict(X_prepared)
mse_tree = mean_squared_error(Y, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
rmse_tree

0.0

In [18]:
report["mod2 (DescisionTreeRegressor)"] = {"r2": mod2.score(X_prepared, Y), "rmse": rmse_tree}
report

{'mod1 (Linear Regression)': {'r2': 0.648079515380494,
  'rmse': 68635.27064635929},
 'mod2 (DescisionTreeRegressor)': {'r2': 1.0, 'rmse': 0.0}}

#### => Shamelessly overfitting the data!

### # Better Evaluation of the Decision Tree using  Cross-Validation:

Since it's outta the question, to touch the test_set until we're ready to launch a model we're confident about, thus we need to use part of the training_set itself for training as well as part of it for the model validation.

#### Let's split the data into some distinct subset called folds, say 10, and use on outta it for the validation purpose and other 9 for the training. 
#### => Cross validation here would mean to randomly pick 1 validation set out of the 10 and train the model on other 9 and repeat it until all folds have had been once a validation set.

In [19]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(mod2, X_prepared, Y, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

#### Note:
Scikit-Learns' cross-validation feature expects a **utility function (greater is better)** rather than a **cost function (lower is better)**, thus the scoring is actually the opposite of MSE.

In [20]:
## A fucntion to display Scores

def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard Deviation: ", scores.std())

In [21]:
## Decision tree scores

display_scores(tree_rmse_scores)

Scores:  [72407.80031886 68491.02730101 68931.17295712 70374.84635462
 69389.86999164 71697.73323972 72758.13131524 67694.81368192
 64086.09078811 73016.86071752]
Mean:  69884.83466657538
Standard Deviation:  2635.835186950802


In [22]:
report["mod2.1 (dt cv)"] = {"rmse": 69620.91130100285}
report

{'mod1 (Linear Regression)': {'r2': 0.648079515380494,
  'rmse': 68635.27064635929},
 'mod2 (DescisionTreeRegressor)': {'r2': 1.0, 'rmse': 0.0},
 'mod2.1 (dt cv)': {'rmse': 69620.91130100285}}

**The cross-validation allows to get not only an estimate of the performance of the model but also a measure of how precise this measure is i.e. standard deviation. Error is 69,620 with standard error of  2791.**

### # Let's do the CV for Linear Regression model too!

In [23]:
nmse = cross_val_score(mod1, X_prepared, Y, scoring="neg_mean_squared_error", cv=10)
linreg_scores = np.sqrt(-nmse)

In [24]:
## displaying the Linear Regressor scores

display_scores(linreg_scores)

Scores:  [71742.79930634 64123.15164868 67525.9485056  68669.59284419
 66229.30580385 72536.14551594 74027.50779795 68826.23855501
 66448.19745248 70145.70900636]
Mean:  69027.45964364048
Standard Deviation:  2947.7533607572213


In [25]:
report["mod1.1 (lr cv)"] = {"rmse": 69027.45964364048}
report

{'mod1 (Linear Regression)': {'r2': 0.648079515380494,
  'rmse': 68635.27064635929},
 'mod2 (DescisionTreeRegressor)': {'r2': 1.0, 'rmse': 0.0},
 'mod2.1 (dt cv)': {'rmse': 69620.91130100285},
 'mod1.1 (lr cv)': {'rmse': 69027.45964364048}}

#### => Means Decision tree in our case is overfitting so badly that it performs worse than the Linear Regressor.

### # Trying with even powerful model -- `Random Forest`

In [26]:
from sklearn.ensemble import RandomForestRegressor

mod3 = RandomForestRegressor()
mod3.fit(X_prepared, Y)

In [27]:
## Evaluation

y_pred = mod3.predict(X_prepared)
np.sqrt(mean_squared_error(Y, y_pred))

18419.39162737864

In [28]:
report["mod3 (RandomForestRegressor)"] = {"rmse": 18490.47999527855}
report

{'mod1 (Linear Regression)': {'r2': 0.648079515380494,
  'rmse': 68635.27064635929},
 'mod2 (DescisionTreeRegressor)': {'r2': 1.0, 'rmse': 0.0},
 'mod2.1 (dt cv)': {'rmse': 69620.91130100285},
 'mod1.1 (lr cv)': {'rmse': 69027.45964364048},
 'mod3 (RandomForestRegressor)': {'rmse': 18490.47999527855}}

In [29]:
## Cross Validation of Random Forests

scores = cross_val_score(mod3, X_prepared, Y, scoring="neg_mean_squared_error", cv=10)
rf_scores = np.sqrt(-scores)

In [30]:
## Displaying the cv scores of Random forests

display_scores(rf_scores)

Scores:  [50948.08142236 49439.43664076 46265.80598149 51130.12852477
 47811.26924179 50079.57151998 51283.79400152 48879.05921963
 47399.07154583 53435.40564286]
Mean:  49667.16237409885
Standard Deviation:  2039.4858726217683


#### ..much much better than the prior two approaches. However, badly overfitting on training set!

In [31]:
report["mod3.1 (rf cv)"]={"rmse": 49720.022020119024}
report

{'mod1 (Linear Regression)': {'r2': 0.648079515380494,
  'rmse': 68635.27064635929},
 'mod2 (DescisionTreeRegressor)': {'r2': 1.0, 'rmse': 0.0},
 'mod2.1 (dt cv)': {'rmse': 69620.91130100285},
 'mod1.1 (lr cv)': {'rmse': 69027.45964364048},
 'mod3 (RandomForestRegressor)': {'rmse': 18490.47999527855},
 'mod3.1 (rf cv)': {'rmse': 49720.022020119024}}

**=> Scores are much better than the previous two approaches but it should be noted that model is badly overfitting the training set than the validation ones. We gotta do something with our baseline model (like tweaking the Hyperparameters).**

### # Trying with even powerful model -- `SVM`

In [32]:
from sklearn.svm import SVR

#### SVR with `linear` kernel

In [33]:
mod4 = SVR(kernel='linear')
mod4.fit(X_prepared, Y)
mod4

In [34]:
## Evaluation

print(mod4.score(X_prepared, Y))
mean_squared_error(Y, mod4.predict(X_prepared))

0.08317903466039667


12272546605.39856

#### => like seriously!

In [35]:
## Cross Validation of SVR

scores = cross_val_score(mod4, X_prepared, Y, scoring="neg_mean_squared_error", cv=10)
svr_scores = np.sqrt(-scores)

In [36]:
display_scores(svr_scores)

Scores:  [110247.24644452 112547.38377657 106729.30309747 113273.00328007
 107427.45346862 115718.28356868 113000.34464361 111441.54332923
 112768.27643916 111566.54214715]
Mean:  111471.93801950678
Standard Deviation:  2584.7354757719027


In [37]:
## Cross Validation of SVR

scores = cross_val_score(mod4, X_prepared, Y, cv=10)

In [38]:
### displaying the r2 scores

display_scores(scores)

Scores:  [0.0708212  0.05631154 0.10307918 0.0643442  0.09337598 0.049326
 0.05836646 0.06493624 0.06782058 0.07785953]
Mean:  0.07062409038114995
Standard Deviation:  0.015848428309874703


#### => not gonna do any good, rejected!

#### SVR with `poly` kernel

In [39]:
mod5 = SVR(kernel='poly')
mod5.fit(X_prepared, Y)
mod5

In [40]:
## Evaluation

mod5.score(X_prepared, Y)

-0.046419034271956594

#### => SVR w `poly` kernel rejected!

#### SVR with `rbf` kernel

In [41]:
mod6 = SVR(kernel='rbf')
mod6.fit(X_prepared, Y)
mod6

In [42]:
## Evaluation

mod6.score(X_prepared, Y)

-0.05042656579228266

#### => goes without saying, rejected!

#### SVR with `sigmoid` kernel

In [43]:
mod7 = SVR(kernel='sigmoid')
mod7.fit(X_prepared, Y)
mod7

In [44]:
## Evaluation

mod7.score(X_prepared, Y)

-0.044969058536048045

#### => rejected!

### => And hence, SVM didn't workout so well for us.

In [45]:
### Saving the models locally

import pickle

pickle.dump(mod1, open("LinearRegressor.pkl", 'wb'))
pickle.dump(mod1, open("DecisionTreeRegressor.pkl", 'wb'))
pickle.dump(mod1, open("RandomForestRegressor.pkl", 'wb'))

In [46]:
report

{'mod1 (Linear Regression)': {'r2': 0.648079515380494,
  'rmse': 68635.27064635929},
 'mod2 (DescisionTreeRegressor)': {'r2': 1.0, 'rmse': 0.0},
 'mod2.1 (dt cv)': {'rmse': 69620.91130100285},
 'mod1.1 (lr cv)': {'rmse': 69027.45964364048},
 'mod3 (RandomForestRegressor)': {'rmse': 18490.47999527855},
 'mod3.1 (rf cv)': {'rmse': 49720.022020119024}}

In [49]:
### Saving the scores report in json format

import json

json.dump(report, open("scores_report.json", "w"), indent=4)