In [1]:
import pandas as pd
import numpy as np
import sklearn
import pickle, joblib

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.metrics import mean_squared_error
from scipy import stats



In [2]:
import warnings
warnings.filterwarnings("ignore")

### 1. Trying building SVR models with kernels `linear` and `rbf` respectively, hypertuning <br><br>the params `C` and `gamma`.

In [3]:
## Loading the training featrues and labels

X_prepared = np.loadtxt("X_prepared.txt")
Y = pd.read_csv("Y.csv")

In [4]:
## Our baseline model

from sklearn.svm import SVR

svr_mod1 = SVR()
svr_mod1.fit(X_prepared, Y)

#### Cross-Validation

In [5]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(svr_mod1, X_prepared, Y, cv=5, scoring='neg_mean_squared_error')
mod1_scores = np.sqrt(-scores)

In [6]:
## A function to display Scores

def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard Deviation: ", scores.std())

In [7]:
## score of our baseline model

display_scores(mod1_scores)

Scores:  [118574.74777986 117059.80714396 118838.85950729 119130.42931938
 119589.76542706]
Mean:  118638.72183551211
Standard Deviation:  858.2982620717287


#### => far worse than our prior models. But let's give a try tuning the other hyperparams `kernels`, `C` and `gamma` anyway.
The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. l2 regularization penalizes the sum of squares of residuals.

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
## Hyperparam tuning of `C` using Grid Search

grid_param = [{"kernel": ["linear"], "C": [30, 100, 300, 1000]},
              {"kernel": ["rbf"], "C": [30, 100, 300, 1000], "gamma": [3, 10, 30]}]

grid_search = GridSearchCV(svr_mod1, grid_param, cv=5, scoring='neg_mean_squared_error', 
                       return_train_score=True, verbose=2)
grid_search.fit(X_prepared, Y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ................................C=30, kernel=linear; total time=   7.4s
[CV] END ................................C=30, kernel=linear; total time=   7.5s
[CV] END ................................C=30, kernel=linear; total time=  10.7s
[CV] END ................................C=30, kernel=linear; total time=   8.4s
[CV] END ................................C=30, kernel=linear; total time=   7.9s
[CV] END ...............................C=100, kernel=linear; total time=   9.3s
[CV] END ...............................C=100, kernel=linear; total time=   7.4s
[CV] END ...............................C=100, kernel=linear; total time=   8.8s
[CV] END ...............................C=100, kernel=linear; total time=   7.5s
[CV] END ...............................C=100, kernel=linear; total time=   8.1s
[CV] END ...............................C=300, kernel=linear; total time=   7.7s
[CV] END ...............................C=300, k

In [10]:
## best params
print("best params: ", grid_search.best_params_)

## best score
print("best score: ", np.sqrt(-grid_search.best_score_))

best params:  {'C': 1000, 'kernel': 'linear'}
best score:  70388.13264944841


#### => It has improved but still no where near our prior models.

In [14]:
## best estimator

final_mod = grid_search.best_estimator_
final_mod

In [15]:
## Accuracy

final_mod.score(X_prepared, Y)

0.631193991791648

#### => Not did any major good!

In [19]:
## saving this model anywayy
import joblib

joblib.dump(final_mod, open("svr_gridhypertuned.pkl", "wb"))

### 2. Adding a transformer in the preparation pipelline to select only the most important features.

In [71]:
## Importing a class contributing to the fullpipeline

from CombinedAttributeAdder import CombinedAttributesAdder2

In [72]:
## Loading the fullpipeline
import pickle

full_pipeline = pickle.load(open("transformation_pipeline.pkl", "rb"))
full_pipeline

### numpy.argpartition() 
Used to create an indirect partitioned copy of input array with its elements rearranged in such a way that the value of the element in k-th position is in the position it would be in a sorted array. All elements smaller than the k-th element are moved before this element and all equal or greater are moved behind it. **The ordering of the elements in the two partitions is undefined.** It returns an array of indices of the same shape as arr, i.e arr[index_array] yields a partition of arr.

In [4]:
## Loading the finalized model (Random Forest Regressor)

forest_reg = pickle.load(open("model.pkl", "rb"))
forest_reg

In [5]:
feature_importances = forest_reg.feature_importances_
feature_importances

array([9.80897713e-02, 8.32794269e-02, 4.47874551e-02, 2.07388807e-02,
       2.07810704e-02, 2.90845862e-02, 1.99604771e-02, 3.81541790e-01,
       4.96928799e-02, 3.63246328e-02, 4.75673840e-02, 8.56693405e-03,
       1.51601677e-01, 4.10338163e-05, 2.44638683e-03, 5.49561336e-03])

In [24]:
## Load raw features

X = pd.read_csv("X.csv")
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN


In [56]:
from sklearn.base import BaseEstimator, TransformerMixin

def top_k_indices(feature_importances, k):
    """
    Returns the top k indices contributing the most in feature_importances.
    """
    return np.sort(np.argpartition(np.array(feature_importances), -k)[-k:])

class TopFeaturesSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices = top_k_indices(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices]  

In [57]:
## Say we want only top 5 features, k=5
# let's fetch those indices

top5 = TopFeaturesSelector(feature_importances=feature_importances, k=5)
top5.get_params

<bound method BaseEstimator.get_params of TopFeaturesSelector(feature_importances=array([9.80897713e-02, 8.32794269e-02, 4.47874551e-02, 2.07388807e-02,
       2.07810704e-02, 2.90845862e-02, 1.99604771e-02, 3.81541790e-01,
       4.96928799e-02, 3.63246328e-02, 4.75673840e-02, 8.56693405e-03,
       1.51601677e-01, 4.10338163e-05, 2.44638683e-03, 5.49561336e-03]),
                    k=5)>

In [66]:
## resulting array after removal of mundane features

top5.fit_transform(X_prepared)

array([[-0.94135046,  1.34743822, -0.8936472 , -0.12112176,  1.        ],
       [ 1.17178212, -1.19243966,  1.292168  , -0.81086696,  0.        ],
       [ 0.26758118, -0.1259716 , -0.52543365, -0.33827252,  1.        ],
       ...,
       [-1.5707942 ,  1.31001828, -0.36547546,  0.32286937,  0.        ],
       [-1.56080303,  1.2492109 ,  0.16826095, -0.45702273,  0.        ],
       [-1.28105026,  2.02567448, -0.390569  , -0.12169672,  1.        ]])

### # Let's now create the new pipeline:

In [76]:
from sklearn.pipeline import Pipeline

k=5
preparation_and_feature_selection_pipeline = Pipeline([
    ("preparation", full_pipeline), 
    ("feature_selection", TopFeaturesSelector(feature_importances, k))], verbose=2)

In [77]:
preparation_and_feature_selection_pipeline

In [78]:
X_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(X)
X_prepared_top_k_features

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ..... (step 2 of 3) Processing attirbs_adder, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing std_scaler, total=   0.0s
[Pipeline] ....... (step 1 of 2) Processing preparation, total=   0.1s
[Pipeline] . (step 2 of 2) Processing feature_selection, total=   0.0s


array([[-0.94135046,  1.34743822, -0.8936472 , -0.12112176,  1.        ],
       [ 1.17178212, -1.19243966,  1.292168  , -0.81086696,  0.        ],
       [ 0.26758118, -0.1259716 , -0.52543365, -0.33827252,  1.        ],
       ...,
       [-1.5707942 ,  1.31001828, -0.36547546,  0.32286937,  0.        ],
       [-1.56080303,  1.2492109 ,  0.16826095, -0.45702273,  0.        ],
       [-1.28105026,  2.02567448, -0.390569  , -0.12169672,  1.        ]])

=> Amazing! :)

### 3. Creating a single pipeline that does the full data preparation plus the final prediction.

In [79]:
ls

 Volume in drive D is Local Disk 
 Volume Serial Number is 407D-3EE5

 Directory of D:\data-science\projects\california-housing-price-prediction\notebooks

13-11-2022  22:32    <DIR>          .
13-11-2022  22:32    <DIR>          ..
13-11-2022  13:14             3,283 .gitignore
13-11-2022  21:36    <DIR>          .ipynb_checkpoints
13-11-2022  17:47    <DIR>          __pycache__
13-11-2022  13:33             2,416 CombinedAttributeAdder.py
26-10-2022  22:13    <DIR>          datasets
11-11-2022  11:05         1,485,461 housing.csv
11-11-2022  11:06         7,958,527 in1__housing (analysis).ipynb
11-11-2022  18:37            62,274 in2__housing (preprocessing).ipynb
13-11-2022  12:58            62,792 in3__housing (selecting and training a model).ipynb
13-11-2022  12:28           436,975 in4__housing (fine-tuning the model).ipynb
13-11-2022  12:03            83,756 in5.1__housing (tuned SVR model via RandomizedSearchCV).ipynb
13-11-2022  22:32            75,100 in5__housing (experiment

In [83]:
## loading the model we've built via randomized search

mod_randomized = joblib.load(open("svr_randomized.pkl", "rb"))
mod_randomized

In [84]:
## best params

best_params = {"C":157055.10989448498, "gamma":0.26497040005002437}

### `prep_select_predict_pipeline`

In [95]:
from sklearn.svm import SVR

prep_select_predict_pipeline = Pipeline([
    ("preparation", full_pipeline),
    ("feature_selection", TopFeaturesSelector(feature_importances, k)),
    ("model_building", SVR(**best_params))
], verbose=3)
prep_select_predict_pipeline

In [96]:
## Let's predict

some_data = X[19:45]
some_labels = Y[19:45]

prep_select_predict_pipeline.fit(X, Y)

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ..... (step 2 of 3) Processing attirbs_adder, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing std_scaler, total=   0.0s
[Pipeline] ....... (step 1 of 3) Processing preparation, total=   0.1s
[Pipeline] . (step 2 of 3) Processing feature_selection, total=   0.0s
[Pipeline] .... (step 3 of 3) Processing model_building, total=  15.2s


In [100]:
print("pipeline's predictions: ", prep_select_predict_pipeline.predict(some_data))
print("true labels: ", np.array(some_labels).reshape(1, -1))

pipeline's predictions:  [194588.56536495 118761.17623744 393928.19455644  95667.71628708
 166913.93404877 101098.50213907  86256.53744097 292821.12410301
 141496.39741588 185678.78457218 219672.76417889 250319.0875312
 186644.58016205 237973.84482392 314189.76506519 259559.23961626
 369769.14956919  84999.82183833 242891.33424194  84091.53511525
 163985.61785801 273481.24855822 125278.06020114 217385.81367714
 231904.89911647 183327.64499413]
true labels:  [[194500. 163100. 331200.  92000. 153400. 139100.  85900. 238100. 173500.
  196100. 442900. 234600. 195400. 264100. 415800. 282300. 345200.  85000.
  279400.  95700. 191800. 225400. 163500. 150300. 217100. 157500.]]
