In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from sklearn import datasets 


In [2]:
boston = datasets.load_boston()
data = pd.DataFrame(boston.data,columns=boston.feature_names)
targets = boston.target

In [3]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
data['Price']=boston.target
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
y=data['Price']
X = data.drop(['Price'],axis=1)
print(X.shape,y.shape)

(506, 13) (506,)


In [6]:
scaler =StandardScaler()

X_scaled= scaler.fit_transform(X)

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 100)

In [15]:
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor(random_state=6)
rf.fit(X_train,y_train)

print("Training Score: ",rf.score(X_train,y_train))
print("Testing Score",rf.score(X_test,y_test))

Training Score:  0.9661739347045992
Testing Score 0.8935278309174224




### Hyper Parameter Tuning With Grid Search CV

In [37]:
grid_param = {
    "n_estimators" : [90,100,115,130],
    'criterion': ['mse', 'mae'],
    'max_depth' : range(2,50,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'max_features' : ['auto','log2']
}

In [40]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=rf,param_grid=grid_param,cv=5,n_jobs =-1,verbose = 3)
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 12672 candidates, totalling 63360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:   49.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2040 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 3864 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 4600 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 5400 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 6264 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | e

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=10, n_jobs=None,
                                             oob_score=False, random_state=6,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'], 'max_depth'

In [41]:
grid_search.best_params_

{'criterion': 'mae',
 'max_depth': 10,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 90}

### Model Building with Train Test Split

In [232]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 80)

In [235]:
rf=RandomForestRegressor(criterion= 'mae',
 max_depth= 10,
 max_features= 'log2',
 min_samples_leaf= 2,
 min_samples_split= 2,
 n_estimators= 70)
rf.fit(X_train,y_train)

print("Train Score: ",rf.score(X_train,y_train))
print("Test Score: ",rf.score(X_test,y_test))

Train Score:  0.9466503921479957
Test Score:  0.9048502653314093


### Model Building with K fold Cross Validation

In [33]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
KFold

cv = KFold(n_splits=5)            # Desired number of Cross Validation folds
accuracies = list()
max_attributes = 20
depth_range = range(1, max_attributes + 1)

for depth in depth_range:
    fold_accuracy = []
    tree_model = RandomForestRegressor(max_depth = depth)
    
    for train_fold, valid_fold in cv.split(data):
        

        model = tree_model.fit(X_scaled , y ) # We fit the model with the fold train data
        valid_acc = model.score(X_scaled , y )# We calculate accuracy with the fold validation data
        fold_accuracy.append(valid_acc)

    avg = sum(fold_accuracy)/len(fold_accuracy)
    accuracies.append(avg)
    # print("Accuracy per fold: ", fold_accuracy, "\n")
    # print("Average accuracy: ", avg)
    # print("\n")
    
# Just to show results conveniently
df = pd.DataFrame({"Max Depth": depth_range, "Average Accuracy": accuracies})
df = df[["Max Depth", "Average Accuracy"]]
print(df.to_string(index=False))



 Max Depth  Average Accuracy
         1          0.585006
         2          0.753729
         3          0.860273
         4          0.904520
         5          0.930054
         6          0.946736
         7          0.954573
         8          0.962410
         9          0.967740
        10          0.971213
        11          0.971854
        12          0.974013
        13          0.973139
        14          0.976632
        15          0.973172
        16          0.973269
        17          0.975939
        18          0.976135
        19          0.975318
        20          0.975569


In [50]:
rand=RandomForestRegressor(criterion= 'mae',
 max_depth= 16,
 max_features= 'log2',
 min_samples_leaf= 1,
 min_samples_split= 2,
 n_estimators= 70,oob_score=True)

rand.fit(X_train,y_train)

acc_RandomForest_train = round(rand.score(X_train, y_train) * 100, 2)
acc_RandomForest_test = round(rand.score(X_test, y_test) * 100, 2)
print ("Train Accuracy : ",acc_RandomForest_train)
print ("Test Accuracy : ",acc_RandomForest_test)


Train Accuracy :  97.88
Test Accuracy :  88.18


This Model is Overfitted. Previous Model was better