### AIML: Decision Trees and Forest Regression, Classification (24th March 2022, Thursday)

### 1. REGRESSION

#### 1.1 Fetching DataSet for Regression

In [1]:
# Load the boston Dataset
import pandas as pd
from sklearn.datasets import load_boston

In [2]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    boston = load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
# data

In [3]:
data["MEDV"] = boston.target
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


Finding all Correlation values for features.

In [4]:
data.corr().round(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
CRIM,1.0,-0.2,0.41,-0.06,0.42,-0.22,0.35,-0.38,0.63,0.58,0.29,-0.39,0.46,-0.39
ZN,-0.2,1.0,-0.53,-0.04,-0.52,0.31,-0.57,0.66,-0.31,-0.31,-0.39,0.18,-0.41,0.36
INDUS,0.41,-0.53,1.0,0.06,0.76,-0.39,0.64,-0.71,0.6,0.72,0.38,-0.36,0.6,-0.48
CHAS,-0.06,-0.04,0.06,1.0,0.09,0.09,0.09,-0.1,-0.01,-0.04,-0.12,0.05,-0.05,0.18
NOX,0.42,-0.52,0.76,0.09,1.0,-0.3,0.73,-0.77,0.61,0.67,0.19,-0.38,0.59,-0.43
RM,-0.22,0.31,-0.39,0.09,-0.3,1.0,-0.24,0.21,-0.21,-0.29,-0.36,0.13,-0.61,0.7
AGE,0.35,-0.57,0.64,0.09,0.73,-0.24,1.0,-0.75,0.46,0.51,0.26,-0.27,0.6,-0.38
DIS,-0.38,0.66,-0.71,-0.1,-0.77,0.21,-0.75,1.0,-0.49,-0.53,-0.23,0.29,-0.5,0.25
RAD,0.63,-0.31,0.6,-0.01,0.61,-0.21,0.46,-0.49,1.0,0.91,0.46,-0.44,0.49,-0.38
TAX,0.58,-0.31,0.72,-0.04,0.67,-0.29,0.51,-0.53,0.91,1.0,0.46,-0.44,0.54,-0.47


 RM - ZN has the highest target with MEDV, so selecting RM & ZN for training the model

In [5]:
# Here, x corresponds to Train data, multiple features are being included now
x = data[['RM', 'ZN']]
# And y corresponds to Labels 
y = data['MEDV']

#### 1.2 Splitting the data

In [6]:
from sklearn.model_selection import train_test_split as tts


x_train, x_test, y_train, y_test = tts(x, y, test_size=.2)
y_train

122    20.5
394    12.7
489     7.0
131    19.6
426    10.2
       ... 
400     5.6
56     24.7
226    37.6
81     23.9
323    18.5
Name: MEDV, Length: 404, dtype: float64

#### 1.3 REGRESSORS

1.3.1 Decision Tree Regressor

In [7]:
# Creating the model by training it with the train data

from sklearn.tree import DecisionTreeRegressor # Importing the model from sklearn.trees


dTR = DecisionTreeRegressor(max_depth=5)       # Creating the model
dTR.fit(x_train, y_train)                      # Training the model

DecisionTreeRegressor(max_depth=5)

Getting Prediction results from test data

In [8]:
y_pred = dTR.predict(x_test)

f"Score: {dTR.score(x_test, y_test):.2%}"  # Checking score on test data

'Score: 45.40%'

Checking error in Predicted result by comparing it to Actual result by Mean Squared Error 

In [9]:
import numpy as np
from sklearn.metrics import mean_squared_error as mse

np.sqrt(mse(y_test, y_pred))  # Mean Squared Error


6.854634698094764

1.3.2 Random Forest Regressor

In [10]:
from sklearn.ensemble import RandomForestRegressor   # Importing the model from sklearn.ensemble


rFR = RandomForestRegressor()                        # Creating the model
rFR.fit(x_train, y_train)                            # Training the model

y_pred2 = rFR.predict(x_test)                        # Getting predictions
print(f"Score: {rFR.score(x_test, y_test):.2%}")     # Checking scores

np.sqrt(mse(y_test, y_pred2))                        # Mean Squared Error

Score: 50.58%


6.521337342989637

### 2. CLASSIFICATION

#### 2.1 Fetching DataSet for Classification

In [11]:
from sklearn.datasets import load_digits


digits = load_digits()

data = pd.DataFrame(digits.data, columns=digits.feature_names)
data

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0


Extracting x and y values

In [12]:
x = digits.data    # Pixels
y = digits.target  # Labels [Numbers]

#### 2.2 Splitting the data

In [13]:
x_train, x_test, y_train, y_test = tts(x, y, test_size=.25)
y_train

array([2, 9, 9, ..., 2, 2, 3])

#### 2.3 CLASSIFIERS

2.3.1 Decision Tree Classifier

In [14]:
from sklearn.tree import DecisionTreeClassifier


dTC = DecisionTreeClassifier(criterion='entropy')
dTC.fit(x_train, y_train)

y_pred3 = dTC.predict(x_test)
print(f"Score: {dTC.score(x_test, y_test):.2%}")

np.sqrt(mse(y_test, y_pred3))

Score: 84.00%


1.768238294649979

2.3.2 Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier


rFC = RandomForestClassifier()
rFC.fit(x_train, y_train)

y_pred4 = rFC.predict(x_test)
print(f"Score: {rFC.score(x_test, y_test):.2%}")

np.sqrt(mse(y_test, y_pred4))

Score: 98.44%


0.5868938953886337