In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
X,y = load_breast_cancer(return_X_y = True)

In [3]:
print("X.shape:", X.shape, "y.shape:", y.shape)

X.shape: (569, 30) y.shape: (569,)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [5]:
print({"X_train.shape":X_train.shape, 
       "X_test.shape":X_test.shape,
       "y_train.shape":y_train.shape,
       "y_test.shape":y_test.shape
      })

{'X_train.shape': (512, 30), 'X_test.shape': (57, 30), 'y_train.shape': (512,), 'y_test.shape': (57,)}


In [6]:
# split_count = int(X_train.shape[0]*0.10)
# X_val = X_train[:split_count]
# y_val = y[:split_count]
# print({"X_val.shape":X_val.shape, 
#        "y_val.shape":y_val.shape
#       })

### Let's test !!!

In [7]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier # Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)# Train the model on training data
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=42)

In [8]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)# Calculate the absolute errors

In [9]:
# Import library for metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Mean absolute error (MAE)
mae = mean_absolute_error(y_test.ravel(), predictions)

# Mean squared error (MSE)
mse = mean_squared_error(y_test.ravel(), predictions)

# R-squared scores
r2 = r2_score(y_test.ravel(), predictions)

# Print metrics
print('Mean Absolute Error:', round(mae, 2))
print('Mean Squared Error:', round(mse, 2))
print('R-squared scores:', round(r2, 2))

Mean Absolute Error: 0.04
Mean Squared Error: 0.04
R-squared scores: 0.83


### Let's fine tuning

In [13]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Find the best parameters for the model
parameters = {
    'max_depth': [10, 20, 30,40, 50, 60, 70, 80, 90, 100],
    'n_estimators': [100, 200, 300, 400, 500, 1000]
}
gridforest = GridSearchCV(rf, parameters, cv = 3, n_jobs = -1, verbose = 1)
gridforest.fit(X_train, y_train)
gridforest.best_params_

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   15.9s finished


{'max_depth': 10, 'n_estimators': 200}

In [16]:
rf = RandomForestClassifier(n_estimators = gridforest.best_params_["n_estimators"], 
                            max_depth= gridforest.best_params_["max_depth"], 
                            n_jobs=-1, 
                            random_state = 42)# Train the model on training data
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)# Calculate the absolute errors

In [17]:
# Mean absolute error (MAE)
mae = mean_absolute_error(y_test.ravel(), predictions)

# Mean squared error (MSE)
mse = mean_squared_error(y_test.ravel(), predictions)

# R-squared scores
r2 = r2_score(y_test.ravel(), predictions)

# Print metrics
print('Mean Absolute Error:', round(mae, 2))
print('Mean Squared Error:', round(mse, 2))
print('R-squared scores:', round(r2, 2))

Mean Absolute Error: 0.04
Mean Squared Error: 0.04
R-squared scores: 0.83
