# Bagging Model

### Reading dataset

In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

tr_dataset = pd.read_csv('train_dataset.csv',index_col=[0])

In [2]:
tr_dataset.head()

Unnamed: 0,id,Clump Thickness,UofCSize,UofCShape,Marginal Adhesion,SECSize,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Outcome,Class
337,837082,2,1,1,1,2,1,3,1,1,2,2
226,1231706,8,4,6,3,3,1,4,3,1,2,2
361,636437,1,1,1,1,2,1,1,1,1,2,2
381,1168278,3,1,1,1,2,1,2,1,1,2,2
454,1181685,1,1,2,1,2,1,2,1,1,2,2


## Creating Dataframe

In [3]:
features = ['id', 'Clump Thickness', 'UofCSize', 'UofCShape', 'Marginal Adhesion', 'SECSize', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']

X_train = tr_dataset.loc[:, features]
y_train = tr_dataset.loc[:,'Class'].values

In [4]:
X_train.head()

Unnamed: 0,id,Clump Thickness,UofCSize,UofCShape,Marginal Adhesion,SECSize,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
337,837082,2,1,1,1,2,1,3,1,1
226,1231706,8,4,6,3,3,1,4,3,1
361,636437,1,1,1,1,2,1,1,1,1
381,1168278,3,1,1,1,2,1,2,1,1
454,1181685,1,1,2,1,2,1,2,1,1


In [5]:
y_train.shape

(478,)

## Hyperparameter Tuning

In [7]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [22]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 100, 250, 300, 350, 400, 500],
    'max_depth': [4, 6, 8, 10, 12, 14, 16, 18, 32, None] 
}
cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train, y_train.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': 4, 'n_estimators': 50}

0.971 (+/-0.008) for {'max_depth': 4, 'n_estimators': 5}
0.971 (+/-0.02) for {'max_depth': 4, 'n_estimators': 50}
0.969 (+/-0.019) for {'max_depth': 4, 'n_estimators': 100}
0.969 (+/-0.023) for {'max_depth': 4, 'n_estimators': 250}
0.967 (+/-0.016) for {'max_depth': 4, 'n_estimators': 300}
0.969 (+/-0.019) for {'max_depth': 4, 'n_estimators': 350}
0.969 (+/-0.023) for {'max_depth': 4, 'n_estimators': 400}
0.969 (+/-0.023) for {'max_depth': 4, 'n_estimators': 500}
0.971 (+/-0.016) for {'max_depth': 6, 'n_estimators': 5}
0.967 (+/-0.025) for {'max_depth': 6, 'n_estimators': 50}
0.969 (+/-0.019) for {'max_depth': 6, 'n_estimators': 100}
0.967 (+/-0.016) for {'max_depth': 6, 'n_estimators': 250}
0.967 (+/-0.016) for {'max_depth': 6, 'n_estimators': 300}
0.969 (+/-0.023) for {'max_depth': 6, 'n_estimators': 350}
0.969 (+/-0.023) for {'max_depth': 6, 'n_estimators': 400}
0.969 (+/-0.023) for {'max_depth': 6, 'n_estimators': 500}
0.95 (+/-0.0

## Checking models performance on Validation dataset

In [23]:
Val_df = pd.read_csv('validation_dataset.csv', index_col=[0])

In [24]:
Val_df.head()

Unnamed: 0,id,Clump Thickness,UofCSize,UofCShape,Marginal Adhesion,SECSize,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
668,466906,1,1,1,1,2,1,1,1,1,2
422,527337,4,1,1,1,2,1,1,1,1,2
426,677910,5,2,2,4,2,4,1,1,1,2
149,1190394,4,1,1,1,2,3,1,1,1,2
62,1116192,1,1,1,1,2,1,2,1,1,2


In [25]:
features = ['id', 'Clump Thickness', 'UofCSize', 'UofCShape', 'Marginal Adhesion', 'SECSize', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']

X_val = Val_df.loc[:, features]
y_val = Val_df.loc[:,'Class'].values

In [26]:
score = cv.score(X_val, y_val)
print(score)

0.9514563106796117


In [27]:
from sklearn import metrics
predictions = cv.predict(X_val)

In [28]:
print(metrics.classification_report(predictions,y_val))

              precision    recall  f1-score   support

           2       0.98      0.94      0.96        69
           4       0.89      0.97      0.93        34

    accuracy                           0.95       103
   macro avg       0.94      0.96      0.95       103
weighted avg       0.95      0.95      0.95       103



In [29]:
cv.best_estimator_

RandomForestClassifier(max_depth=4, n_estimators=50)

## Writing out the picked Model

In [16]:
joblib.dump(cv.best_estimator_, 'cancer_bag_model.pkl')

['cancer_bag_model.pkl']