# Boosting Model

### Reading the Data

In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

tr_dataset = pd.read_csv('train_dataset.csv',index_col=[0])

In [2]:
tr_dataset.head()

Unnamed: 0,id,Clump Thickness,UofCSize,UofCShape,Marginal Adhesion,SECSize,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Outcome,Class
337,837082,2,1,1,1,2,1,3,1,1,2,2
226,1231706,8,4,6,3,3,1,4,3,1,2,2
361,636437,1,1,1,1,2,1,1,1,1,2,2
381,1168278,3,1,1,1,2,1,2,1,1,2,2
454,1181685,1,1,2,1,2,1,2,1,1,2,2


## Creating Dataframe

In [3]:
features = ['id', 'Clump Thickness', 'UofCSize', 'UofCShape', 'Marginal Adhesion', 'SECSize', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']


X_train = tr_dataset.loc[:, features]
y_train = tr_dataset.loc[:,'Class'].values

## Hyperparameter Tuning 

In [4]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [5]:
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [160],
    'max_depth': [1, 2],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 10, 100]
}
cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(X_train, y_train.ravel())

print_results(cv)

BEST PARAMS: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 160}

0.651 (+/-0.008) for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 160}
0.651 (+/-0.008) for {'learning_rate': 0.001, 'max_depth': 2, 'n_estimators': 160}
0.952 (+/-0.017) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 160}
0.96 (+/-0.028) for {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 160}
0.969 (+/-0.023) for {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 160}
0.962 (+/-0.021) for {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 160}
0.967 (+/-0.015) for {'learning_rate': 0.2, 'max_depth': 1, 'n_estimators': 160}
0.956 (+/-0.015) for {'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 160}
0.962 (+/-0.01) for {'learning_rate': 0.3, 'max_depth': 1, 'n_estimators': 160}
0.954 (+/-0.028) for {'learning_rate': 0.3, 'max_depth': 2, 'n_estimators': 160}
0.956 (+/-0.02) for {'learning_rate': 0.4, 'max_depth': 1, 'n_estimators': 160}
0.956 (+/-0.009) for {'learning_

## Checking Models Performance on Validation dataset

### Reading validation dataset

In [7]:
Val_df = pd.read_csv('validation_dataset.csv', index_col=[0])

In [8]:
features = ['id', 'Clump Thickness', 'UofCSize', 'UofCShape', 'Marginal Adhesion', 'SECSize', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']

X_val = Val_df.loc[:, features]
y_val = Val_df.loc[:,'Class'].values

In [10]:
score = cv.score(X_val, y_val)
print(score)

0.9514563106796117


In [11]:
from sklearn import metrics
predictions = cv.predict(X_val)

In [12]:
print(metrics.classification_report(predictions,y_val))

              precision    recall  f1-score   support

           2       0.98      0.94      0.96        69
           4       0.89      0.97      0.93        34

    accuracy                           0.95       103
   macro avg       0.94      0.96      0.95       103
weighted avg       0.95      0.95      0.95       103



In [13]:
cv.best_estimator_

GradientBoostingClassifier(max_depth=1, n_estimators=160)

## Writing out the picked Model

In [14]:
joblib.dump(cv.best_estimator_, 'cancer_boost_model.pkl')

['cancer_boost_model.pkl']