# Stacking Model

## Reading the dataset

In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

tr_dataset = pd.read_csv('train_dataset.csv',index_col=[0])

In [2]:
tr_dataset.head()

Unnamed: 0,id,Clump Thickness,UofCSize,UofCShape,Marginal Adhesion,SECSize,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Outcome,Class
337,837082,2,1,1,1,2,1,3,1,1,2,2
226,1231706,8,4,6,3,3,1,4,3,1,2,2
361,636437,1,1,1,1,2,1,1,1,1,2,2
381,1168278,3,1,1,1,2,1,2,1,1,2,2
454,1181685,1,1,2,1,2,1,2,1,1,2,2


## Creating Dataframe

In [9]:
features = ['id', 'Clump Thickness', 'UofCSize', 'UofCShape', 'Marginal Adhesion', 'SECSize', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']


X_train = tr_dataset.loc[:, features]
y_train = tr_dataset.loc[:,'Class'].values

## Hyperparameter Tuning

In [10]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [11]:
estimators = [('rf', RandomForestClassifier()),
              ('gb', GradientBoostingClassifier())]

sc = StackingClassifier(estimators=estimators)

In [12]:
parameters = {
    'gb__n_estimators': [50, 100],
    'rf__n_estimators': [50, 100],
    'final_estimator': [LogisticRegression(C=0.1, max_iter=1000),
                        LogisticRegression(C=1,max_iter = 1000),
                        LogisticRegression(C=10, max_iter=1000)],
    'passthrough': [True, False]
}
cv = GridSearchCV(sc, parameters, cv=5)
cv.fit(X_train, y_train.ravel())

print_results(cv)

BEST PARAMS: {'final_estimator': LogisticRegression(C=10, max_iter=1000), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 50}

0.713 (+/-0.256) for {'final_estimator': LogisticRegression(C=0.1, max_iter=1000), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 50}
0.713 (+/-0.256) for {'final_estimator': LogisticRegression(C=0.1, max_iter=1000), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 100}
0.962 (+/-0.022) for {'final_estimator': LogisticRegression(C=0.1, max_iter=1000), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 50}
0.958 (+/-0.023) for {'final_estimator': LogisticRegression(C=0.1, max_iter=1000), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 100}
0.713 (+/-0.256) for {'final_estimator': LogisticRegression(C=0.1, max_iter=1000), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 50}
0.713 (+/-0.256) for {'final_estimator': LogisticRegression(C=0.1, max_iter=1000), 'gb__n_

## Checking Models performance on Validation dataset

### Reading the validation dataset

In [13]:
Val_df = pd.read_csv('validation_dataset.csv', index_col=[0])

In [14]:
features = ['id', 'Clump Thickness', 'UofCSize', 'UofCShape', 'Marginal Adhesion', 'SECSize', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']

X_val = Val_df.loc[:, features]
y_val = Val_df.loc[:,'Class'].values

In [15]:
score = cv.score(X_val, y_val)
print(score)

0.9611650485436893


In [16]:
from sklearn import metrics
predictions = cv.predict(X_val)

In [17]:
print(metrics.classification_report(predictions,y_val))

              precision    recall  f1-score   support

           2       0.98      0.96      0.97        68
           4       0.92      0.97      0.94        35

    accuracy                           0.96       103
   macro avg       0.95      0.96      0.96       103
weighted avg       0.96      0.96      0.96       103



### Writing out the pickled model

In [18]:
joblib.dump(cv.best_estimator_, 'cancer_stk_model.pkl')

['cancer_stk_model.pkl']