# Adaptive Boosting

### Importing Libraries

In [1]:
#Importing required libraries
import pandas as pd 
import numpy as np

# Suppress every Python warning for the rest of the session so that cell
# output stays uncluttered.
# NOTE(review): this also hides library deprecation/future warnings —
# consider narrowing the filter to specific warning categories.
import warnings
warnings.filterwarnings("ignore")

### Loading the dataset

In [2]:
# Load the pre-cleaned dataset from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data_cleaned.csv')

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


### Separating independent and dependent variables.

In [4]:
# Independent variables: every column except the target.
# 'Unnamed: 0' appears to be a saved row index (it mirrors the row number in
# the preview), not a passenger attribute, so it is excluded from the features.
# errors='ignore' keeps this cell working if the CSV is re-exported without it.
x = data.drop(columns=['Survived']).drop(columns=['Unnamed: 0'], errors='ignore')

# Dependent variable: the label the model should predict.
y = data['Survived']

### Creating the train and test dataset

In [5]:
#import the train-test split
from sklearn.model_selection import train_test_split

In [6]:
# Hold out a test set, stratified on the target so that both splits keep the
# same class balance. No test_size is given, so scikit-learn's default split
# applies; random_state pins the shuffle for repeatable results.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=101,
)

## Building an AdaBoost Model

In [7]:
#Importing AdaBoost Classifier 
from sklearn.ensemble import AdaBoostClassifier

In [8]:
# Baseline AdaBoost model with library-default hyperparameters; only the seed
# is fixed so that the run is repeatable.
clf = AdaBoostClassifier(
    random_state=96,
)

In [9]:
# Fit the baseline model on the training split (the fitted estimator is
# echoed as the cell output).
clf.fit(X=train_x, y=train_y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=96)

In [10]:
# Mean accuracy of the baseline model on the data it was trained on.
clf.score(X=train_x, y=train_y)

0.8413173652694611

In [11]:
# Mean accuracy of the baseline model on the held-out test split.
clf.score(X=test_x, y=test_y)

0.7982062780269058

# Hyperparameter Tuning

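1. **base_estimator:** The weak learner that is boosted. The default is a decision stump (a `DecisionTreeClassifier` with `max_depth=1`).
2. **n_estimators:** Maximum number of models to build; boosting stops early if a perfect fit is reached.
3. **learning_rate:** Weight applied to each classifier's contribution at every boosting iteration. Smaller values shrink each contribution and usually require more estimators.
4. **random_state:** Seed for the random number generator, so that the same results are produced on every run.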

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Boost a random forest instead of the default base learner, with more
# boosting rounds (100) and a much smaller learning rate (0.01).
# NOTE(review): recent scikit-learn releases renamed `base_estimator` to
# `estimator` — confirm against the installed version before upgrading.
clf = AdaBoostClassifier(
    base_estimator=RandomForestClassifier(random_state=101),
    n_estimators=100,
    learning_rate=0.01,
    random_state=96,
)

In [14]:
# Fit the reconfigured model on the training split (the fitted estimator is
# echoed as the cell output).
clf.fit(X=train_x, y=train_y)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=101, verbose=0, warm_start=False),
          learning_rate=0.01, n_estimators=100, random_state=96)

In [15]:
# Mean accuracy of the reconfigured model on the held-out test split.
clf.score(X=test_x, y=test_y)

0.7488789237668162