# Boosting with Python

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

In [None]:
# Load the diabetes data. 'Outcome' is the classification target and every
# remaining column is used as a feature.
df = pd.read_csv('diabetes.csv')
y = df['Outcome']
X = df.drop(columns='Outcome')
X_col = X.columns  # feature names, kept for later reference
df

In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# target and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# 1000 depth-1 trees with learning rate 1; report accuracy on the test split.
model = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=1,
    learning_rate=1,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# There is a trade-off between the number of trees and the learning rate: a smaller learning rate generally needs more trees.
# Never use test-set performance to tune hyperparameters! The test accuracy below is shown for illustrative purposes only.

In [None]:
# Same 1000 depth-1 trees, learning rate lowered to 0.1.
model = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=1,
    learning_rate=0.1,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

In [None]:
# Same 1000 depth-1 trees, learning rate lowered further to 0.01.
model = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=1,
    learning_rate=0.01,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

In [None]:
# Keep the 0.01 learning rate but grow five times as many trees (5000).
model = GradientBoostingClassifier(
    n_estimators=5000,
    max_depth=1,
    learning_rate=0.01,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Parameter Tuning

In [None]:
# Depth-1 gradient-boosted trees; the other two hyperparameters are tuned below.
model = GradientBoostingClassifier(max_depth=1)

# Candidate values: number of trees and learning rate
n_estimators = [100, 1000, 5000, 10000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}

# Exhaustive grid search: every combination is scored by 5-fold
# cross-validated accuracy on the training split only.
boost_grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Best combination found by the search
print(boost_grid.best_params_)

In [None]:
# Refit on the full training split with hand-picked settings (presumably the
# ones reported by the grid search above -- confirm against its output) and
# report accuracy on the held-out test split.
model = GradientBoostingClassifier(n_estimators=5000, max_depth=1, learning_rate=0.01)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Adaboost Classifier

In [None]:
# AdaBoost classifier created with the library's default settings; it is
# passed to GridSearchCV as `estimator` in the next cell.
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier()

In [None]:
# Candidate values: number of boosting rounds and learning rate
n_estimators = [50, 100, 1000, 5000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}

# Exhaustive grid search scored by 5-fold cross-validated accuracy
boost_grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Best combination found by the search
print(boost_grid.best_params_)

In [None]:
# Refit AdaBoost with hand-picked settings (presumably taken from the grid
# search above -- confirm) and report accuracy on the test split.
model = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.1,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Extreme Gradient Boosting (XGBoost): a popular library that implements gradient boosting efficiently and often provides better performance

In [None]:
# %pip install xgboost  # uncomment and run once if xgboost is not installed

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with default settings as the starting point
model = XGBClassifier()

# Candidate values: number of boosting rounds and learning rate
n_estimators = [50, 100, 1000, 5000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}

# Exhaustive grid search scored by 5-fold cross-validated accuracy
boost_grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Best combination found by the search
print(boost_grid.best_params_)

In [None]:
# Refit XGBoost with hand-picked settings (presumably taken from the grid
# search above -- confirm) and report accuracy on the test split.
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Regression with Gradient boosting

In [None]:
# Load the Hitters data and discard rows with missing values
df = pd.read_csv('Hitters_Data.csv').dropna()

# Response variable: natural log of Salary
y = np.log(df['Salary'])

# Indicator (dummy) columns for the three categorical variables
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])

# Numeric predictors: drop the response (Salary) and the raw categorical
# columns that were dummy-encoded above, then cast everything to float
X_ = df.drop(columns=['Salary', 'League', 'Division', 'NewLeague']).astype('float64')

# Feature set X: numeric predictors plus one indicator per categorical variable
X = pd.concat(
    [X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]],
    axis=1,
)


In [None]:
# Split data into training and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Standardize the features. The scaler is fit on the training split only, and
# the training mean/std are then applied to the test split. Refitting on the
# test data would leak test-set statistics and put the two splits on
# different scales.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Depth-1 gradient-boosted regression trees; the other two hyperparameters
# are tuned below.
model = GradientBoostingRegressor(max_depth=1)

# Candidate values: number of trees and learning rate
n_estimators = [100, 1000, 5000, 10000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}

# Exhaustive grid search scored by 5-fold cross-validated negative MSE
boost_grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Best combination found by the search
print(boost_grid.best_params_)

In [None]:
# Refit with hand-picked settings (presumably taken from the grid search
# above -- confirm) and report mean squared error on the test split.
model = GradientBoostingRegressor(
    n_estimators=1000,
    max_depth=1,
    learning_rate=0.01,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))

# Adaboost Regressor

In [None]:
# AdaBoost regressor created with the library's default settings; it is
# passed to GridSearchCV as `estimator` in the next cell.
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()

In [None]:
# Candidate values: number of boosting rounds and learning rate
n_estimators = [100, 1000, 5000, 10000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}

# Exhaustive grid search scored by 5-fold cross-validated negative MSE
boost_grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Best combination found by the search
print(boost_grid.best_params_)

In [None]:
# Refit AdaBoost with hand-picked settings (presumably taken from the grid
# search above -- confirm) and report mean squared error on the test split.
model = AdaBoostRegressor(
    n_estimators=5000,
    learning_rate=0.01,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))

# XGBoostRegressor

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with default settings as the starting point
model = XGBRegressor()

# Candidate values: number of boosting rounds and learning rate
n_estimators = [100, 1000, 5000, 10000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}

# Exhaustive grid search scored by 5-fold cross-validated negative MSE
boost_grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Best combination found by the search
print(boost_grid.best_params_)

In [None]:
# Refit XGBoost with hand-picked settings (presumably taken from the grid
# search above -- confirm) and report mean squared error on the test split.
model = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.1,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))