### 17.9.3  - Boosting in Practice

In [25]:
 import pandas as pd
from path import Path
file_path = Path("ML_data01df_CC.csv")
ccml4_df = pd.read_csv(file_path)
ccml4_df

Unnamed: 0.1,Unnamed: 0,METAREA,COUNTY,AGE,COVIDUNAW,month_le,gender_Female,gender_Male,education_Bachelor's,education_Graduate or Professional Degree,...,education_Some College or Associate Degree,race_Asian,race_Black,race_Native American,race_White,hispanic_Hispanic,hispanic_Non-Hispanic,marital_status_Divorced,marital_status_Married,marital_status_Single
0,0,3122,0,54,1,5,0,1,1,0,...,0,0,0,0,1,0,1,0,0,1
1,1,3121,37067,51,1,5,1,0,0,0,...,1,0,0,0,1,0,1,1,0,0
2,2,3121,37067,49,1,5,0,1,1,0,...,0,0,0,0,1,0,1,0,0,1
3,3,1521,37119,61,1,5,0,1,0,0,...,0,0,0,0,1,0,1,0,1,0
4,4,3122,0,35,1,5,1,0,1,0,...,0,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19200,19200,2980,37191,30,1,24,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
19201,19201,480,37021,53,1,24,0,1,0,1,...,0,0,0,0,1,0,1,0,1,0
19202,19202,480,37021,45,1,24,1,0,1,0,...,0,0,0,0,1,0,1,0,1,0
19203,19203,2980,37191,24,1,24,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1


In [26]:
# Then, separate the feature columns from the target column.
X = ccml4_df.copy()
X = X.drop("COVIDUNAW", axis=1)
y = ccml4_df["COVIDUNAW"].values


In [27]:
# Next, split the dataset into training and testing sets. Again, the random_state argument is optional.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1)

In [28]:
# The data is scaled in the next step.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [29]:
# In the next step, a for loop is used to identify the learning rate that yields the best performance.
# Be sure to include the print statement.

from sklearn.ensemble import GradientBoostingClassifier
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_train_scaled, y_train)

   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))
 
    
    



Learning rate:  0.05
Accuracy score (training): 0.955
Accuracy score (validation): 0.955
Learning rate:  0.1
Accuracy score (training): 0.955
Accuracy score (validation): 0.955
Learning rate:  0.25
Accuracy score (training): 0.955
Accuracy score (validation): 0.954
Learning rate:  0.5
Accuracy score (training): 0.956
Accuracy score (validation): 0.954
Learning rate:  0.75
Accuracy score (training): 0.956
Accuracy score (validation): 0.953
Learning rate:  1
Accuracy score (training): 0.956
Accuracy score (validation): 0.953


In [30]:
# Using the learning_rate value obtained from the for loop, we instantiate a model, train it, then create predictions.
classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.5, max_features=5, max_depth=3, random_state=0)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)


In [31]:
# Having created predictions with the gradient boosted tree model, we can assess the model's performance. 
# This time, the accuracy_score() method is used.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")


Accuracy Score : 0.9541857559350271


In [32]:
# generate a confusion_matrix of the results.
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4580,7
Actual 1,213,2


In [33]:
# Finally, we can generate a classification report to evaluate the precision, recall, and F1 scores.
print("Classification Report")
print(classification_report(y_test, predictions))


Classification Report
              precision    recall  f1-score   support

           1       0.96      1.00      0.98      4587
           2       0.22      0.01      0.02       215

    accuracy                           0.95      4802
   macro avg       0.59      0.50      0.50      4802
weighted avg       0.92      0.95      0.93      4802



### 17.10.1 - Oversampling

In [34]:
X = ccml4_df.copy()
X = X.drop("COVIDUNAW", axis=1)
X.head()

Unnamed: 0.1,Unnamed: 0,METAREA,COUNTY,AGE,month_le,gender_Female,gender_Male,education_Bachelor's,education_Graduate or Professional Degree,education_High School or below,education_Some College or Associate Degree,race_Asian,race_Black,race_Native American,race_White,hispanic_Hispanic,hispanic_Non-Hispanic,marital_status_Divorced,marital_status_Married,marital_status_Single
0,0,3122,0,54,5,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1
1,1,3121,37067,51,5,1,0,0,0,0,1,0,0,0,1,0,1,1,0,0
2,2,3121,37067,49,5,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1
3,3,1521,37119,61,5,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0
4,4,3122,0,35,5,1,0,1,0,0,0,0,0,0,1,0,1,0,1,0


In [35]:
# Define the target set.
y = ccml4_df["COVIDUNAW"].values
y[:5]

array([1, 1, 1, 1, 1])

In [36]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report 

In [37]:
from collections import Counter

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
Counter(y_train)

Counter({1: 13766, 2: 637})

In [38]:
# Next, we randomly oversample the minority class with the imblearn library.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({1: 13766, 2: 13766})

In [39]:
# Verify that the minority class has been enlarged.



In [40]:
# With a resampled dataset, we can now carry out the familiar pattern of training a model, 
# making predictions, and evaluating the model's performance. 
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)
# For this example, we'll use a LogisticRegression model.







LogisticRegression(random_state=1)

In [41]:
# The model creates predictions. We then generate a confusion_matrix with the results.
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)


array([[2954, 1620],
       [  70,  158]])

In [42]:
# To assess the accuracy score of the model, we'll use the balanced_accuracy_score module.
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)


0.6694033400072108

In [43]:
# We'll use the classification_report_imbalanced to do so.
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))


                   pre       rec       spe        f1       geo       iba       sup

          1       0.98      0.65      0.69      0.78      0.67      0.45      4574
          2       0.09      0.69      0.65      0.16      0.67      0.45       228

avg / total       0.93      0.65      0.69      0.75      0.67      0.45      4802



In [44]:
# Note that the following code is contained in the same Jupyter Notebook as the random oversampling example, 
# and that we are using the same training data (X_train and y_train). We use the SMOTE module from the imblearn 
# library to oversample the minority class.
from imblearn.over_sampling import SMOTE
X_SMOTE, y_SMOTE = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)


In [45]:
Counter(y_SMOTE)

Counter({1: 13766, 2: 13766})

In [24]:
# We'll again train a LogisticRegression model, predict, then assess the accuracy and generate a confusion_matrix, 
# as shown in the following code blocks.
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_SMOTE, y_SMOTE)

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

confusion_matrix(y_test, y_pred)

print(classification_report_imbalanced(y_test, y_pred))





                   pre       rec       spe        f1       geo       iba       sup

          1       0.96      0.96      0.08      0.96      0.28      0.08      4587
          2       0.08      0.08      0.96      0.08      0.28      0.07       215

avg / total       0.92      0.92      0.12      0.92      0.28      0.08      4802



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
