# Principal Component Ananlysis and Gradient Boosting Classification
As we are provided with a high dimensional data, we perform Principal Component Analysis for Dimensionality Reduction, retaining only the necessary features for classifying data and discard the redundant features. We then feed in the results of this model to Gradient Boosting Classifier, which operates on several weak learning models, to generate a stronger model by rectifying errors in the weaker models. We further improve this approach by Hyparameter tuning, and tune the parameters of Gradient Boosting Classifier to obtain the best results. 

In [2]:
import numpy as np
import pandas as pd
import csv
from sklearn.ensemble import GradientBoostingClassifier
from bayes_opt import BayesianOptimization
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
print('done')

done


In [3]:
df= pd.read_csv('Insurance_Train.csv')
df1= pd.read_csv('Insurance_Test.csv')
df.head(5), df.shape , df1.head(5), df1.shape

(      id  age          job  marital  education  balance housing loan  \
 0  98749   32       admin.   single  secondary       64     yes   no   
 1  19550   45  blue-collar  married  secondary      534      no   no   
 2  75084   45   technician  married  secondary     1477     yes   no   
 3  65715   39   technician  married   tertiary       14      no   no   
 4  41412   49  blue-collar   single    unknown     2222      no   no   
 
     contact month  duration  campaign  pdays  previous poutcome  y  
 0   unknown   may       202         2     -1         0  unknown  0  
 1  cellular   aug       104         6     -1         0  unknown  0  
 2  cellular   nov        75         1    132         1  failure  0  
 3  cellular   jan       114         2     -1         0  unknown  0  
 4   unknown   jun       114         2     -1         0  unknown  0  ,
 (40689, 16),
      id  age          job  marital  education  balance housing loan   contact  \
 0  5149   42       admin.   single  second

In [4]:
df_x_train= df.iloc[:,0:15]
df_y_train= df.iloc[:,15]
df_x_train

Unnamed: 0,id,age,job,marital,education,balance,housing,loan,contact,month,duration,campaign,pdays,previous,poutcome
0,98749,32,admin.,single,secondary,64,yes,no,unknown,may,202,2,-1,0,unknown
1,19550,45,blue-collar,married,secondary,534,no,no,cellular,aug,104,6,-1,0,unknown
2,75084,45,technician,married,secondary,1477,yes,no,cellular,nov,75,1,132,1,failure
3,65715,39,technician,married,tertiary,14,no,no,cellular,jan,114,2,-1,0,unknown
4,41412,49,blue-collar,single,unknown,2222,no,no,unknown,jun,114,2,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40684,24953,29,technician,single,secondary,3313,yes,no,unknown,jun,18,3,-1,0,unknown
40685,34002,59,admin.,married,secondary,92,yes,no,cellular,may,139,2,350,1,failure
40686,76035,54,blue-collar,married,secondary,548,no,no,cellular,aug,520,7,-1,0,unknown
40687,61279,46,blue-collar,married,primary,258,yes,no,unknown,may,217,1,-1,0,unknown


In [5]:
df_encoded= pd.get_dummies(df_x_train)
df1_encoded= pd.get_dummies(df1)
df_encoded.head(2), df.shape , df1_encoded.head(2), df1.shape

(      id  age  balance  duration  campaign  pdays  previous  job_admin.  \
 0  98749   32       64       202         2     -1         0           1   
 1  19550   45      534       104         6     -1         0           0   
 
    job_blue-collar  job_entrepreneur  ...  month_jun  month_mar  month_may  \
 0                0                 0  ...          0          0          1   
 1                1                 0  ...          0          0          0   
 
    month_nov  month_oct  month_sep  poutcome_failure  poutcome_other  \
 0          0          0          0                 0               0   
 1          0          0          0                 0               0   
 
    poutcome_success  poutcome_unknown  
 0                 0                 1  
 1                 0                 1  
 
 [2 rows x 49 columns],
 (40689, 16),
      id  age  balance  duration  campaign  pdays  previous  job_admin.  \
 0  5149   42      734       332         2    317         3           1 

In [6]:
data = df_encoded.astype(float)
data_y = df_y_train.astype(float)
data_test = df1_encoded.astype(float)
type(data) , data.shape , data.head(2), type(data_test), data_test.shape , data_test.head(2), data_y

(pandas.core.frame.DataFrame,
 (40689, 49),
         id   age  balance  duration  campaign  pdays  previous  job_admin.  \
 0  98749.0  32.0     64.0     202.0       2.0   -1.0       0.0         1.0   
 1  19550.0  45.0    534.0     104.0       6.0   -1.0       0.0         0.0   
 
    job_blue-collar  job_entrepreneur  ...  month_jun  month_mar  month_may  \
 0              0.0               0.0  ...        0.0        0.0        1.0   
 1              1.0               0.0  ...        0.0        0.0        0.0   
 
    month_nov  month_oct  month_sep  poutcome_failure  poutcome_other  \
 0        0.0        0.0        0.0               0.0             0.0   
 1        0.0        0.0        0.0               0.0             0.0   
 
    poutcome_success  poutcome_unknown  
 0               0.0               1.0  
 1               0.0               1.0  
 
 [2 rows x 49 columns],
 pandas.core.frame.DataFrame,
 (4522, 49),
        id   age  balance  duration  campaign  pdays  previous  j

In [7]:
data_train= data.values
data_train_y= data_y.values
data_test = data_test.values
data_train.shape , data_test.shape

((40689, 49), (4522, 49))

In [8]:
x_train , y_train = data_train[:,1:], data_train_y
x_test = data_test[:,1:]
x_train.shape, y_train.shape, x_test.shape, x_train[:2], y_train, x_test[:2]

((40689, 48),
 (40689,),
 (4522, 48),
 array([[ 32.,  64., 202.,   2.,  -1.,   0.,   1.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,
           1.,   0.,   0.,   0.,   1.,   1.,   0.,   0.,   0.,   1.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,
           0.,   0.,   0.,   1.],
        [ 45., 534., 104.,   6.,  -1.,   0.,   0.,   1.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,
           1.,   0.,   0.,   1.,   0.,   1.,   0.,   1.,   0.,   0.,   0.,
           1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   1.]]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([[ 42., 734., 332.,   2., 317.,   3.,   1.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,
           1.,   0.,   0.,   0.,   1.,   1.,   0.,   1.,   0.,   0.,   1.,
           0.,   0.,   0.,  

# Principal Component Analysis
Principal Component Analysis involves the orthogonal projection of data onto a lower dimensional linear subspace, such that variance of the projected data is maximized. Mathematical approaches to maximize variance, leads us to the conclusion that the data needs to be projected onto the, say, top-k eigenvectors corresponding to top-k largest eigenvalues of the data covariance matrix. We find that this lower dimensional representation of data is sufficient to accurately classify data, and the rest of the features can be safely discarded, thus removing reduntancy and improving efficiency. 

In [9]:
pca = PCA(n_components=20)

pca.fit(data_train)

train_pca = pca.transform(data_train)
train_pca.shape

(40689, 20)

In [10]:
pca = PCA(n_components=20)

pca.fit(data_test)

test_pca = pca.transform(data_test)
test_pca.shape

(4522, 20)

# Gradient Boosting Classifier 
The Principal idea behind this algorithm is to build models sequentially, where the subsequent models try to reduce the errors of the previous models. 
To understand it better, say we feed in the observations to an initial model assuming all observations have equal weights or probabilities. This model is bound to have less accuracy and a lot of misclassifications due to the assumption that all features have equal probabilities. Now, based on these misclassifications, we feed in the observation to a second model with updated weights; i.e. we increase the weights of wrongely classified observations and decrease the weights of correctly classified ones. The same procedure is repeated till we minimize error to get a model with higher accuracy. 


The loss function for this algorithm is given as: \\
\begin{align}
L= -Σ_{i=1}^n [y_ilog(p) + (1-y_i)log(1-p)] 
\end{align}
\begin{align}
L= -[y^*log(p) + (1-y^*)log(1-p)] 
\end{align}
\begin{align}
L= -y^*log(odds) + log(1-e^{log(odds)}) 
\end{align}
Now, differentiating w.r.t. *log(odds)* : \\
\begin{align}
\frac{dL}{d[log(odds)]}= -y + p = Prediction - Observation
\end{align}
\begin{align}
Residual= Observation - Prediction 
\end{align}
\
A decision tree is constructed to forecast the estimated residuals, where the value of each leaf is updated as:
\begin{align}
γ = \frac{Σ_{i=1}^n Residual_i}{Σ_{i=1}^nPreviousProbability_i*(1-PreviousProbability_i)}
\end{align}
\
Then,the log forecast for each training set instance is obtained and transformed into a probability. The new predictions are obtained as:
\begin{align}
New Prediction= Previous Prediction + (Learning Rate * Prediction Residual Value)
\end{align}

In [11]:
def gbm(number_of_estimators,learn_rate,subsample):
    number_of_estimators=int(number_of_estimators)
    subsample=subsample 
    return GradientBoostingClassifier(
    n_estimators=number_of_estimators,  # Number of estimators (trees) to use
    learning_rate=learn_rate,  # Learning rate for updates
    subsample=subsample,  # Subsample ratio for training each tree
    random_state=42  # Random seed for reproducibility) 
    ).fit(train_pca, y_train)

# Hyperparameter Tuning using Bayesian Optimization
We implement Bayesian optimization, which is a type of probabilistic optimization algorithm used to find the optimal hyperparameters for a machine learning model. In this specific case, the code optimizes hyperparameters for gradient boosting classifier.

The objective_function is defined as the function that takes the hyperparameters (learning_rate, n_estimators, and subsample) and returns the mean F1 score obtained by performing cross-validation with those hyperparameters on the training data. The F1 score is a metric that combines precision and recall, and it is often used in classification tasks to evaluate the performance of a model. The pbounds dictionary specifies the ranges for the hyperparameters.The BayesianOptimization class from the bayes_opt module is used to optimize the hyperparameters. 

Finally, the best hyperparameters and the maximum F1 score achieved are printed using the max attribute of the optimizer object. This dictionary can be used to train the final gradient boosting classifier with the optimal hyperparameters and to evaluate its performance on the test data.

In [12]:
def objective_function(learning_rate, n_estimators, subsample):
    # Define gradient boosting classifier with specified hyperparameters
    clf = GradientBoostingClassifier(learning_rate=learning_rate,
                                     n_estimators=int(n_estimators),
                                    #  max_depth=int(max_depth),
                                     subsample=subsample,
                                    #  min_samples_split=int(min_ss),
                                    #  min_samples_leaf=int(min_sl),
                                     random_state=42)
    f1_scores = cross_val_score(clf, train_pca, y_train, cv=5, scoring='f1')
    # Return mean F1 score as optimization target
    return np.mean(f1_scores)

# Define bounds for hyperparameters to optimize
pbounds = {'learning_rate': (0.01, 0.1),
           'n_estimators': (100, 1000),
           'subsample': (0.5, 1.0),}
# Initialize Bayesian optimization instance with objective function and bounds
optimizer = BayesianOptimization(f=objective_function, pbounds=pbounds, random_state=42)

# Optimize hyperparameters for maximum F1 score
optimizer.maximize(init_points=5, n_iter=20)
# Print best hyperparameters and maximum F1 score
print(optimizer.max)

|   iter    |  target   | learni... | n_esti... | subsample |
-------------------------------------------------------------


KeyboardInterrupt: 

| 1         | 0.4851    | 0.04371   | 955.6     | 0.866     |

In [13]:
y_pred=gbm(956, 0.04371, 0.866).predict(test_pca)
type(y_pred), test_pca.shape 

In [None]:
y_pred= y_pred.astype(float)
y_pred= y_pred.astype(int)
y_pred_f= y_pred.reshape(-1,1)
y_pred_f, y_pred_f.shape, type(y_pred_f)

In [None]:
data_f= df1.to_numpy()
data_f

In [None]:
array1_str = [list(map(str, sublist)) for sublist in data_f[:,:1]]
array2_str = [list(map(str, sublist)) for sublist in y_pred_f]

result = [array1_str[i] + array2_str[i] for i in range(len(array1_str))]
result

In [None]:
result= np.array(result)
result, type(result), result.shape, data_f.shape, data.shape

In [None]:
x_desired = result.tolist()
x_desired, type(x_desired)

In [None]:
head_result = [['id','y']]
x_desired_final= head_result+x_desired
x_desired_final

In [None]:
filename = 'result_gbm.csv'

# Write the processed array to the CSV file
with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(x_desired_final)

print(f'Successfully wrote to {filename}.')