# Importing Libraries
in this part we will install all the necessary libraries on command prompt and then import the necessary functions from those libraries. 

In [1]:
# importing all the necessary libraries
import pandas as pd

from numpy import mean
import numpy as np
import time

# step 1: preprocessing
from sklearn.impute import SimpleImputer # import some strategic imputer to fill in any missing values using mean
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, Normalizer # scale all the values to one range to avoid any biasness (this bias is seen in mostly naive bayes and knn etc)

from sklearn.impute import KNNImputer # import some strategic imputer to fill missing values using KNN (finds the nearest neighbour and fills it with that value)

from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_classif, VarianceThreshold

# step 2: data division
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, GridSearchCV, ParameterGrid # to divide the code into train/test using a specific percentage or with/without replacement

# step 3: model
from catboost import CatBoostClassifier
from sklearn.ensemble import BaggingClassifier

# step 4: displaying accuracy
from sklearn.metrics import roc_auc_score, accuracy_score # to display the accuracy of our tree

# step 5: warning filter
import warnings
warnings.filterwarnings('ignore')

In [2]:
# use this block to install any libraries not on the system
# !pip install pandas
# !pip install sklearn
# python -m pip install scikit-learn lightgbm xgboost catboost

# Data Loading
data shall be loaded into variables as data sets using pandas and csv readers. they will be checked to see if they are loaded properly and will be loaded as 2 sets: train and test as per given in the kaggle data

In [3]:
# lets load the training data set
train_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger1\iml-fall-2024-challenge-1\train_set.csv")

# lets also check it by getting the first few rows of the data, there should be x1 - x78 and one target variable Y
train_data.head() 

Unnamed: 0,RecordId,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X70,X71,X72,X73,X74,X75,X76,X77,X78,Y
0,1,87.0,34.118411,0,2,0,165.1,1,829,2,...,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2,82.372284,31.57328,0,0,1,162.983897,1,724,0,...,0.033431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3,50.0,27.771653,0,0,1,165.1,1,895,2,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,4,66.236109,26.515922,0,0,1,167.009549,1,637,0,...,0.039363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,5,81.303299,20.843691,0,0,1,158.165419,0,564,0,...,0.069242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [4]:
# lets load the test data
test_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger1\iml-fall-2024-challenge-1\test_set.csv")

# check if the data has been loaded by getting the first 5 rows - there should be x1 - x78 and no target variable Y as this is test data
test_data.head() 

Unnamed: 0,RecordId,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X69,X70,X71,X72,X73,X74,X75,X76,X77,X78
0,300001,79.0,17.122318,0,0,1,170.2,1,700,0,...,0.07,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,300002,38.0,43.693579,0,0,1,165.1,1,814,0,...,0.05,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,300003,36.064225,23.998944,0,0,1,167.086735,1,662,0,...,0.006948,0.006948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,300004,61.846764,31.693449,0,3,1,182.355708,2,862,0,...,0.062613,0.033153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,300005,71.591991,20.086147,1,0,1,166.704917,2,335,0,...,0.014854,0.004854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Data Preprocessing
before we start processing this data and using algorithms, we will fix this data first, this is called data preprocessing

## Conversion of Categorical to Numerical
First we will convert categorical data to numerical data by doing one hot encoding, which turns it into binary variables

In [5]:
# one hot encoding - display it
pd.get_dummies(train_data) # this line will convert the train_data to one hot encoding but it will only display the result and not save it

Unnamed: 0,RecordId,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X70,X71,X72,X73,X74,X75,X76,X77,X78,Y
0,1,87.000000,34.118411,0,2,0,165.100000,1,829,2,...,0.040000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0
1,2,82.372284,31.573280,0,0,1,162.983897,1,724,0,...,0.033431,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0
2,3,50.000000,27.771653,0,0,1,165.100000,1,895,2,...,0.010000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0
3,4,66.236109,26.515922,0,0,1,167.009549,1,637,0,...,0.039363,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0
4,5,81.303299,20.843691,0,0,1,158.165419,0,564,0,...,0.069242,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246117,246118,65.149110,33.357948,0,0,1,156.317941,1,711,0,...,0.027152,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0
246118,246119,48.000000,46.736176,0,0,1,157.000000,1,594,2,...,0.560000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,1
246119,246120,57.472080,41.854115,1,0,0,189.868698,2,455,0,...,0.020601,0.0,0.0,0.587987,0.0,0.412013,0.0,0.0,0.412013,0
246120,246121,66.000000,23.738662,1,0,1,168.000000,2,609,0,...,-1.000000,0.0,0.0,1.000000,0.0,0.000000,0.0,0.0,0.000000,0


In [6]:
# we can see that there is no change in the number of columns meaning there is no categorical data. but for the sake of running the program. we must perform the preprocessing therefore we shall re-run the one hot encoding and save it somewhere
train_data_processed = pd.get_dummies(train_data)

# now we shall do the same on the test data so that we maintain the rules over all data
test_data_processed = pd.get_dummies(test_data)

## Data Splitting - festures and targets
the data in train_data set is of x1 - x78 columns (79 variables) and one target variable (Y). we must split that data so that we can perform data preprocessing on the features variables (will be referred to as X).

In [7]:
# so in X, it is ALL the columns EXCEPT the last column known as 'Y' (we can confirm this using the train_data.head() we did earlier) so we must get all columns and DROP only the 'y' column
X = train_data_processed.drop(columns=['Y'])
X # lets display X and see what it is now

Unnamed: 0,RecordId,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X69,X70,X71,X72,X73,X74,X75,X76,X77,X78
0,1,87.000000,34.118411,0,2,0,165.100000,1,829,2,...,0.110000,0.040000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
1,2,82.372284,31.573280,0,0,1,162.983897,1,724,0,...,0.100292,0.033431,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
2,3,50.000000,27.771653,0,0,1,165.100000,1,895,2,...,0.020000,0.010000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
3,4,66.236109,26.515922,0,0,1,167.009549,1,637,0,...,0.108249,0.039363,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
4,5,81.303299,20.843691,0,0,1,158.165419,0,564,0,...,0.164645,0.069242,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246117,246118,65.149110,33.357948,0,0,1,156.317941,1,711,0,...,0.088610,0.027152,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
246118,246119,48.000000,46.736176,0,0,1,157.000000,1,594,2,...,-1.000000,0.560000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
246119,246120,57.472080,41.854115,1,0,0,189.868698,2,455,0,...,0.032961,0.020601,0.0,0.0,0.587987,0.0,0.412013,0.0,0.0,0.412013
246120,246121,66.000000,23.738662,1,0,1,168.000000,2,609,0,...,0.020000,-1.000000,0.0,0.0,1.000000,0.0,0.000000,0.0,0.0,0.000000


In [8]:
# so as per our X output, we can see that number of columns in train_data is 79 and number of columns in X is 78 meaning we have successfully performed our removal of target variable
# now to get the target variable alone, we can just get it alone,
Y = train_data_processed['Y']
Y # lets see what it is
# as per our Y output, we can see it is of one column and 246k rows which means we have successfully extracted the target variable column

0         0
1         0
2         0
3         0
4         0
         ..
246117    0
246118    1
246119    0
246120    0
246121    0
Name: Y, Length: 246122, dtype: int64

## Data Imputation 
many cells in our data may be empty - we must fill these cells with data. we have multiple options to deal with them:
- we remove the entire rows (Case 1)
- we fill the cells with the average of the column (Case 2)
- we fill the cells based on KNN imputation (nearest neighbour) (Case 3)

In [9]:
# REMOVE ROWS 
# ----------------------------- case  -----------------------------
# in this case, lets remove the entire rows that have NaN values. before saving the removed rows data set, lets first run it and display it to see the outcome, then we shall save in X
# X.dropna(axis=0)

In [10]:
# REMOVE ROWS
# # so we originally had 246122 rows and now after removing empty cell rows we have 239650 rows which is a 6472 rows difference. as our first try, lets work with it. lets assign this data set in place of X
# X = X.dropna(axis=0)
# X
# these above 2 lines were commented out as there was an error handling, rows were being removed from X and not from Y so we fixed it by removing from train_data and then splitting into X and Y
# train_data_processed = train_data_processed.dropna(axis=0)

In [11]:
# Average Mean Imputation
# ----------------------------- case  -----------------------------
# this will fill all the empty spaces using the average of all the spaces
imputer = SimpleImputer(strategy='mean')

In [12]:
# KNN Imputation
# ----------------------------- case -----------------------------
# this fills them in using k-nearest neighbours of all the spaces
# imputer = KNNImputer(n_neighbors=7)

In [13]:
X = imputer.fit_transform(X)                                        # fill them in X
test_data_processed = imputer.fit_transform(test_data_processed)    # fill them in test data

## Data Scaling
some columns may be very large then other columns when compared. it would not affect at the moment as we are using decision trees, but to maintain a fair enviroment, we shall perform scaling on every run.
there are two types of scaling: 
- min max scaling (also known as normalization)
- standardisation (z-score normalization)
- max abs scaler
- robust scaler
- normalizer

In [14]:
# ----------------------------- case  -----------------------------
# in this case we shall perform min max scaling. to do that, we must use our MinMaxScaler that we have imported above
# scaler = MinMaxScaler()
# # now we must use this scaler to scale X
# scaler.fit_transform(X)

In [15]:
# ----------------------------- case  -----------------------------
scaler = MaxAbsScaler()
# now we must use this scaler to scale X
scaler.fit_transform(X)

array([[4.06302565e-06, 9.77528090e-01, 5.03110176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [8.12605131e-06, 9.25531276e-01, 4.65579663e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.21890770e-05, 5.61797753e-01, 4.09520864e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [9.99991874e-01, 6.45753712e-01, 6.17180876e-01, ...,
        0.00000000e+00, 0.00000000e+00, 4.12013395e-01],
       [9.99995937e-01, 7.41573034e-01, 3.50050368e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 5.64692584e-01, 4.82989245e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [16]:
# our output shows us that every value in the array is between 0 and 1. thus lets save this value on X
X = scaler.fit_transform(X)

# now we must do the same on our test_data set
test_data_processed = scaler.fit_transform(test_data_processed)

# Filters
there are two types of filters to filter out columns/features:
- variance filter (a column which has same values throughout the column like all are sunny)
- correlation filter (two columns which are same like weight in kg and weight in pounds)

In [17]:
print("X : ", X.shape)
print("test data : ", test_data_processed.shape)

X :  (246122, 78)
test data :  (105482, 78)


In [18]:
# variance filter
# ----------------------------- case  -----------------------------
# variance_filter = VarianceThreshold(threshold=0.001)  # Adjust the threshold if needed
# X = variance_filter.fit_transform(X)
# test_data_processed = variance_filter.fit_transform(test_data_processed)
X.shape

(246122, 78)

In [19]:
test_data_processed.shape

(105482, 78)

In [20]:
# # correlation filter
# # ----------------------------- case  -----------------------------
# corr_matrix = pd.DataFrame(X).corr().abs()
# upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
# X = pd.DataFrame(X).drop(columns=to_drop)
# test_data_processed = pd.DataFrame(test_data_processed).drop(columns=to_drop)
X.shape

(246122, 78)

In [21]:
test_data_processed.shape

(105482, 78)

## Data Splitting - train and validate
now our test_data set is of rows with NO target variable whereas the train_data set is WITH target variable.
our rules in machine learning is that we must train half or 70% of the data and then we must check its accuracy using the remaining half or 30% of the data - we can only check accuracy IF we have the answers i.e. the target variable. 
So, what we need to do is, is split the train_data set into 2, by a 70% and 30% ratio. we train the model using the 70% and then test the model using the 30% and then use that model to predict the test_data set.

In [22]:
# holdout method
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)

## model intialization
here model is intialized

In [23]:
model = CatBoostClassifier()
# --
# model = CatBoostClassifier()                                      # case 107
# model = CatBoostClassifier(n_estimators=100)                      # case 109
# model = CatBoostClassifier(loss_function='Logloss', depth=10)     # case 111

# Feature Selection
in this we select columns and features we want to keep. there are several algos to do so:
- forward selection
- backward selection
- Kbest (best out of all)

In [24]:
# forward selection
# ----------------------------- case -----------------------------
# selection = SequentialFeatureSelector(model, direction='forward',n_features_to_select=10, scoring='roc_auc')

In [25]:
# # backward selection
# selection = SequentialFeatureSelector(model, direction='backward',n_features_to_select=5, scoring='roc_auc', n_jobs=-1)

In [26]:
# k best
# ----------------------------- case -----------------------------
# selection = SelectKBest(score_func=f_classif, k=5)             # Use f_classif for classification

In [27]:
# feature selection fitting
# trainX = selection.fit_transform(trainX, trainY)

In [28]:
# feature selection applying
# testX = selection.transform(testX)                                  # Ensure the test set is transformed similarly
# test_data_processed = selection.transform(test_data_processed)      # test data is also transformed
# X = selection.transform(X)                                          # full data transforming

In [29]:
trainX.shape

(172285, 78)

# Grid Search

In [30]:
# define hyper parameters of grid
param_grid = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

In [31]:
# intialize grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='roc_auc', verbose=2)

In [32]:
# fit the model
grid_search.fit(trainX, trainY)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Learning rate set to 0.078094
0:	learn: 0.4898510	total: 166ms	remaining: 2m 45s
1:	learn: 0.3472742	total: 201ms	remaining: 1m 40s
2:	learn: 0.2483066	total: 232ms	remaining: 1m 17s
3:	learn: 0.1808429	total: 265ms	remaining: 1m 5s
4:	learn: 0.1340217	total: 287ms	remaining: 57.1s
5:	learn: 0.1022839	total: 308ms	remaining: 51s
6:	learn: 0.0783257	total: 327ms	remaining: 46.3s
7:	learn: 0.0629775	total: 343ms	remaining: 42.6s
8:	learn: 0.0514156	total: 358ms	remaining: 39.4s
9:	learn: 0.0426649	total: 374ms	remaining: 37s
10:	learn: 0.0366052	total: 389ms	remaining: 35s
11:	learn: 0.0321187	total: 415ms	remaining: 34.2s
12:	learn: 0.0287551	total: 440ms	remaining: 33.4s
13:	learn: 0.0260801	total: 474ms	remaining: 33.4s
14:	learn: 0.0241519	total: 506ms	remaining: 33.2s
15:	learn: 0.0225070	total: 529ms	remaining: 32.5s
16:	learn: 0.0212570	total: 551ms	remaining: 31.9s
17:	learn: 0.0202468	total: 575ms	remaining: 31.4s
18:	

In [33]:
# display the best model grid search found
best_model = grid_search.best_estimator_
best_model

<catboost.core.CatBoostClassifier at 0x2127ec6f290>

In [34]:
# display the best parameters of the best model
best_parameters = grid_search.best_params_
best_parameters

{'max_depth': 1}

In [44]:
# assign the best model our model
model = best_model

## Bagging intialization
here we will introduce and intialize bagging

In [45]:
model = BaggingClassifier(estimator=model, n_estimators=50, verbose=2)
# -- 
# model = BaggingClassifier(estimator=model, n_estimators=50)                   # case 109
# model = BaggingClassifier(estimator=model, n_estimators=100)                  # case 

## model running
here we run the model

In [46]:
# fit the model
model.fit(trainX, trainY)

Building estimator 1 of 50 for this parallel run (total 50)...
Learning rate set to 0.092856
0:	learn: 0.4587484	total: 56.5ms	remaining: 56.4s
1:	learn: 0.3056354	total: 80.3ms	remaining: 40.1s
2:	learn: 0.2070264	total: 104ms	remaining: 34.4s
3:	learn: 0.1447655	total: 123ms	remaining: 30.6s
4:	learn: 0.1038405	total: 139ms	remaining: 27.7s
5:	learn: 0.0779201	total: 158ms	remaining: 26.3s
6:	learn: 0.0603145	total: 181ms	remaining: 25.7s
7:	learn: 0.0485162	total: 202ms	remaining: 25s
8:	learn: 0.0398322	total: 227ms	remaining: 25s
9:	learn: 0.0338107	total: 255ms	remaining: 25.3s
10:	learn: 0.0292597	total: 285ms	remaining: 25.6s
11:	learn: 0.0262451	total: 310ms	remaining: 25.5s
12:	learn: 0.0240446	total: 334ms	remaining: 25.3s
13:	learn: 0.0222247	total: 353ms	remaining: 24.9s
14:	learn: 0.0208682	total: 375ms	remaining: 24.6s
15:	learn: 0.0199855	total: 394ms	remaining: 24.3s
16:	learn: 0.0191891	total: 415ms	remaining: 24s
17:	learn: 0.0185714	total: 437ms	remaining: 23.8s
18:

In [47]:
# predict using this model
y_pred = model.predict(testX)

In [48]:
# display the accuracy of this prediction
accuracy = accuracy_score(testY, y_pred)
print("model accuracy = ", accuracy, "   ")

# now lets calculate the ROC AUC score according to this prediction
roc_score = roc_auc_score(testY, y_pred)
print("roc score = ", roc_score, "   ")

model accuracy =  0.9976028278505357    
roc score =  0.5496287205207183    


## predict for test dataset
fit the model and predict for test dataset

In [49]:
model.fit(X, Y)

Building estimator 1 of 50 for this parallel run (total 50)...
Learning rate set to 0.108132
0:	learn: 0.4258338	total: 42.4ms	remaining: 42.3s
1:	learn: 0.2647553	total: 76.5ms	remaining: 38.1s
2:	learn: 0.1622779	total: 121ms	remaining: 40.1s
3:	learn: 0.1094792	total: 163ms	remaining: 40.5s
4:	learn: 0.0776115	total: 204ms	remaining: 40.6s
5:	learn: 0.0576706	total: 243ms	remaining: 40.2s
6:	learn: 0.0452446	total: 279ms	remaining: 39.6s
7:	learn: 0.0364759	total: 313ms	remaining: 38.8s
8:	learn: 0.0305244	total: 348ms	remaining: 38.4s
9:	learn: 0.0264148	total: 378ms	remaining: 37.4s
10:	learn: 0.0237963	total: 417ms	remaining: 37.5s
11:	learn: 0.0218026	total: 459ms	remaining: 37.8s
12:	learn: 0.0203426	total: 489ms	remaining: 37.1s
13:	learn: 0.0193562	total: 523ms	remaining: 36.9s
14:	learn: 0.0185814	total: 558ms	remaining: 36.7s
15:	learn: 0.0179952	total: 593ms	remaining: 36.5s
16:	learn: 0.0176338	total: 626ms	remaining: 36.2s
17:	learn: 0.0172481	total: 661ms	remaining: 36s

In [50]:
test_prediction = model.predict_proba(test_data_processed)

test_prediction=test_prediction[:, 1]

print(test_prediction)

[0.00128359 0.00164254 0.00012212 ... 0.00092187 0.00040369 0.00063722]


## write into csv
now we write the predictions into the csv file

In [51]:
sample_data = pd.read_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger1\iml-fall-2024-challenge-1\sample_submission.csv")

sample_data['Y'] = test_prediction
sample_data

sample_data.to_csv(r"D:\Users\DELL\OneDrive - Institute of Business Administration\IBA\sem5\machine learning\ipynb notebooks\challenger1\iml-fall-2024-challenge-1\cat1.csv", index=False)
sample_data

Unnamed: 0,RecordId,Y
0,300001,0.001284
1,300002,0.001643
2,300003,0.000122
3,300004,0.000745
4,300005,0.000702
...,...,...
105477,405478,0.000144
105478,405479,0.219081
105479,405480,0.000922
105480,405481,0.000404


In [52]:
model