Importing the necessary Python libraries

In [60]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier

# Code Explanation

1. **Load the Dataset**  
   - The training dataset is loaded from `train.csv` and stored in `train_data`.
   - The test dataset is also loaded from `test.csv` and stored in `test_data`.
   - The `ID` column is separately loaded from the test dataset to create `test_ids`, which will likely be used for referencing test predictions later.

2. **Analyze the Dataset**  
   - Several columns that are not essential for modeling, including `ID`, `Customer_ID`, `Month`, `Name`, `Profession`, `Number`, and `Loan_Type`, are removed from the `train_data` DataFrame. This simplifies the dataset by retaining only the most relevant features for the predictive model.
   - The `train_data.head()` function displays the first few rows of the modified training dataset, giving a quick preview of the remaining columns.

3. **Categorical Columns**  
   - Certain columns, such as `Loan_Type`, `Credit_Mix`, `Payment_of_Min_Amount`, `Payment_Behaviour`, and `Credit_Score`, contain categorical data.
   - The `Credit_Score` column, which indicates the creditworthiness of customers, is identified as the target variable for the predictive model.


In [61]:
# Load your dataset
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
test_ids = pd.read_csv("test.csv")["ID"]
# Analyze your dataset
# ID, Customer_ID, Month, Name, Age, Profession, Number are columns to be dropped
train_data = train_data.drop(columns=["ID", "Customer_ID", "Month", "Name", "Profession", "Number", "Loan_Type"])

print(train_data.head())

# Loan_Type, Credit_Mix, Payment_of_Min_Amount, Payment_Behaviour , and Credit_Score are categorical columns
# Credit_Score is the target column

  train_data = pd.read_csv("train.csv")


  Age Income_Annual  Base_Salary_PerMonth  Total_Bank_Accounts  \
0  51     101583.48                   NaN                    5   
1  23     101926.95             8635.9125                    4   
2  49     158871.12                   NaN                    0   
3  40      60379.28                   NaN                    5   
4  17      50050.83             4085.9025                    9   

   Total_Credit_Cards  Rate_Of_Interest Total_Current_Loans  \
0                   7                10                  4_   
1                   4                 9                   1   
2                   4                 8                   1   
3                   6                18                   3   
4                  10                20                   5   

   Delay_from_due_date Total_Delayed_Payments Credit_Limit  ...  Credit_Mix  \
0                    8                      8         2.89  ...    Standard   
1                   13                      9        10.26  ...   

# Handling Missing Values

1. **Fill Missing Values with Median**  
   - The `fillna()` function is used to replace any missing values in the `train_data` DataFrame with the median of each column.
   - The `train_data.median(numeric_only=True)` calculation returns the median values of the numerical columns in `train_data`. This approach minimizes the impact of outliers, making it a robust choice for imputation.
   - `inplace=True` modifies `train_data` directly, updating it without needing to reassign it to a new variable.

2. **Preview Data After Imputation**  
   - The `print(train_data.head())` statement displays the first few rows of `train_data` to verify that missing values have been replaced.


In [62]:
# Fill missing values with median in train data
train_data.fillna(train_data.median(numeric_only=True), inplace=True)
print(train_data.head())

  Age Income_Annual  Base_Salary_PerMonth  Total_Bank_Accounts  \
0  51     101583.48           3086.683333                    5   
1  23     101926.95           8635.912500                    4   
2  49     158871.12           3086.683333                    0   
3  40      60379.28           3086.683333                    5   
4  17      50050.83           4085.902500                    9   

   Total_Credit_Cards  Rate_Of_Interest Total_Current_Loans  \
0                   7                10                  4_   
1                   4                 9                   1   
2                   4                 8                   1   
3                   6                18                   3   
4                  10                20                   5   

   Delay_from_due_date Total_Delayed_Payments Credit_Limit  ...  Credit_Mix  \
0                    8                      8         2.89  ...    Standard   
1                   13                      9        10.26  ...   

# Data Cleaning and Type Conversion

1. **Removing Underscores and Converting Columns to Numeric**  
   - A `for` loop iterates over each column in the specified list of columns that are expected to contain numeric data but may have underscore characters (e.g., `"_"`) in the values, which can interfere with numeric conversion.
   - For each column:
     - In `train_data`, the column values are first converted to strings and any underscores are removed using `str.replace("_", "", regex=False)`.
     - After removing underscores, `pd.to_numeric()` converts the cleaned string values to numeric values.
     - The `errors="coerce"` parameter ensures that any non-numeric values that remain after removing underscores are replaced with `NaN`, allowing for consistent data handling.
   - The same cleaning and conversion process is applied to each corresponding column in `test_data`.

2. **Preview of Additional Columns to Consider**  
   - The comment block below the code lists additional columns that may also require cleaning or conversion, such as `Income_Annual`, `Base_Salary_PerMonth`, and `Current_Debt_Outstanding`. These columns may need further processing to ensure they are in the correct format for analysis and modeling.


In [63]:
# Convert relevant columns to numeric after removing any underscores.This was done as I saw a value in Total_Current_Loans with an underscore.
for col in ["Total_Current_Loans", "Current_Debt_Outstanding", "Income_Annual", "Credit_Limit", "Age", "Total_Credit_Cards", "Total_Bank_Accounts", "Delay_from_due_date"]:
    train_data[col] = pd.to_numeric(
        train_data[col].astype(str).str.replace("_", "", regex=False), errors="coerce"
    )
    test_data[col] = pd.to_numeric(
        test_data[col].astype(str).str.replace("_", "", regex=False), errors="coerce"
    )
train_data
# Income_Annual, Base_Salary_PerMonth,Current_Debt_Outstanding,Ratio_Credit_Utilization, Per_Month_EMI, Monthly_Investment

# 'Total_Delayed_Payments', 'Credit_Mix',
#        'Credit_History_Age', 'Payment_of_Min_Amount', 'Monthly_Investment',
#        'Payment_Behaviour', 'Monthly_Balance'],
#       dtype='object')

Unnamed: 0,Age,Income_Annual,Base_Salary_PerMonth,Total_Bank_Accounts,Total_Credit_Cards,Rate_Of_Interest,Total_Current_Loans,Delay_from_due_date,Total_Delayed_Payments,Credit_Limit,...,Credit_Mix,Current_Debt_Outstanding,Ratio_Credit_Utilization,Credit_History_Age,Payment_of_Min_Amount,Per_Month_EMI,Monthly_Investment,Payment_Behaviour,Monthly_Balance,Credit_Score
0,51,101583.480,3086.683333,5,7,10,4,8,8,2.89,...,Standard,50.93,34.462154,24 Years and 1 Months,No,190.811017,630.0157894388726,Low_spent_Large_value_payments,314.0021934422197,Standard
1,23,101926.950,8635.912500,4,4,9,1,13,9,10.26,...,_,1058.00,39.693812,20 Years and 5 Months,No,70.587681,662.8039273360225,Low_spent_Medium_value_payments,410.1996419555151,Standard
2,49,158871.120,3086.683333,0,4,8,1,8,2,1.17,...,Good,576.48,39.367225,19 Years and 0 Months,No,86.905860,746.8059854204569,Low_spent_Medium_value_payments,742.5141542054829,Standard
3,40,60379.280,3086.683333,5,6,18,3,15,12,6.83,...,Standard,725.39,29.061701,17 Years and 1 Months,NM,90.906385,166.41865803064803,High_spent_Medium_value_payments,473.13562343490486,Standard
4,17,50050.830,4085.902500,9,10,20,5,28,,15.45,...,Bad,3419.10,30.386321,4 Years and 6 Months,Yes,190.445060,56.789441169542684,High_spent_Large_value_payments,401.3557486786916,Poor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,55,114597.040,3086.683333,7,6,4,4,32,8,10.54,...,Good,926.18,26.436313,31 Years and 9 Months,No,225.923762,327.61966834569836,High_spent_Medium_value_payments,633.131903,Poor
79996,28,8227.855,656.654583,6,8,31,-100,61,20,17.60,...,_,2695.38,24.127401,11 Years and 8 Months,NM,7352.000000,49.54415830254037,Low_spent_Medium_value_payments,268.108435,Poor
79997,46,35032.660,2853.388333,10,6,33,7,48,18,2.52,...,Bad,1789.00,25.086176,11 Years and 5 Months,Yes,150.500097,106.73567925309915,Low_spent_Small_value_payments,318.103057,Poor
79998,42,129680.280,10643.690000,8,3,5,2,27,18,16.65,...,_,240.27,33.944094,20 Years and 5 Months,NM,114.165609,567.1798727916067,High_spent_Small_value_payments,643.023518,Standard


# Converting `Credit_History_Age` to Numeric Format

1. **Extracting Numerical Part from `Credit_History_Age`**  
   - The `Credit_History_Age` column contains values in a text format such as `"1 Years and 2 Months"`. To use this data numerically, we extract the numeric portion representing the years.
   - `str.extract("(\d+)")` uses a regular expression to capture only the numeric part (the number of years) from each entry, ignoring the rest of the text. `(\d+)` matches one or more digits.

2. **Converting Extracted Data to Float**  
   - After extraction, `.astype(float)` converts the resulting values to a float data type, ensuring the column is in numeric format and can be used in calculations or models.
   - This process is applied to both `train_data` and `test_data`, ensuring consistency across both datasets.

3. **Previewing the Converted Column**  
   - The final line displays the transformed `Credit_History_Age` column from `train_data`, allowing verification of the conversion.


In [64]:
#convert credit_history_age to numeric
# Format : 1 Years and 2 Months
train_data["Credit_History_Age"] = train_data["Credit_History_Age"].str.extract("(\d+)").astype(float)
test_data["Credit_History_Age"] = test_data["Credit_History_Age"].str.extract("(\d+)").astype(float)
train_data["Credit_History_Age"]


0        24.0
1        20.0
2        19.0
3        17.0
4         4.0
         ... 
79995    31.0
79996    11.0
79997    11.0
79998    20.0
79999    17.0
Name: Credit_History_Age, Length: 80000, dtype: float64

# Handling Infinity Values and Encoding the Target Variable

1. **Replacing Infinity Values**  
   - `train_data.replace([np.inf, -np.inf], np.nan, inplace=True)` searches for any infinity (`np.inf`) or negative infinity (`-np.inf`) values in `train_data` and replaces them with `NaN`. This ensures that infinity values, which may disrupt model training, are handled appropriately.

2. **Filling Missing Values with Median**  
   - After replacing infinity values with `NaN`, `train_data.fillna(train_data.median(numeric_only=True), inplace=True)` fills any remaining `NaN` values (including those from the previous step) with the median of each numeric column. This step ensures that the dataset remains complete and without missing values.

3. **Label Encoding the Target Variable**  
   - `LabelEncoder()` is used to convert the target column `Credit_Score` into numerical format. This is especially useful if `Credit_Score` is categorical (e.g., "Good", "Average", "Bad").
   - `label_encoder.fit_transform(train_data["Credit_Score"])` transforms the categories in `Credit_Score` to numerical labels, which is necessary for most machine learning models.
   - The modified `Credit_Score` column in `train_data` now contains integer values representing each category.


In [65]:
# Replace infinity values in train data
train_data.replace([np.inf, -np.inf], np.nan, inplace=True)
train_data.fillna(train_data.median(numeric_only=True), inplace=True)

# Label encode the target variable in train data
label_encoder = LabelEncoder()
train_data["Credit_Score"] = label_encoder.fit_transform(train_data["Credit_Score"])

# Preparing Training Features and Labels

1. **Separating Features (X_train)**  
   - `X_train` is created by dropping the target column, `Credit_Score`, from `train_data`. This DataFrame now contains only the feature columns that will be used as inputs for the model.

2. **Separating Labels (y_train)**  
   - `y_train` is created by selecting only the `Credit_Score` column from `train_data`. This Series represents the target variable, containing the labels that the model will learn to predict.

Together, `X_train` and `y_train` form the feature set and label set for model training.


In [66]:
# Prepare training features and labels
X_train = train_data.drop(columns="Credit_Score")
y_train = train_data["Credit_Score"]

# Feature Engineering in `train_data`

1. **Creating `Debt_Income_Ratio`**  
   - This new feature calculates the ratio of `Current_Debt_Outstanding` to `Income_Annual`. It provides insight into the debt burden relative to income, which can be an important indicator of credit risk.

2. **Creating `Income_Credit_Limit_Ratio`**  
   - This feature is the ratio of `Income_Annual` to `Credit_Limit`. It helps assess the income capacity against the available credit limit, which can indicate how likely a customer might be to max out their credit.

3. **Creating `Debt_Credit_Limit_Ratio`**  
   - This feature is calculated as the ratio of `Current_Debt_Outstanding` to `Credit_Limit`. It measures the proportion of debt relative to the available credit limit, which can indicate the extent to which a customer relies on credit.

These new features enhance the dataset by providing additional, potentially predictive variables that may improve model performance.

In [67]:
# Feature engineering in train data
train_data["Debt_Income_Ratio"] = (
    train_data["Current_Debt_Outstanding"] / train_data["Income_Annual"]
)
train_data["Income_Credit_Limit_Ratio"] = (
    train_data["Income_Annual"] / train_data["Credit_Limit"]
)
train_data["Debt_Credit_Limit_Ratio"] = (
    train_data["Current_Debt_Outstanding"] / train_data["Credit_Limit"]
)

# Cleaning and Converting Monthly Balance and Monthly Investment Columns

1. **Inspecting Data Columns**  
   - The code includes commented-out lines to print the `Monthly_Balance` and `Monthly_Investment` columns and identify object-type columns in `X_train`. These checks can help understand the initial data format and identify any issues before conversion.

2. **Converting `Monthly_Balance` to Numeric**  
   - `X_train["Monthly_Balance"]` is converted to numeric by first converting it to a string, removing any underscores (`_`) that may be present, and then using `pd.to_numeric` for conversion.
   - `errors="coerce"` replaces any non-numeric values with `NaN`, ensuring the column is consistently numeric.

3. **Converting `Monthly_Investment` to Numeric**  
   - Similarly, `X_train["Monthly_Investment"]` is converted by removing underscores and coercing any invalid values to `NaN`.

4. **Filling Missing Values with Median**  
   - Any `NaN` values in `X_train` resulting from the conversion process are filled with the median values of the numeric columns to maintain data completeness.

5. **Applying the Same Cleaning Steps to `test_data`**  
   - The `test_data` DataFrame undergoes the same cleaning and conversion process for `Monthly_Balance` and `Monthly_Investment`, ensuring both datasets are treated consistently.

6. **Printing the Converted Columns**  
   - Finally, the code prints `Monthly_Investment` and `Monthly_Balance` from `X_train` to verify the transformations.


In [68]:
# print Monthly_Balance column
# print(X_train["Monthly_Investment"])
# print(X_train.select_dtypes(include=["object"]).columns)
# Replace all str values in Monthly_Balance, Monthly_Investment

X_train["Monthly_Balance"] = pd.to_numeric(
    X_train["Monthly_Balance"].astype(str).str.replace("_", "", regex=False), errors="coerce"
)

X_train["Monthly_Investment"] = pd.to_numeric(
    X_train["Monthly_Investment"].astype(str).str.replace("_", "", regex=False), errors="coerce"
)

# Fill missing values with median for Monthly_Balance and Monthly_Investment
X_train.fillna(X_train.median(numeric_only=True), inplace=True)

# Same for test data
test_data["Monthly_Balance"] = pd.to_numeric(
    test_data["Monthly_Balance"].astype(str).str.replace("_", "", regex=False), errors="coerce"
)

test_data["Monthly_Investment"] = pd.to_numeric(
    test_data["Monthly_Investment"].astype(str).str.replace("_", "", regex=False), errors="coerce"
)

test_data.fillna(test_data.median(numeric_only=True), inplace=True)

# print Monthly_Balance column
print(X_train["Monthly_Investment"], X_train["Monthly_Balance"])

0        630.015789
1        662.803927
2        746.805985
3        166.418658
4         56.789441
            ...    
79995    327.619668
79996     49.544158
79997    106.735679
79998    567.179873
79999    219.890035
Name: Monthly_Investment, Length: 80000, dtype: float64 0        314.002193
1        410.199642
2        742.514154
3        473.135623
4        401.355749
            ...    
79995    633.131903
79996    268.108435
79997    318.103057
79998    643.023518
79999    858.281177
Name: Monthly_Balance, Length: 80000, dtype: float64


# Data Preprocessing for Training

1. **Replacing Placeholder Values with `NaN`**  
   - `X_train.replace("-", np.nan, inplace=True)` and `X_train.replace("NM", np.nan, inplace=True)` replace any instances of `"-"` and `"NM"` in `X_train` with `NaN`. These values likely indicate missing data, so replacing them with `NaN` allows for easier handling of missing values.

2. **Identifying Categorical and Numeric Features**  
   - The list `categorical_features` is defined with known categorical columns (`Credit_Mix`, `Payment_of_Min_Amount`, `Payment_Behaviour`, `Total_Delayed_Payments`), which are converted to string type to ensure consistent processing as categorical variables.
   - `numeric_features` is identified by selecting columns of types `int64` and `float64`, and `categorical_features` is updated by selecting columns of type `object`. This dynamic selection helps to distinguish features that need different preprocessing steps.

3. **Handling Missing Values for Numeric Features**  
   - `numerical_pipeline` is created to handle missing values and standardize numeric data:
     - `SimpleImputer(strategy="median")` fills missing numeric values with the median, ensuring that outliers do not skew the imputed values.
     - `StandardScaler()` standardizes numeric values by scaling them to have a mean of 0 and standard deviation of 1, which improves model performance.

4. **Handling Missing Values and Encoding for Categorical Features**  
   - `categorical_transformer` defines the preprocessing steps for categorical features:
     - `SimpleImputer(strategy="most_frequent")` fills missing categorical values with the most common category in each column.
     - `OneHotEncoder(handle_unknown="ignore")` converts categorical variables into binary (dummy) variables, creating new columns for each unique category while ignoring unseen categories in new data.

5. **Combining Preprocessing Steps**  
   - `preprocessor` uses `ColumnTransformer` to apply the defined pipelines:
     - The `numerical_pipeline` is applied to columns in `numeric_features`.
     - The `categorical_transformer` is applied to columns in `categorical_features`.
   - This combined preprocessor ensures that each feature type receives appropriate preprocessing before model training.

In [69]:
# Step 1: Identify numeric and categorical features


X_train.replace("-", np.nan, inplace=True)
X_train.replace("NM", np.nan, inplace=True)

categorical_features = [
    "Credit_Mix",
    "Payment_of_Min_Amount",
    "Payment_Behaviour",
    "Total_Delayed_Payments"
]
X_train[categorical_features] = X_train[categorical_features].astype(str)

numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns

# Step 2: Handle Missing Values
# Use SimpleImputer to fill in missing values. Strategies: 'mean' for numerical, 'most_frequent' for categorical.

numerical_pipeline = Pipeline(
    [("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Define the preprocessing steps for numeric features
# numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

# Define the preprocessing steps for categorical features
# Using OneHotEncoder to convert categorical variables into binary (dummy) variables.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            OneHotEncoder(handle_unknown="ignore"),
        ),  #! Can change this step for different encoding methods
    ]
)

# Step 3: Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Inspecting Data Columns

1. **Printing All Columns**  
   - `print(X_train.columns)` prints the names of all columns in the `X_train` DataFrame. This provides an overview of the dataset's structure, allowing you to check which features are available for model training.

2. **Printing Numerical Columns**  
   - `print(X_train.select_dtypes(include=["int64", "float64"]).columns)` prints the names of all columns in `X_train` that have numerical data types (`int64` and `float64`). This helps identify which columns are numeric and should be treated as such during preprocessing.

3. **Printing Categorical Columns**  
   - `print(X_train.select_dtypes(include=["object"]).columns)` prints the names of all columns in `X_train` with an object data type (typically categorical variables in pandas). This helps identify which columns are categorical and will need encoding during preprocessing.

In [70]:
#Print the columns
print(X_train.columns)
# Print Numerical columns
print(X_train.select_dtypes(include=["int64", "float64"]).columns)
# Print Categorical columns
print(X_train.select_dtypes(include=["object"]).columns)

Index(['Age', 'Income_Annual', 'Base_Salary_PerMonth', 'Total_Bank_Accounts',
       'Total_Credit_Cards', 'Rate_Of_Interest', 'Total_Current_Loans',
       'Delay_from_due_date', 'Total_Delayed_Payments', 'Credit_Limit',
       'Total_Credit_Enquiries', 'Credit_Mix', 'Current_Debt_Outstanding',
       'Ratio_Credit_Utilization', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Per_Month_EMI', 'Monthly_Investment',
       'Payment_Behaviour', 'Monthly_Balance'],
      dtype='object')
Index(['Age', 'Income_Annual', 'Base_Salary_PerMonth', 'Total_Bank_Accounts',
       'Total_Credit_Cards', 'Rate_Of_Interest', 'Total_Current_Loans',
       'Delay_from_due_date', 'Credit_Limit', 'Total_Credit_Enquiries',
       'Current_Debt_Outstanding', 'Ratio_Credit_Utilization',
       'Credit_History_Age', 'Per_Month_EMI', 'Monthly_Investment',
       'Monthly_Balance'],
      dtype='object')
Index(['Total_Delayed_Payments', 'Credit_Mix', 'Payment_of_Min_Amount',
       'Payment_Behaviour'],
  

# Feature Engineering in Test Data

1. **Creating Debt-to-Income Ratio**  
   - `test_data["Debt_Income_Ratio"]` is created by dividing `Current_Debt_Outstanding` by `Income_Annual`. This new feature provides insight into how much of the individual's annual income is allocated to repaying current debt.

2. **Creating Income-to-Credit Limit Ratio**  
   - `test_data["Income_Credit_Limit_Ratio"]` is created by dividing `Income_Annual` by `Credit_Limit`. This ratio measures how the individual's income compares to their total credit limit, which can indicate financial leverage.

3. **Creating Debt-to-Credit Limit Ratio**  
   - `test_data["Debt_Credit_Limit_Ratio"]` is created by dividing `Current_Debt_Outstanding` by `Credit_Limit`. This feature shows how much of the credit limit is being used relative to outstanding debt, which can help evaluate credit utilization.

In [71]:
# Feature engineering in test data
test_data["Debt_Income_Ratio"] = (
    test_data["Current_Debt_Outstanding"] / test_data["Income_Annual"]
)
test_data["Income_Credit_Limit_Ratio"] = (
    test_data["Income_Annual"] / test_data["Credit_Limit"]
)
test_data["Debt_Credit_Limit_Ratio"] = (
    test_data["Current_Debt_Outstanding"] / test_data["Credit_Limit"]
)

# Data Type Check and Handling for Mixed Data Types

1. **Check for Mixed Data Types in Each Column**  
   - The for loop iterates through each column in `X_train` to check for columns with mixed data types. It uses `dropna()` to remove `NaN` values before checking the types of the remaining values.
   - `unique_types = set(type(x) for x in X_train[col].dropna())` creates a set of unique types found in each column. If the set has more than one type, it means the column contains mixed data types.
   - If mixed types are found, `print(f"Column '{col}' has mixed types: {unique_types}")` will output the column name and the different types found in that column.

2. **Handling Non-Numeric Values in "Monthly_Balance"**  
   - The commented-out lines attempt to convert the `Monthly_Balance` column to numeric values using `pd.to_numeric(X_train["Monthly_Balance"], errors="coerce")`. Any non-numeric values will be converted to `NaN` due to the `errors="coerce"` option.
   - The following lines are intended to fill any missing values (`NaN`) in the `Monthly_Balance` column with the median value of the column using `X_train["Monthly_Balance"].fillna(X_train["Monthly_Balance"].median(), inplace=True)`.

3. **Uncommented Lines**  
   - The code that has been commented out suggests the same approach for filling missing values in both the `X_train` and `test_data` DataFrame's `Monthly_Balance` columns.
   - These lines would ensure that any non-numeric values in the `Monthly_Balance` column are converted to `NaN` and then replaced with the median value, improving the consistency and completeness of the data.

In [72]:
# Check for mixed data types in each column
for col in X_train.columns:
    unique_types = set(type(x) for x in X_train[col].dropna())
    if len(unique_types) > 1:
        print(f"Column '{col}' has mixed types: {unique_types}")

# # Convert all values to NaN whenever str is encountered for Monthly_Balance
# X_train["Monthly_Balance"] = pd.to_numeric(X_train["Monthly_Balance"], errors="coerce")

# # Fill missing values in 'Monthly_Balance' with the median value
# X_train["Monthly_Balance"].fillna(X_train["Monthly_Balance"].median(), inplace=True)


# X_train["Monthly_Balance"] = pd.to_numeric(X_train["Monthly_Balance"], errors="coerce")
# # test_data["Monthly_Balance"] = pd.to_numeric(
# #     test_data["Monthly_Balance"], errors="coerce"
# # )

# # Fill missing values in 'Monthly_Balance' with the median value
# X_train["Monthly_Balance"].fillna(X_train["Monthly_Balance"].median(), inplace=True)
# # test_data["Monthly_Balance"].fillna(test_data["Monthly_Balance"].median(), inplace=True)

# Print Numerical Columns in X_train

- `print(X_train.select_dtypes(include=["int64", "float64"]).columns)` prints the names of all columns in the `X_train` DataFrame that have numerical data types (`int64` and `float64`).
- This helps identify which columns contain numeric data, which are typically the features that will be used for model training or scaling.

In [73]:
print(X_train.select_dtypes(include=["int64", "float64"]).columns)

Index(['Age', 'Income_Annual', 'Base_Salary_PerMonth', 'Total_Bank_Accounts',
       'Total_Credit_Cards', 'Rate_Of_Interest', 'Total_Current_Loans',
       'Delay_from_due_date', 'Credit_Limit', 'Total_Credit_Enquiries',
       'Current_Debt_Outstanding', 'Ratio_Credit_Utilization',
       'Credit_History_Age', 'Per_Month_EMI', 'Monthly_Investment',
       'Monthly_Balance'],
      dtype='object')


# Step 4: Apply Preprocessing

1. **Apply Transformations to the Training Data**  
   - `X_preprocessed = preprocessor.fit_transform(X_train)` applies the preprocessing pipeline (`preprocessor`) to the training data (`X_train`), which includes imputation, scaling, and encoding of features.
   - The `fit_transform()` method first fits the preprocessing steps to the data and then transforms it, producing the preprocessed training data in a sparse matrix format.

2. **Convert Transformed Data to DataFrame and Assign Column Names**  
   - `X_train_df = pd.DataFrame.sparse.from_spmatrix(X_preprocessed)` converts the sparse matrix `X_preprocessed` to a pandas DataFrame (`X_train_df`), which is more suitable for further analysis or model training.
   - `X_train_df.columns = preprocessor.get_feature_names_out()` assigns the transformed feature names to the DataFrame columns. `get_feature_names_out()` retrieves the names of all the features after preprocessing (e.g., one-hot encoded columns).

3. **Apply Transformations to the Test Data**  
   - `test_data_preprocessed = preprocessor.transform(test_data)` applies the same transformations (fit on `X_train`) to the test data (`test_data`). Note that `transform()` is used here, which applies the already fitted transformations without refitting the model.
   - `test_data_df = pd.DataFrame.sparse.from_spmatrix(test_data_preprocessed)` converts the transformed test data to a DataFrame format.
   - `test_data_df.columns = preprocessor.get_feature_names_out()` assigns the same feature names to the test data columns.

# Step 5: Train-Test Split for the Preprocessed Data

4. **Train-Test Split**  
   - `X_train_1, X_val_1, y_train_1, y_val = train_test_split(X_train_df, y_train, test_size=0.2, random_state=42, stratify=y_train)` splits the preprocessed training data (`X_train_df`) and corresponding labels (`y_train`) into training and validation sets.
   - The `test_size=0.2` parameter ensures 20% of the data is used for validation and 80% for training.
   - `stratify=y_train` ensures that the class distribution in the target variable (`y_train`) is preserved in both the training and validation sets.

5. **Verify the Transformations**  
   - `print("Transformed Training Data Shape:", X_train_1.shape)` prints the shape (number of rows and columns) of the transformed training data.
   - `print("Transformed Validation Data Shape:", X_val_1.shape)` prints the shape of the validation data.
   - `print("Transformed Test Data Shape:", test_data_df.shape)` prints the shape of the transformed test data.

In [74]:
# Step 4: Apply preprocessing

# Apply transformations to the training data
X_preprocessed = preprocessor.fit_transform(X_train)

# Convert to DataFrame and assign column names
X_train_df = pd.DataFrame.sparse.from_spmatrix(X_preprocessed)
X_train_df.columns = preprocessor.get_feature_names_out()

# Apply transformations to the test data and convert to DataFrame with column names
test_data_preprocessed = preprocessor.transform(test_data)
test_data_df = pd.DataFrame.sparse.from_spmatrix(test_data_preprocessed)
test_data_df.columns = preprocessor.get_feature_names_out()

# Step 5: Train-Test Split for the preprocessed data
X_train_1, X_val_1, y_train_1, y_val = train_test_split(
    X_train_df, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Verify the transformations
print("Transformed Training Data Shape:", X_train_1.shape)
print("Transformed Validation Data Shape:", X_val_1.shape)
print("Transformed Test Data Shape:", test_data_df.shape)

Transformed Training Data Shape: (64000, 646)
Transformed Validation Data Shape: (16000, 646)
Transformed Test Data Shape: (20000, 646)


# Model Initialization and Configuration

1. **Create a Dictionary to Store Models**  
   - `models = {}` initializes an empty dictionary where different models will be stored by their names as keys.

2. **XGBoost Classifier**  
   - `model = XGBClassifier(...)` creates an instance of the XGBoost classifier with the following parameters:
     - `learning_rate=0.17031088174537234`: Sets the learning rate for the model, controlling the step size at each iteration while moving toward a minimum.
     - `max_depth=9`: Defines the maximum depth of a tree in the model.
     - `n_estimators=177`: The number of boosting rounds or trees in the model.
     - `random_state=42`: Sets a seed for random number generation to ensure reproducibility of the results.
     - `eval_metric="mlogloss"`: Specifies the evaluation metric used during training; here it uses the multi-class logarithmic loss.
   
   - `models["XGBoost"] = model`: Adds the created XGBoost model to the `models` dictionary, using `"XGBoost"` as the key.

3. **Commented Out Models**  
   - Several models are commented out, indicating they were either not used for the current run or are potential alternatives:
     - **Random Forest** (`RandomForestClassifier`): A popular ensemble method based on decision trees.
     - **K Nearest Neighbors (KNN)** (`KNeighborsClassifier`): A non-parametric classifier based on the closest data points.
     - **Logistic Regression** (`LogisticRegression`): A statistical method used for binary classification.
     - **Decision Tree Classifier** (`DecisionTreeClassifier`): A tree-based model that splits data into decision nodes.
     - **Gaussian Naive Bayes** (`GaussianNB`): A probabilistic classifier based on Bayes' theorem assuming Gaussian distribution for the data.
     - **AdaBoost** (`AdaBoostClassifier`): An ensemble learning technique that combines multiple weak classifiers into one strong classifier.
     
   - Each of these commented-out models is typically added to the `models` dictionary with its respective name as the key.

4. **Base Estimator for AdaBoost**  
   - `base_estimator = DecisionTreeClassifier(max_depth=6)` sets a base estimator (a decision tree) with a maximum depth of 6, used in the AdaBoost classifier.
   - `model = AdaBoostClassifier(...)`: Creates an AdaBoost model using the base estimator, and the model would have been added to the dictionary had it been uncommented.

In [87]:
models = {}
# model = RandomForestClassifier(n_estimators=300, max_depth=30, random_state=42, min_samples_split =2,
# min_samples_leaf=1
# )
# models["Random forest"] = model
model = XGBClassifier(
    learning_rate=0.17,
    max_depth=9,
    n_estimators=177,
    random_state=42,
    eval_metric="mlogloss",
)
# models["XGBoost"] = model
# model = CatBoostClassifier(
#     iterations=1000,  # Number of boosting iterations
#     learning_rate=0.2,  # Learning rate
#     depth=10,  # Depth of trees
#     random_seed=42,  # Random state for reproducibility
#     verbose=100,  # Output training logs every 100 iterations
#     l2_leaf_reg=7,  # L2 regularization coefficient
# )
# models["CatBoost"] = model
# model = KNeighborsClassifier(n_neighbors=5)
# models["K nearest neighbours"] = model
# model = LogisticRegression(random_state=42, max_iter=500)
# models["Logistic regression"] = model
# model = DecisionTreeClassifier(max_depth=6, random_state=42)
# models["Decision tree classifier"] = model
# model = GaussianNB()
# models["Gaussian"] = model
# base_estimator = DecisionTreeClassifier(max_depth=6)
# model = AdaBoostClassifier(estimator=base_estimator, n_estimators=300, random_state=42)
# models["Adaboost"] = model

# Displaying Training Labels and Features

1. **Display `y_train`**
   - `y_train` is the target variable (credit score) for the training data. This variable will be used to train models to predict credit scores.

2. **Print the Column Names of `X_train_1`**
   - `print(X_train_1.columns)` prints the column names of the preprocessed training features (`X_train_1`) to confirm that all expected columns are present after preprocessing.

3. **Print `y_train`**
   - `print(y_train)` outputs the `y_train` series to verify that it contains the expected labels (credit scores).

4. **Check Shape of `y_train`**
   - The comment `# y_train should have 1 column of credit score` is a reminder that `y_train` should contain only a single column representing credit scores, as it is a classification target for the models.

In [76]:
y_train
print(X_train_1.columns)
print(y_train)
# y_train should have 1 column of credit score


Index(['num__Age', 'num__Income_Annual', 'num__Base_Salary_PerMonth',
       'num__Total_Bank_Accounts', 'num__Total_Credit_Cards',
       'num__Rate_Of_Interest', 'num__Total_Current_Loans',
       'num__Delay_from_due_date', 'num__Credit_Limit',
       'num__Total_Credit_Enquiries',
       ...
       'cat__Payment_of_Min_Amount_No', 'cat__Payment_of_Min_Amount_Yes',
       'cat__Payment_of_Min_Amount_nan', 'cat__Payment_Behaviour_!@9#%8',
       'cat__Payment_Behaviour_High_spent_Large_value_payments',
       'cat__Payment_Behaviour_High_spent_Medium_value_payments',
       'cat__Payment_Behaviour_High_spent_Small_value_payments',
       'cat__Payment_Behaviour_Low_spent_Large_value_payments',
       'cat__Payment_Behaviour_Low_spent_Medium_value_payments',
       'cat__Payment_Behaviour_Low_spent_Small_value_payments'],
      dtype='object', length=646)
0        2
1        2
2        2
3        2
4        1
        ..
79995    1
79996    1
79997    1
79998    2
79999    2
Name: Cred

# Hyperparameter Optimization for XGBoost Classifier Using Optuna

1. **Import Necessary Libraries**
   - `optuna`: For conducting hyperparameter optimization.
   - `XGBClassifier` from `xgboost`: The machine learning model used for classification.
   - `cross_val_score` from `sklearn.model_selection`: Used for performing cross-validation to evaluate model performance.

2. **Define Objective Function for Optuna**
   - The `objective` function defines the process Optuna will use to test and evaluate different parameter combinations.
   - `params` dictionary specifies the search space for hyperparameters:
     - `learning_rate`: A float between 0.01 and 0.2.
     - `max_depth`: An integer between 3 and 10, controlling the depth of each tree.
     - `n_estimators`: An integer between 100 and 500, determining the number of trees.
     - `subsample`: A float between 0.5 and 1.0, which defines the fraction of samples used for training each tree.
     - `colsample_bytree`: A float between 0.5 and 1.0, which specifies the fraction of features used by each tree.

3. **Initialize the XGBoost Classifier with Trial Parameters**
   - `model = XGBClassifier(**params, random_state=42, eval_metric="mlogloss")`: Creates an instance of the XGBoost model with parameters generated by the current Optuna trial.

4. **Cross-Validation to Evaluate Model**
   - `accuracy = cross_val_score(model, X_train_1, y_train_1, cv=5, scoring="accuracy").mean()` performs 5-fold cross-validation on the model, returning the mean accuracy score.

5. **Run Bayesian Optimization with Optuna**
   - `study = optuna.create_study(direction="maximize")`: Initializes the optimization process with the goal of maximizing accuracy.
   - `study.optimize(objective, n_trials=50)`: Runs the optimization for 50 trials.

6. **Display Results**
   - After optimization, print the best parameters and best accuracy:
     - `print("Best parameters found: ", study.best_params)`
     - `print("Best accuracy: ", study.best_value)`

   The commented lines provide logs from sample trials showing trial results and the parameters that yielded the best accuracy

In [84]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid for RandomizedSearch
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1, 0.15, 0.2],
    "max_depth": [3, 5, 10, 15, 20],
    "n_estimators": [300, 400, 500, 600, 700],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.2, 0.3, 0.4],
}

# Initialize the XGBoost model
xgb = XGBClassifier(random_state=42, eval_metric="mlogloss")

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=2,  # Number of parameter combinations to try
    scoring="accuracy",
    cv=5,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1,  # Use all available cores
)

# Fit the random search to the data
random_search.fit(X_train_1, y_train_1)

# Print the best parameters and the best score
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_val_1)
accuracy = accuracy_score(y_val, y_pred)
print("Test Accuracy:", accuracy)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


KeyboardInterrupt: 

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score


# Objective function for Optuna
def objective(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }

    # Initialize the classifier with trial parameters
    model = XGBClassifier(**params, random_state=42, eval_metric="mlogloss")

    # 5-fold cross-validation
    accuracy = cross_val_score(model, X_train_1, y_train_1, cv=5, scoring="accuracy").mean()
    return accuracy


# Run Bayesian Optimization with Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# [I 2024-11-03 16:53:08,619] Trial 0 finished with value: 0.7062031249999999 and parameters: {'learning_rate': 0.050893568127103876, 'max_depth': 4, 'n_estimators': 268, 'subsample': 0.6738283278380766, 'colsample_bytree': 0.6915738416826347}. Best is trial 0 with value: 0.7062031249999999.

# [I 2024-11-03 16:58:00,159] Trial 2 finished with value: 0.7632656250000001 and parameters: {'learning_rate': 0.17031088174537234, 'max_depth': 9, 'n_estimators': 177, 'subsample': 0.8174475788946926, 'colsample_bytree': 0.5291010066051618}. Best is trial 2 with value: 0.7632656250000001.

# Best parameters and score
print("Best parameters found: ", study.best_params)
print("Best accuracy: ", study.best_value)

# Model Training, Prediction, and Submission File Creation for Each Model in `models` Dictionary

1. **Loop Through Models**
   - `for key, value in models.items()`: Iterates through each model in the `models` dictionary, where `key` represents the model's name and `value` is the model instance.

2. **Try-Except Block for Error Handling**
   - Wraps the pipeline setup and predictions within a try-except block to catch and report any potential errors during the process.

3. **Define Model Pipeline**
   - `pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", value)])`: Creates a pipeline that combines data preprocessing (`preprocessor`) and the model (`classifier`), which allows both steps to be executed sequentially.

4. **Fit the Pipeline on Training Data**
   - `pipeline.fit(X_train, y_train)`: Trains the pipeline on the training data, applying the preprocessing steps followed by fitting the model.

5. **Make Predictions on Test Data**
   - `test_predictions = pipeline.predict(test_data)`: Generates predictions for the test dataset using the fitted pipeline.

6. **Convert Predictions Back to Original Labels**
   - `test_predictions_labels = label_encoder.inverse_transform(test_predictions)`: Converts encoded predictions back to the original label format using a `label_encoder`.

7. **Prepare the Submission File**
   - `submission = pd.DataFrame({"ID": test_ids, "Credit_Score": test_predictions_labels})`: Creates a DataFrame for submission with test IDs and predicted credit scores.
   - `submission.to_csv(f"submission_{key}.csv", index=False)`: Saves the DataFrame as a CSV file with the model's name included in the filename.

8. **Print Success Message**
   - `print(f"Submission file 'submission_{key}.csv' created successfully!")`: Confirms successful file creation.

9. **Handle Exceptions**
   - In case of an error, prints an error message with details of the exception: `print(f"Error : {e}")`.

In [90]:
for key, value in models.items():
    try:  
        # Define model pipeline
        pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", value)])

        # Fit the pipeline on training data
        pipeline.fit(X_train, y_train)
        
        # Make predictions on test data
        test_predictions = pipeline.predict(test_data)

        # Convert predictions back to original labels
        test_predictions_labels = label_encoder.inverse_transform(test_predictions)

        # test_predictions_encoded = label_encoder.transform(test_predictions)
        # Prepare the submission file
        submission = pd.DataFrame(
            {"ID": test_ids, "Credit_Score": test_predictions_labels}
        )
        submission.to_csv(f"submission_{key}.csv", index=False)

        print(f"Submission file 'submission_{key}.csv' created successfully!")
    except Exception as e:
        print(f"Error : {e}")