# Project Objective

This notebook aims to prepare and explore the `emi_prediction_dataset.csv` to support building a reliable model for predicting EMI (Equated Monthly Installment) eligibility.

Goals:
- Perform exploratory data analysis to understand distributions, missingness, and relationships between features and the target `emi_eligibility`.
- Clean and normalize raw fields (fix types, remove formatting issues), impute missing values, and encode categorical variables.
- Create robust feature engineering to capture affordability and repayment capacity (e.g. debt ratios, net income after expenses).
- Produce a reproducible preprocessing pipeline that can be reused for modeling and inference.

Deliverables from this notebook:
- Cleaned and typed dataset ready for modeling.
- A documented preprocessing pipeline (transformations, imputations, encodings).
- Summary EDA plots and tables describing key predictors and data quality issues.

Assumptions & Notes:
- The data contains inconsistent string/number formats (e.g. `monthly_salary`, `bank_balance`, `age`) and missing values; these will be corrected and imputed.
- Class balance for `emi_eligibility` will be checked and handled during modeling in downstream notebooks.

Next steps: finalize preprocessing, persist the cleaned dataset, and move to model training and evaluation notebooks.

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import warnings
warnings.filterwarnings('ignore')

In [0]:
emi_pred=pd.read_csv("files/emi_prediction_dataset.csv", low_memory=False)

In [0]:
emi_pred.head()

In [0]:
emi_pred.shape

In [0]:
emi_pred.info()

In [0]:
emi_pred.columns

In [0]:
col_x=['age', 'gender', 'marital_status', 'education', 'monthly_salary',
       'employment_type', 'years_of_employment', 'company_type', 'house_type',
       'monthly_rent', 'family_size', 'dependents', 'school_fees',
       'college_fees', 'travel_expenses', 'groceries_utilities',
       'other_monthly_expenses', 'existing_loans', 'current_emi_amount',
       'credit_score', 'bank_balance', 'emergency_fund', 'emi_scenario',
       'requested_amount', 'requested_tenure']

In [0]:
emi_pred["emi_eligibility"].value_counts()

In [0]:
emi_pred[emi_pred["emi_eligibility"]=="Not_Eligible"].head()

In [0]:
emi_pred[emi_pred["emi_eligibility"]=="Eligible"].head()

In [0]:
emi_pred.isna().sum()

In [0]:
emi_pred[emi_pred["education"].isna()].head()

In [0]:
emi_pred["education"].value_counts()

In [0]:
emi_pred[emi_pred["monthly_rent"].isna()].head()

In [0]:
emi_pred[emi_pred["credit_score"].isna()].head()

In [0]:
emi_pred[emi_pred["bank_balance"].isna()].head()

In [0]:
emi_pred[emi_pred["emergency_fund"].isna()].head()

In [0]:
emi_pred.dtypes

In [0]:
emi_pred[emi_pred["bank_balance"].isna()]

In [0]:
emi_pred[pd.notna(emi_pred["bank_balance"])==False]

In [0]:
def type_corrector(x):
    """Coerce a raw cell value to an ``int``; return ``np.nan`` when impossible.

    String inputs are cut at the first ``"."`` before conversion, which both
    truncates the fractional part and tolerates values carrying more than one
    dot (e.g. ``"45000.0.0"``). Real numeric inputs are truncated directly
    instead of being round-tripped through ``str``; otherwise a float whose
    repr uses scientific notation (``1.5e16`` -> ``"1.5e+16"``) would be cut
    down to ``1``.

    Parameters
    ----------
    x : Any
        A scalar cell value (string, number or missing marker).

    Returns
    -------
    int or float
        The truncated integer, or ``np.nan`` for missing, unparseable or
        non-finite input.
    """
    # Missing stays missing
    if pd.isna(x):
        return np.nan
    try:
        # bool is a subclass of int; keep treating it as unparseable text
        is_number = isinstance(x, (int, float, np.integer, np.floating))
        if is_number and not isinstance(x, bool):
            return int(x)
        # Keep only the text before the first dot, then convert
        return int(float(str(x).split(".")[0]))
    except (ValueError, OverflowError):
        # OverflowError covers "inf"/infinite values, which int() cannot hold
        return np.nan

In [0]:
# Monthly salary should be numeric (int or float) but is loaded as object dtype, so it is converted to int here. Some values contain multiple "." characters; type_corrector handles these by keeping only the text before the first ".".

emi_pred["monthly_salary"] = emi_pred["monthly_salary"].apply(type_corrector)

In [0]:
emi_pred["bank_balance"] = emi_pred["bank_balance"].apply(type_corrector)

In [0]:
emi_pred["age"] = emi_pred["age"].apply(type_corrector)

In [0]:
emi_pred.dtypes

In [0]:
x_data=emi_pred[col_x].copy()

In [0]:
emi_pred

In [0]:
# Formatting string columns to have consistent capitalization and removing leading/trailing spaces
x_data[x_data.select_dtypes(include=['object']).columns]=x_data[x_data.select_dtypes(include=['object']).columns].apply(lambda x: x.str.strip().str.title())

In [0]:
x_data.select_dtypes(include=['object']).columns

In [0]:
numeric_cols=x_data.select_dtypes(include=['int64', 'float64']).columns

In [0]:
x_data.nunique()

In [0]:
x_data.replace({"existing_loans":{"Yes":1, "No":0}}, inplace=True)

In [0]:
x_data.replace({"marital_status":{"Married":1, "Single":0}}, inplace=True)

In [0]:
x_data["gender"].value_counts()

In [0]:
x_data.replace({"gender":{"M":1, "F":0, "Male":1, "Female":0}}, inplace=True)

In [0]:
x_data["gender"].value_counts()

In [0]:
x_data["emi_scenario"].value_counts()

### Categorical vs. Categorical Association: Chi-Square Test

In [0]:
pd.crosstab(x_data["gender"], x_data["emi_scenario"])

In [0]:
def chi_test(col: list, x_data=None):
    """Run a chi-square test of independence for every ordered pair of columns.

    Both ``A VS B`` and ``B VS A`` are reported, so each unordered pair
    appears twice in the output.

    Parameters
    ----------
    col : list
        Names of the categorical columns to compare pairwise.
    x_data : pandas.DataFrame, optional
        Frame holding the columns. When omitted, the notebook-level
        ``x_data`` is looked up at call time (the previous default froze
        whichever object was bound when the cell defining this function ran).

    Returns
    -------
    pandas.DataFrame
        One row per ordered pair with the columns ``Col1``, ``Col2``,
        ``Columns``, ``Chi2_statistic``, ``p_value``, ``Degrees_of_freedom``
        and ``Expected_freq``.

    Raises
    ------
    ValueError
        If ``x_data`` is omitted and no notebook-level ``x_data`` exists.
    """
    from scipy.stats import chi2_contingency

    if x_data is None:
        x_data = globals().get("x_data")
        if x_data is None:
            raise ValueError("x_data must be provided (no notebook-level x_data found)")

    output_columns = ["Col1", "Col2", "Columns", "Chi2_statistic",
                      "p_value", "Degrees_of_freedom", "Expected_freq"]
    records = []
    for first in col:
        for second in col:
            if first == second:
                continue
            contingency_table = pd.crosstab(x_data[first], x_data[second])
            chi2_stat, p_value, dof_val, expected_val = chi2_contingency(contingency_table)
            records.append({
                "Col1": f"{first}",
                "Col2": f"{second}",
                "Columns": f"{first} VS {second}",
                "Chi2_statistic": chi2_stat,
                "p_value": p_value,
                "Degrees_of_freedom": dof_val,
                "Expected_freq": expected_val,
            })
    # Passing the column list keeps the schema even when no pair was tested
    return pd.DataFrame(records, columns=output_columns)



In [0]:
def chi_test_optimized(cat_cols: list, x_data):
    """Chi-square test of independence for every unordered pair of columns.

    Each pair is tested once (no ``A VS B`` / ``B VS A`` duplicates). Pairs
    whose contingency table has fewer than two rows or columns, or for which
    ``chi2_contingency`` raises ``ValueError``, are skipped.

    Parameters
    ----------
    cat_cols : list
        Names of the categorical columns to compare pairwise.
    x_data : pandas.DataFrame
        Frame holding the columns.

    Returns
    -------
    pandas.DataFrame
        One row per tested pair, sorted by ``Chi2_statistic`` descending, with
        the columns ``Col1``, ``Col2``, ``Columns``, ``Chi2_statistic``,
        ``p_value``, ``Degrees_of_freedom``, ``Expected_freq``,
        ``Significant`` ("Yes" when p < 0.05) and ``Cramers_V``. The frame is
        empty (but keeps these columns) when no pair could be tested.
    """
    from scipy.stats import chi2_contingency
    import pandas as pd
    import numpy as np
    from itertools import combinations

    result_columns = ["Col1", "Col2", "Columns", "Chi2_statistic", "p_value",
                      "Degrees_of_freedom", "Expected_freq", "Significant", "Cramers_V"]
    results = []

    # Loop through unique pairs only (no A vs B and B vs A duplicates)
    for col1, col2 in combinations(cat_cols, 2):
        contingency_table = pd.crosstab(x_data[col1], x_data[col2])
        n_rows, n_cols = contingency_table.shape

        # A test needs at least a 2x2 table
        if n_rows < 2 or n_cols < 2:
            continue

        try:
            chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
        except ValueError:
            continue  # table rejected by scipy (e.g. a zero expected frequency)

        # Cramér's V = sqrt(chi2 / (n * min(r - 1, c - 1))). n is the number of
        # observations actually counted in the table (crosstab omits rows with
        # a missing value), and the divisor is min(r - 1, c - 1), not the
        # degrees of freedom (r - 1) * (c - 1).
        n_obs = contingency_table.to_numpy().sum()
        cramers_v = np.sqrt(chi2_stat / (n_obs * min(n_rows - 1, n_cols - 1)))

        results.append({
            "Col1": col1,
            "Col2": col2,
            "Columns": f"{col1} VS {col2}",
            "Chi2_statistic": chi2_stat,
            "p_value": p_value,
            "Degrees_of_freedom": dof,
            "Expected_freq": expected,
            "Significant": "Yes" if p_value < 0.05 else "No",
            "Cramers_V": cramers_v,
        })

    # Explicit columns keep the schema stable when every pair was skipped
    chi_result = pd.DataFrame(results, columns=result_columns)

    # Sort by strongest relationships
    chi_result = chi_result.sort_values(by="Chi2_statistic", ascending=False).reset_index(drop=True)

    return chi_result


In [0]:
emi_pred.columns

In [0]:
emi_pred_new=x_data.copy()
emi_pred_new[['emi_eligibility', 'max_monthly_emi']]=emi_pred[['emi_eligibility', 'max_monthly_emi']]

In [0]:
emi_pred_new.head(1)

In [0]:
emi_pred["emi_scenario"].value_counts()

In [0]:
emi_pred.select_dtypes(include=['object']).head(2)

In [0]:
categorical_columns = [
    'gender', 'marital_status', 'education', 'employment_type', 'company_type', 'house_type','emi_eligibility','existing_loans','emi_scenario'
]

In [0]:
chi_result=chi_test(categorical_columns, x_data=emi_pred_new)

In [0]:
chi_result[chi_result["p_value"]<0.05]

In [0]:
chi_result=chi_test_optimized(categorical_columns, x_data=emi_pred_new)

In [0]:
chi_result[chi_result["p_value"]<0.05]

**Chi test Findings:** From the chi-square test results, we observe that several categorical features show significant associations with the target variable `emi_eligibility`. Notably, `employment_type`, `marital_status`, `house_type`, `education` and `existing_loans` have p-values less than 0.05, indicating a statistically significant relationship with EMI eligibility. This suggests that these features could be important predictors in our modeling efforts.

In [0]:
emi_pred_new.dtypes

In [0]:
numeric_cols

In [0]:
emi_pred_new[categorical_columns]

In [0]:
plt.figure(figsize=(15,8))
sns.heatmap(emi_pred_new.select_dtypes(["float64", "int64"]).corr(method='spearman'), annot=True, cmap='coolwarm');

In [0]:
anova_test_columns=[
    'age', 'monthly_salary', 'years_of_employment', 'monthly_rent',
    'family_size', 'dependents', 'school_fees', 'college_fees',
    'travel_expenses', 'groceries_utilities', 'other_monthly_expenses',
    'current_emi_amount', 'credit_score', 'bank_balance',
    'emergency_fund', 'requested_amount', 'requested_tenure', 'max_monthly_emi']

In [0]:
emi_pred_new[anova_test_columns]

#### Anova test against numeric columns vs emi_eligibility
- Anova test H0: There is no significant difference in the means of the numeric feature across different levels of `emi_eligibility`.
- Anova test H1: There is a significant difference in the means of the numeric feature across different levels of `emi_eligibility`.
- ALPHA = 0.05

In [0]:
from scipy.stats import f_oneway
emi_pred_new[anova_test_columns]

In [0]:
anova_test_columns

In [0]:
emi_pred_new["emi_eligibility"].unique()

In [0]:
# One `age` sample per eligibility class. Missing ages are dropped first because
# f_oneway propagates NaN into the F-statistic and p-value.
check1=emi_pred_new[emi_pred_new["emi_eligibility"]=="Eligible"]["age"].dropna()
check2=emi_pred_new[emi_pred_new["emi_eligibility"]=="Not_Eligible"]["age"].dropna()
check3=emi_pred_new[emi_pred_new["emi_eligibility"]=="High_Risk"]["age"].dropna()


In [0]:
f_oneway(check1, check2, check3)

In [0]:
# Defining the ANOVA test function

from scipy.stats import f_oneway

def anova_test(column: list, data=None, target='emi_eligibility'):
    """Run a one-way ANOVA of each numeric column against a categorical target.

    For every column, the values are split by the levels of ``target``
    (missing values removed) and passed to ``scipy.stats.f_oneway``.

    Parameters
    ----------
    column : list
        Names of the numeric columns to test. ``target`` itself is skipped if
        it appears in the list.
    data : pandas.DataFrame, optional
        Frame holding the columns. When omitted, the notebook-level
        ``emi_pred_new`` is looked up at call time.
    target : str, default 'emi_eligibility'
        Name of the categorical grouping column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Significance`` ('Significant' when p < 0.05, otherwise
        'Not Significant'), ``Feature``, ``F-statistic`` and ``p-value``.

    Raises
    ------
    ValueError
        If ``data`` is omitted and no notebook-level ``emi_pred_new`` exists.
    """
    if data is None:
        data = globals().get("emi_pred_new")
        if data is None:
            raise ValueError("data must be provided (no notebook-level emi_pred_new found)")

    # Group by the requested target (previously hard-coded to 'emi_eligibility').
    # NaN levels are excluded: `== NaN` never matches, so they would only
    # contribute an empty group.
    levels = data[target].dropna().unique()

    anova_results = []
    for col in column:
        if col == target:
            continue
        groups = [data.loc[data[target] == level, col].dropna() for level in levels]
        f_stat, p_value = f_oneway(*groups)
        anova_results.append({
            'Feature': col,
            'F-statistic': f_stat,
            'p-value': p_value
        })

    results = pd.DataFrame(anova_results, columns=['Feature', 'F-statistic', 'p-value'])
    # A NaN p-value compares False and is therefore labelled 'Not Significant'
    significance = np.where(results['p-value'] < 0.05, 'Significant', 'Not Significant')
    results.insert(0, 'Significance', significance)
    return results

In [0]:
anova_test_results=anova_test(anova_test_columns, data=emi_pred_new)

In [0]:
anova_test_results.sort_values(by="F-statistic", ascending=False)

- If p-value < ALPHA, we reject H0 and accept H1, indicating that the numeric feature has a significant effect on `emi_eligibility`.
- The `age` column shows no significant difference across `emi_eligibility` classes (p-value > 0.05).
- All other numeric columns are significant with `emi_eligibility` as p-value < 0.05
- However, some columns are more significant than others based on the F-statistic value. Higher F-statistic values indicate a stronger effect of the numeric feature on `emi_eligibility`.
- columns like `max_monthly_emi`, `requested_amount`, `bank_balance`, `credit_score` have higher F-statistic values, suggesting they are more influential in determining EMI eligibility compared to columns like `age` and `dependents`.

In [0]:
emi_pred_new.columns

In [0]:
fixed_expenses=['school_fees', 'college_fees', 'current_emi_amount']

In [0]:
otherexpenses_col=['travel_expenses', 'groceries_utilities','other_monthly_expenses'] 

In [0]:
emi_pred_new["Total_Fixed_expenses"]=emi_pred_new[fixed_expenses].sum(axis=1)
emi_pred_new["Total_Other_expenses"]=emi_pred_new[otherexpenses_col].sum(axis=1)

In [0]:
plt.figure(figsize=(20,8))
sns.heatmap(emi_pred_new.select_dtypes(["float64", "int64"]).corr(method='spearman'), annot=True, cmap='coolwarm');

In [0]:
emi_pred_new['emi_eligibility'].value_counts()

In [0]:
emi_pred_new[emi_pred_new["emi_eligibility"]=="Not_Eligible"].describe()

In [0]:
emi_pred_new[emi_pred_new["emi_eligibility"]=="Eligible"].describe()

In [0]:
emi_pred_new[emi_pred_new["emi_eligibility"]=="High_Risk"].describe()

In [0]:
emi_pred_new.isna().sum()

In [0]:
target_cols=['emi_eligibility', 'max_monthly_emi']

In [0]:
emi_pred_new.select_dtypes(include=['object']).columns

In [0]:
from sklearn.preprocessing import OrdinalEncoder
encoding_columns=['education', 'employment_type', 'company_type', 'house_type',
       'emi_scenario']

encoder=OrdinalEncoder()
emi_pred_new[encoding_columns]=encoder.fit_transform(emi_pred_new[encoding_columns])

In [0]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
imputer.fit(emi_pred_new.drop(columns=target_cols, axis=1))



In [0]:
new_values=imputer.transform(emi_pred_new.drop(columns=target_cols, axis=1))

In [0]:
# Wrap the imputer's ndarray output in a labelled frame. The original row index
# is carried over so that any later index-aligned column assignment from
# `emi_pred_new` matches rows correctly even if the index is not 0..n-1.
new_data=pd.DataFrame(new_values, columns=emi_pred_new.drop(columns=target_cols).columns, index=emi_pred_new.index)

In [0]:
emi_pred_new

In [0]:
new_data[target_cols]=emi_pred_new[target_cols]
new_data

In [0]:
new_data["emi_eligibility"].value_counts()

In [0]:
# Encode the target classes as integers. The result is assigned back to the
# column explicitly: replace(..., inplace=True) on `new_data[...]` acts on an
# intermediate object and is not guaranteed to update `new_data` itself
# (pandas copy-on-write / chained-assignment behaviour).
new_data["emi_eligibility"] = new_data["emi_eligibility"].replace({"Not_Eligible":0, "Eligible":1, "High_Risk":2})
new_data["emi_eligibility"].value_counts()


In [0]:
new_data["Savings"] = new_data["bank_balance"] + new_data["emergency_fund"]
new_data["Savings"]

* Dataset separation for model training. Two datasets are created:
- One that includes `Total_Fixed_expenses` and `Total_Other_expenses` (plus `Savings`) while excluding the columns that were used to create these features.
- The other includes the initial features but excludes the newly engineered features.

In [0]:
plt.figure(figsize=(30,10))
sns.heatmap(new_data.corr(), annot=True);

In [0]:
new_data.columns

In [0]:
len(new_data.columns)

In [0]:
fixed_expenses

In [0]:
otherexpenses_col

In [0]:
final_columns_v1 = [
    'age', 'gender', 'marital_status', 'education', 'monthly_salary',
       'employment_type', 'years_of_employment', 'company_type', 'house_type',
       'monthly_rent', 'family_size', 'dependents', 'school_fees',
       'college_fees', 'travel_expenses', 'groceries_utilities',
       'other_monthly_expenses', 'existing_loans', 'current_emi_amount',
       'credit_score', 'bank_balance', 'emergency_fund', 'emi_scenario',
       'requested_amount', 'requested_tenure', 
     'emi_eligibility', 'max_monthly_emi'
]

In [0]:
final_columns_v2=[
    'age', 'gender', 'marital_status', 'education', 'monthly_salary',
       'employment_type', 'years_of_employment', 'company_type', 'house_type',
       'monthly_rent', 'family_size', 'dependents',  'existing_loans', 'credit_score', 'emi_scenario',
       'requested_amount', 'requested_tenure', 'Total_Fixed_expenses',
       'Total_Other_expenses', 'emi_eligibility', 'max_monthly_emi',
       'Savings'
]

In [0]:
final_data_v1=new_data[final_columns_v1]
final_data_v2=new_data[final_columns_v2]

#### Machine learning Training Started

In [0]:
def data_prepare(data, datatype="type1", target_type=None):
    """Select one of the two feature sets and optionally append a target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and, when a target is requested,
        the corresponding target column.
    datatype : {"type1", "type2"}, default "type1"
        ``"type1"`` selects the initial features; ``"type2"`` selects the
        variant that uses ``Total_Fixed_expenses``, ``Total_Other_expenses``
        and ``Savings`` in place of the individual expense/balance columns.
    target_type : {None, "regression", "classification"}, default None
        ``None`` returns features only; ``"regression"`` appends
        ``max_monthly_emi``; ``"classification"`` appends ``emi_eligibility``.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the selected columns (target last).

    Raises
    ------
    ValueError
        If ``datatype`` or ``target_type`` is not one of the accepted values.
        (Previously these cases fell through to an ``UnboundLocalError``.)
    """
    final_columns_v1 = [
    'age', 'gender', 'marital_status', 'education', 'monthly_salary',
       'employment_type', 'years_of_employment', 'company_type', 'house_type',
       'monthly_rent', 'family_size', 'dependents', 'school_fees',
       'college_fees', 'travel_expenses', 'groceries_utilities',
       'other_monthly_expenses', 'existing_loans', 'current_emi_amount',
       'credit_score', 'bank_balance', 'emergency_fund', 'emi_scenario',
       'requested_amount', 'requested_tenure']
    final_columns_v2=[
    'age', 'gender', 'marital_status', 'education', 'monthly_salary',
       'employment_type', 'years_of_employment', 'company_type', 'house_type',
       'monthly_rent', 'family_size', 'dependents',  'existing_loans', 'credit_score', 'emi_scenario',
       'requested_amount', 'requested_tenure', 'Total_Fixed_expenses',
       'Total_Other_expenses','Savings']

    feature_sets = {"type1": final_columns_v1, "type2": final_columns_v2}
    target_columns = {
        None: None,
        "regression": "max_monthly_emi",
        "classification": "emi_eligibility",
    }

    if datatype not in feature_sets:
        raise ValueError(
            f"Please enter correct data type: expected one of {list(feature_sets)}, got {datatype!r}"
        )
    if target_type not in target_columns:
        raise ValueError(
            f"Please enter correct target type: expected None, 'regression' or "
            f"'classification', got {target_type!r}"
        )

    # Copy so that adding the target column never writes into a view of `data`
    final_data = data[feature_sets[datatype]].copy()
    target = target_columns[target_type]
    if target is not None:
        final_data[target] = data[target]
    return final_data

In [0]:
# Scale the two feature sets (targets removed) with RobustScaler, one scaler
# per feature set.
# NOTE(review): both scalers are fit on the full dataset here, while
# train_test_split is applied to v1_scaled / v2_scaled afterwards, so test rows
# contribute to the scaling statistics. Consider splitting first and fitting
# the scaler on the training fold only.
from sklearn.preprocessing import RobustScaler
x_data_v1=final_data_v1.drop(target_cols, axis=1)
x_data_v2=final_data_v2.drop(target_cols, axis=1)
v1_scaler=RobustScaler()
v2_scaler=RobustScaler()
v1_scaled=v1_scaler.fit_transform(x_data_v1)
v2_scaled=v2_scaler.fit_transform(x_data_v2)



In [0]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

#### Training Regressor Model

In [0]:
target_cols[0]

In [0]:
y_data_c = new_data[target_cols[0]]
y_data_r = new_data[target_cols[1]]

In [0]:
x_train, x_test, y_train, y_test = train_test_split(v1_scaled, y_data_r, test_size=0.2, random_state=42)

In [0]:
R1_type_1=RandomForestRegressor(n_estimators=400, n_jobs=-1)

In [0]:
R1_type_1.fit(x_train, y_train)
y_pred = R1_type_1.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [0]:
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")
print(f"RMSE: {mse**0.5}")

In [0]:
x_train, x_test, y_train, y_test = train_test_split(v2_scaled, y_data_r, test_size=0.2, random_state=42)
R1_type_2=RandomForestRegressor(n_estimators=400, n_jobs=-1)

In [0]:
R1_type_2.fit(x_train, y_train)
y_pred = R1_type_2.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [0]:
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")
print(f"RMSE: {mse**0.5}")

In [0]:
data_prepare(new_data,datatype="type2", target_type="regression")

In [0]:
target_cols

In [0]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
classfication=data_prepare(new_data,datatype="type1", target_type="classification")

x_train, x_test,y_train,y_test = train_test_split(classfication.drop(target_cols[0], axis=1), classfication[target_cols[0]], test_size=0.2, random_state=42)
R1_type_2_c=RandomForestClassifier(n_estimators=400, n_jobs=-1)
R1_type_2_c.fit(x_train, y_train)
y_pred = R1_type_2_c.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")

In [0]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
classfication=data_prepare(new_data,datatype="type2", target_type="classification")

x_train, x_test,y_train,y_test = train_test_split(classfication.drop(target_cols[0], axis=1), classfication[target_cols[0]], test_size=0.2, random_state=42)
R1_type_2_c=RandomForestClassifier(n_estimators=400, n_jobs=-1)
R1_type_2_c.fit(x_train, y_train)
y_pred = R1_type_2_c.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")





In [0]:
# Random-forest classifier on the "type2" feature set with `max_monthly_emi`
# added as an extra input feature; reports hold-out accuracy.
# NOTE(review): `max_monthly_emi` is listed in `target_cols` and is used as the
# regression target elsewhere in this notebook. Feeding it to the classifier
# may leak target information — confirm it is genuinely available at inference
# time before relying on this accuracy.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
classfication=data_prepare(new_data,datatype="type2", target_type="classification")
classfication["max_monthly_emi"]=new_data['max_monthly_emi']
# Features = everything except target_cols[0]; label = target_cols[0]
x_train, x_test,y_train,y_test = train_test_split(classfication.drop(target_cols[0], axis=1), classfication[target_cols[0]], test_size=0.2, random_state=42)
R1_type_2_c=RandomForestClassifier(n_estimators=300, n_jobs=-1)
R1_type_2_c.fit(x_train, y_train)
y_pred = R1_type_2_c.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")

In [0]:
import mlflow
mlflow.set_registry_uri("databricks-uc")

In [0]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score, matthews_corrcoef, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt
import numpy as np
def logging(y_test, pred, model_name="model"):
    """Log classification metrics and a confusion-matrix figure through MLflow.

    Parameters
    ----------
    y_test : array-like
        True labels. The distinct values found here define the class labels
        used for the confusion matrix and the per-class metrics.
    pred : array-like
        Predicted labels.
    model_name : str, default "model"
        Used in the figure title, the artifact file name and the final message.

    Returns
    -------
    None
        Everything is emitted via ``mlflow.log_metric`` / ``mlflow.log_figure``
        and one ``print`` call.
    """
    class_labels = np.unique(y_test)

    # Overall accuracy plus the weighted-average scores
    mlflow.log_metric("accuracy", accuracy_score(y_test, pred))
    weighted_scorers = (
        ("f1_score_weighted", f1_score),
        ("recall_weighted", recall_score),
        ("precision_weighted", precision_score),
    )
    for metric_name, scorer in weighted_scorers:
        mlflow.log_metric(metric_name, scorer(y_test, pred, average='weighted'))

    # Imbalance-aware / agreement scores: compute all three, then log them
    agreement_scores = {
        "balanced_accuracy": balanced_accuracy_score(y_test, pred),
        "cohen_kappa": cohen_kappa_score(y_test, pred),
        "mcc": matthews_corrcoef(y_test, pred),
    }
    for metric_name, value in agreement_scores.items():
        mlflow.log_metric(metric_name, value)

    # Confusion matrix rendered to a figure and stored as an artifact
    matrix = confusion_matrix(y_test, pred, labels=class_labels)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_labels)
    fig, ax = plt.subplots(figsize=(6, 6))
    display.plot(cmap='Blues', ax=ax, colorbar=False)
    plt.title(f"Confusion Matrix - {model_name}")
    mlflow.log_figure(fig, f"{model_name}_confusion_matrix.png")
    plt.close(fig)

    # Precision / recall / F1 for each individual class
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, pred, average=None, labels=class_labels
    )
    for label, class_prec, class_rec, class_f1 in zip(class_labels, prec, rec, f1):
        mlflow.log_metric(f"precision_class_{label}", class_prec)
        mlflow.log_metric(f"recall_class_{label}", class_rec)
        mlflow.log_metric(f"f1_class_{label}", class_f1)

    print(f"✅ Logged metrics and confusion matrix for {model_name}")

In [0]:
target_cols

In [0]:

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
def data_split(cdata, rebalance = False):
    """Split a classification frame into stratified train/test sets.

    Parameters
    ----------
    cdata : pandas.DataFrame
        Frame containing the feature columns and an ``emi_eligibility`` label
        column.
    rebalance : bool, default False
        When true, oversample the minority classes of the TRAINING fold with
        SMOTE.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` from an 80/20 stratified split
        with ``random_state=42``.

    Notes
    -----
    The split happens before resampling. Resampling the whole dataset first
    would put synthetic samples in the test set and let test rows influence
    the synthetic training samples, inflating the evaluation scores.
    """
    X = cdata.drop("emi_eligibility", axis=1)
    y = cdata["emi_eligibility"]
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    if rebalance:
        # Oversample only the training fold; the test fold keeps real rows only
        x_train, y_train = SMOTE(random_state=42).fit_resample(x_train, y_train)
    return x_train, x_test, y_train, y_test

In [0]:
classfication=data_prepare(new_data,datatype="type2", target_type="classification")
classfication["max_monthly_emi"]=new_data['max_monthly_emi']
x_train, x_test,y_train,y_test = train_test_split(classfication.drop(target_cols[0], axis=1), classfication[target_cols[0]], test_size=0.2, random_state=42)


expriment_name="/Workspace/Users/sushantkashikar1@gmail.com/mlflow/Inital_Model_test"

model_name="Random_Forest_Classifier"
run_name="Randomforest_classifier_test_run"
mlflow.set_experiment(expriment_name)
with mlflow.start_run(run_name=run_name) as run:
    mlflow.sklearn.autolog()
    mlflow.set_tag("model_name", model_name)
    mlflow.set_tag("run_name", run_name)
    mlflow.set_tag("model_type", "classification")
    # Hyperparameters for this run. The same variables feed both the estimator
    # and the mlflow.log_param calls, so the logged values describe the model
    # that was actually trained.
    estimators=400
    criterion="gini"
    depth=15

    # max_depth was previously the literal 5 while `depth` (15) was what got
    # logged; pass `depth` so the two can no longer disagree.
    rfc=RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=depth, n_jobs=-1, class_weight="balanced")
    rfc.fit(x_train, y_train)
    pred=rfc.predict(x_test)
    acc=accuracy_score(y_test, pred)
    print(f"Classification details: \n {classification_report(y_test, pred)}")
    mlflow.log_param("n_estimators", estimators)
    mlflow.log_param("criterion", criterion)
    mlflow.log_param("depth", depth)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1_score(y_test, pred, average='weighted'))
    mlflow.log_metric("recall", recall_score(y_test, pred, average='weighted'))
    mlflow.log_metric("precision", precision_score(y_test, pred, average='weighted'))
    cm = confusion_matrix(y_test, pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_test))