In [1]:
# Initial imports
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Read the loan records from the CSV file in the working directory
TRAIN_CSV = "train.csv"
df_loans = pd.read_csv(TRAIN_CSV)

In [3]:
# Feature matrix: every column of the loaded frame except the target.
# drop() returns a new frame, so df_loans itself is left untouched.
X = df_loans.drop(columns=["Credit_History"])

In [4]:
# Target: the Credit_History column as a 2-D (n_rows, 1) array
y = df_loans[["Credit_History"]].to_numpy()

In [5]:
# Class balance of the target column. value_counts() leaves NaN out by default,
# so rows with a missing Credit_History are not included in the counts shown.
df_loans["Credit_History"].value_counts()

Credit_History
1.0    475
0.0     89
Name: count, dtype: int64

In [6]:
# Impute missing target values with the most frequent class.
# The mode is computed over the entire target vector (this cell runs before
# any train/test split exists), and y keeps its (n_rows, 1) shape.
# NOTE(review): filling in labels fabricates targets for those rows; consider
# dropping rows with a missing Credit_History instead.
target = pd.Series(y.ravel())
y_mode = target.mode()[0]
y = target.fillna(y_mode).to_numpy().reshape(-1, 1)


In [7]:
# Oversample the minority class (Credit_History = 0) using RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)


In [8]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=78)


In [9]:
# Column groups used to route features through the preprocessing step.

# Columns treated as categories (one-hot encoded downstream)
categorical_features = [
    "Gender", "Married", "Dependents",
    "Education", "Self_Employed", "Property_Area",
]

# Columns treated as continuous values (scaled downstream)
numeric_features = [
    "ApplicantIncome", "CoapplicantIncome",
    "LoanAmount", "Loan_Amount_Term",
]

In [10]:
# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones. Categories not seen during fit are ignored at transform
# time instead of raising. Columns listed in neither group are dropped
# (ColumnTransformer's default remainder behaviour).
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [11]:
# Learn the scaling statistics and category levels from the training split only
preprocessor.fit(X_train)

# Apply the fitted preprocessor to both splits
X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


In [12]:
# Create the decision tree classifier instance.
# random_state is fixed so the fitted tree (and every metric computed from it)
# is the same on each Restart & Run All; without it, scikit-learn permutes the
# candidate features randomly at each split and results can vary between runs.
model = DecisionTreeClassifier(random_state=42)

# Fit the model with the preprocessed training data and target labels
model.fit(X_train_preprocessed, y_train)

# Use the trained model to make predictions on the test data
y_pred = model.predict(X_test_preprocessed)

# y_test is a column array; flatten it to 1-D so it lines up with y_pred
y_test_flat = y_test.flatten()


In [13]:
# Show the predicted credit-history labels as a flat 1-D array under a heading
print("Predicted Credit History:", y_pred.flatten(), sep="\n")


Predicted Credit History:
[0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0.
 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1.
 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 1.
 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1.
 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 0.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1.]


In [14]:
# Confusion matrix: rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_true=y_test_flat, y_pred=y_pred)

In [15]:
# Display the confusion matrix under a heading line
print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[119   2]
 [ 34 108]]


In [16]:
# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test_flat, y_pred=y_pred)

# Report the score
print(f"Accuracy Score: {accuracy!s}")

Accuracy Score: 0.8631178707224335
