<a href="https://colab.research.google.com/github/varshagayke/ML_algorithms/blob/main/Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# --- Data preparation ---
# Read the CSV, discard every row containing a missing value, and remove
# the "phone" column, which this analysis treats as unnecessary.
data = (
    pd.read_csv("/content/credit (1).csv")
    .dropna()
    .drop(columns=["phone"])
)

# Categorical columns that get replaced by integer codes.
cat_cols = [
    "checking_balance",
    "credit_history",
    "purpose",
    "savings_balance",
    "employment_duration",
    "other_credit",
    "housing",
    "job",
]

# A separate encoder is fitted for each column, so the integer codes of one
# column carry no relation to those of another.
for col in cat_cols:
    le = LabelEncoder().fit(data[col])
    data[col] = le.transform(data[col])

# The label to predict; every remaining column is used as a predictor.
target = "default"
predictors = data.columns[data.columns != target].tolist()

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(data, random_state=42, test_size=0.3)


In [13]:
# -----------------------
# Baseline decision tree
# -----------------------
# No depth or leaf-size limit is passed here; splits are chosen with the
# entropy criterion and the seed is fixed for repeatability.
tree_params = {"criterion": "entropy", "random_state": 42}
model = DecisionTreeClassifier(**tree_params)
model.fit(X=train[predictors], y=train[target])


In [14]:
# Score the fitted tree on the held-out test rows.
pred_test = model.predict(test[predictors])
test_accuracy = accuracy_score(test[target], pred_test)
print("Test Accuracy:", test_accuracy)
# Rows of the matrix are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=test[target], y_pred=pred_test))


Test Accuracy: 0.7
[[167  42]
 [ 48  43]]


In [15]:
# Score the fitted tree on the same rows it was trained on, for comparison
# with the test-set figures.
pred_train = model.predict(train[predictors])
train_accuracy = accuracy_score(train[target], pred_train)
print("Train Accuracy:", train_accuracy)
# Rows of the matrix are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=train[target], y_pred=pred_train))


Train Accuracy: 1.0
[[491   0]
 [  0 209]]


In [16]:
# -----------------------
# GridSearchCV
# -----------------------
# Exhaustive search over every combination of the values listed below.
param_grid = {
    'min_samples_leaf': [1, 5, 10, 20],
    'max_depth': list(range(2, 11, 2)),  # 2, 4, 6, 8, 10
    'max_features': ['sqrt'],
}

# Same criterion and seed as the baseline tree; only the grid values vary.
grid_base_tree = DecisionTreeClassifier(criterion="entropy", random_state=42)
grid_search = GridSearchCV(
    grid_base_tree,
    param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=5,
)
grid_search.fit(train[predictors], train[target])

# Evaluate the best estimator found by the search on the held-out test rows.
best_dt = grid_search.best_estimator_
grid_test_pred = best_dt.predict(test[predictors])

print("Best Grid Params:", grid_search.best_params_)
print("Grid Test Accuracy:", accuracy_score(test[target], grid_test_pred))

Best Grid Params: {'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 20}
Grid Test Accuracy: 0.7366666666666667


In [10]:
# -----------------------
# RandomizedSearchCV
# -----------------------
# NOTE(review): the saved execution count of this cell is lower than that of
# the cells above it, so the recorded output may come from an earlier kernel
# state. Re-run the notebook top to bottom before relying on it.

# Candidate values: leaf sizes 1..49 and depths 2..19.
param_dist = {
    'min_samples_leaf': list(range(1, 50)),
    'max_depth': list(range(2, 20)),
    'max_features': ['sqrt']
}

# Sample 50 parameter combinations instead of trying them all.
# `scoring` is spelled out so this search states the same selection metric as
# the grid search; for a classifier this matches the default scorer, so the
# selected parameters are unchanged.
random_search = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(criterion="entropy", random_state=42),
    param_distributions=param_dist,
    n_iter=50,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    random_state=42
)

random_search.fit(train[predictors], train[target])

# Evaluate the best estimator found by the search on the held-out test rows.
best_random_dt = random_search.best_estimator_

print("Best Random Params:", random_search.best_params_)
print("Random Test Accuracy:",
      accuracy_score(test[target], best_random_dt.predict(test[predictors])))

Best Random Params: {'min_samples_leaf': 7, 'max_features': 'sqrt', 'max_depth': 10}
Random Test Accuracy: 0.71
