In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Make sure this is the correct path:
data_path = '/content/drive/My Drive/Data Set/DT-Credit.csv'
df = pd.read_csv(data_path)
print("Dataset loaded. Shape:", df.shape)

Mounted at /content/drive
Dataset loaded. Shape: (400, 11)


In [None]:
display(df.head())

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
0,14.891,3606,283,2,34,11,No,No,Yes,South,333
1,106.025,6645,483,3,82,15,Yes,Yes,Yes,West,903
2,104.593,7075,514,4,71,11,No,No,No,West,580
3,148.924,9504,681,3,36,11,Yes,No,No,West,964
4,55.882,4897,357,2,68,16,No,No,Yes,South,331


In [None]:
print('\nMissing Values per column:')
print(df.isnull().sum())


Missing Values per column:
Income       0
Limit        0
Rating       0
Cards        0
Age          0
Education    0
Own          0
Student      0
Married      0
Region       0
Balance      0
dtype: int64


In [None]:
label_columns = ['Own', 'Student', 'Married', 'Region']
label_encoders = {}
for col in label_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
print('\nPreprocessed, encoded data:')
display(df.head())


Preprocessed, encoded data:


Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
0,14.891,3606,283,2,34,11,0,0,1,1,333
1,106.025,6645,483,3,82,15,1,1,1,2,903
2,104.593,7075,514,4,71,11,0,0,0,2,580
3,148.924,9504,681,3,36,11,1,0,0,2,964
4,55.882,4897,357,2,68,16,0,0,1,1,331


In [None]:
features = df.columns.drop('Balance')
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features].values)

In [None]:
print('\nAfter Scaling:')
display(df.head())


After Scaling:


Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
0,-0.861583,-0.489999,-0.465539,-0.69913,-1.257674,-0.78493,-1.035635,-0.333333,0.795395,-0.010581,333
1,1.727437,0.828261,0.828703,0.031032,1.528451,0.496588,0.965592,3.0,0.795395,1.400189,903
2,1.686756,1.014787,1.029311,0.761194,0.889964,-0.78493,-1.035635,-0.333333,-1.257237,1.400189,580
3,2.946152,2.06844,2.110003,0.031032,-1.141586,-0.78493,0.965592,-0.333333,-1.257237,1.400189,964
4,0.302928,0.070012,0.013331,-0.69913,0.715831,0.816968,-1.035635,-0.333333,0.795395,-0.010581,331


In [None]:
n = len(df)
train_end = int(0.7 * n)
valid_end = int(0.85 * n)

train_df = df.iloc[:train_end].copy()
valid_df = df.iloc[train_end:valid_end].copy()
test_df  = df.iloc[valid_end:].copy()

print(f"\nRows → Train: {train_df.shape[0]}, Valid: {valid_df.shape[0]}, Test: {test_df.shape[0]}")
print("\nShow train, valid, test samples:")
print("Train sample:")
display(train_df.head())
print("Valid sample:")
display(valid_df.head())
print("Test sample:")
display(test_df.head())


Rows → Train: 280, Valid: 60, Test: 60

Show train, valid, test samples:
Train sample:


Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
0,-0.861583,-0.489999,-0.465539,-0.69913,-1.257674,-0.78493,-1.035635,-0.333333,0.795395,-0.010581,333
1,1.727437,0.828261,0.828703,0.031032,1.528451,0.496588,0.965592,3.0,0.795395,1.400189,903
2,1.686756,1.014787,1.029311,0.761194,0.889964,-0.78493,-1.035635,-0.333333,-1.257237,1.400189,580
3,2.946152,2.06844,2.110003,0.031032,-1.141586,-0.78493,0.965592,-0.333333,-1.257237,1.400189,964
4,0.302928,0.070012,0.013331,-0.69913,0.715831,0.816968,-1.035635,-0.333333,0.795395,-0.010581,331


Valid sample:


Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
280,0.232445,0.253068,0.142755,0.031032,-1.19963,-0.46455,0.965592,-0.333333,-1.257237,-1.421351,541
281,-0.257865,-1.25085,-1.11266,0.031032,-1.315718,-0.144171,0.965592,-0.333333,-1.257237,-1.421351,0
282,0.520313,1.459412,1.462882,-0.69913,-0.328966,1.137347,0.965592,-0.333333,0.795395,-0.010581,1298
283,0.133753,0.72025,0.841646,0.031032,1.122141,1.137347,0.965592,-0.333333,0.795395,-0.010581,890
284,-0.866697,-1.166263,-1.216199,-0.69913,0.657787,-2.386828,-1.035635,-0.333333,0.795395,-0.010581,0


Test sample:


Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
340,-0.495022,-0.402809,-0.349057,0.761194,-1.19963,-1.746069,0.965592,-0.333333,0.795395,-1.421351,320
341,-0.908657,-0.352924,-0.361999,-0.69913,1.238229,1.137347,0.965592,-0.333333,0.795395,-0.010581,426
342,-0.869992,-0.77239,-0.614377,1.491355,-1.083541,-1.425689,-1.035635,-0.333333,0.795395,-1.421351,204
343,-0.979651,-0.429269,-0.484952,-0.69913,-0.677231,1.137347,0.965592,-0.333333,0.795395,-0.010581,410
344,0.085202,0.201014,0.298064,2.951679,-0.967453,-1.105309,-1.035635,-0.333333,0.795395,1.400189,633


In [None]:
X_train, y_train = train_df.drop('Balance', axis=1), train_df['Balance']
X_valid, y_valid = valid_df.drop('Balance', axis=1), valid_df['Balance']
X_test,  y_test  = test_df.drop('Balance', axis=1),  test_df['Balance']

ZeroR Classifier (Baseline)

In [None]:
from collections import Counter

def zeror_predict(y):
    # Predict the most common value in y_train
    majority = Counter(y_train).most_common(1)[0][0]
    return np.full_like(y, fill_value=majority)

# Convert balance to classification (e.g., > median = 1, else 0)
median_balance = y_train.median()
y_train_c = (y_train > median_balance).astype(int)
y_valid_c = (y_valid > median_balance).astype(int)
y_test_c  = (y_test  > median_balance).astype(int)

majority_class = y_train_c.mode()[0]
print(f"\n[ZeroR] Majority class in train: {majority_class}")

# Predict & evaluate
y_pred_valid = np.full_like(y_valid_c, fill_value=majority_class)
y_pred_test  = np.full_like(y_test_c, fill_value=majority_class)

print("\n== ZeroR Classifier (Validation) ==")
print("Accuracy:", accuracy_score(y_valid_c, y_pred_valid))
print("\n== ZeroR Classifier (Test) ==")
print("Accuracy:", accuracy_score(y_test_c, y_pred_test))


[ZeroR] Majority class in train: 0

== ZeroR Classifier (Validation) ==
Accuracy: 0.5

== ZeroR Classifier (Test) ==
Accuracy: 0.43333333333333335


In [None]:
#OneR Classifier Implementation
def oner_classifier(X, y):
    best_feature, best_rules, min_error = None, None, np.inf
    for feature in X.columns:
        rules = {}
        total_error = 0
        values = X[feature]
        for val in np.unique(values):
            mask = (values == val)
            target = y[mask]
            if len(target) == 0: continue
            common = target.mode()[0]
            rules[val] = common
            error = np.sum(target != common)
            total_error += error
        if total_error < min_error:
            min_error = total_error
            best_feature = feature
            best_rules = rules
    return best_feature, best_rules

# Train
best_feature, best_rules = oner_classifier(X_train, y_train_c)
print(f"\n[OneR] Best feature: {best_feature}")
print(f"Rules (showing first 10): {dict(list(best_rules.items())[:10])}")

# Predict function
def oner_predict(X, best_feature, best_rules):
    vals = X[best_feature]
    return vals.map(best_rules).fillna(0).astype(int)

# Validate
y_pred_valid = oner_predict(X_valid, best_feature, best_rules)
print("\n== OneR Classifier (Validation) ==")
print("Accuracy:", accuracy_score(y_valid_c, y_pred_valid))
print(classification_report(y_valid_c, y_pred_valid))

y_pred_test = oner_predict(X_test, best_feature, best_rules)
print("\n== OneR Classifier (Test) ==")
print("Accuracy:", accuracy_score(y_test_c, y_pred_test))
print(classification_report(y_test_c, y_pred_test))


[OneR] Best feature: Income
Rules (showing first 10): {np.float64(-0.9904743340269222): np.int64(0), np.float64(-0.9902186535906826): np.int64(0), np.float64(-0.9890822960962846): np.int64(1), np.float64(-0.9862414023602893): np.int64(0), np.float64(-0.9838266426846934): np.int64(1), np.float64(-0.9827186941276552): np.int64(0), np.float64(-0.9824914226287758): np.int64(0), np.float64(-0.9799062093290202): np.int64(0), np.float64(-0.9794516663312608): np.int64(0), np.float64(-0.9780028105259033): np.int64(1)}

== OneR Classifier (Validation) ==
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.50      1.00      0.67        30
           1       0.00      0.00      0.00        30

    accuracy                           0.50        60
   macro avg       0.25      0.50      0.33        60
weighted avg       0.25      0.50      0.33        60


== OneR Classifier (Test) ==
Accuracy: 0.4166666666666667
              precision    recall  f1-score   su

In [None]:
#K-Nearest-Neighbors Classifier
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train_c)
y_pred_valid = knn.predict(X_valid)
y_pred_test  = knn.predict(X_test)
print(f"\n== KNN (k={k}) Classifier ==")
print("Validation Accuracy:", accuracy_score(y_valid_c, y_pred_valid))
print("Test Accuracy:", accuracy_score(y_test_c, y_pred_test))
print(classification_report(y_test_c, y_pred_test))


== KNN (k=5) Classifier ==
Validation Accuracy: 0.7666666666666667
Test Accuracy: 0.8166666666666667
              precision    recall  f1-score   support

           0       0.76      0.85      0.80        26
           1       0.87      0.79      0.83        34

    accuracy                           0.82        60
   macro avg       0.81      0.82      0.82        60
weighted avg       0.82      0.82      0.82        60



In [None]:
# Naive Bayes Classifier
# Use Gaussian (continuous)
nb = GaussianNB()
nb.fit(X_train, y_train_c)
y_pred_valid = nb.predict(X_valid)
y_pred_test  = nb.predict(X_test)
print("\n== Naive Bayes Classifier ==")
print("Validation Accuracy:", accuracy_score(y_valid_c, y_pred_valid))
print("Test Accuracy:", accuracy_score(y_test_c, y_pred_test))
print(classification_report(y_test_c, y_pred_test))


== Naive Bayes Classifier ==
Validation Accuracy: 0.8333333333333334
Test Accuracy: 0.8666666666666667
              precision    recall  f1-score   support

           0       0.80      0.92      0.86        26
           1       0.93      0.82      0.88        34

    accuracy                           0.87        60
   macro avg       0.87      0.87      0.87        60
weighted avg       0.88      0.87      0.87        60



In [None]:
# Support Vector Machine (SVM) - Classification
svc = SVC(kernel='rbf', C=1, gamma='scale')
svc.fit(X_train, y_train_c)
y_pred_valid = svc.predict(X_valid)
y_pred_test  = svc.predict(X_test)
print("\n== SVM (RBF Kernel) ==")
print("Validation Accuracy:", accuracy_score(y_valid_c, y_pred_valid))
print("Test Accuracy:", accuracy_score(y_test_c, y_pred_test))
print(classification_report(y_test_c, y_pred_test))


== SVM (RBF Kernel) ==
Validation Accuracy: 0.9
Test Accuracy: 0.8666666666666667
              precision    recall  f1-score   support

           0       0.88      0.81      0.84        26
           1       0.86      0.91      0.89        34

    accuracy                           0.87        60
   macro avg       0.87      0.86      0.86        60
weighted avg       0.87      0.87      0.87        60



In [None]:
# Support Vector Regression (SVR)
svr = SVR(kernel='rbf', C=1.0, gamma='scale')
svr.fit(X_train, y_train)
y_pred_valid = svr.predict(X_valid)
y_pred_test = svr.predict(X_test)
mse_valid = mean_squared_error(y_valid, y_pred_valid)
mse_test = mean_squared_error(y_test, y_pred_test)
print("\n== SVR (regression on Balance) ==")
print("Validation MSE:", mse_valid)
print("Testing MSE:", mse_test)
print("Predictions (first 10 on test):", y_pred_test[:10])


== SVR (regression on Balance) ==
Validation MSE: 218753.60379317115
Testing MSE: 182201.1802035854
Predictions (first 10 on test): [441.84181373 428.6530066  437.91421052 428.61313166 445.12228506
 421.74217553 440.59517122 451.73117889 414.82510846 440.88510498]
