In [126]:
import numpy as np
import pandas as pd

In [127]:
from ucimlrepo import fetch_ucirepo

car_evaluation = fetch_ucirepo(id=19)  

X = car_evaluation.data.features 
y = car_evaluation.data.targets 

In [128]:
X_encoded = pd.get_dummies(X) # One hot encoding
y_encoded = pd.get_dummies(y) # One hot encoding

# print(X_encoded)
print(y_encoded)

# print(type(X))
# print(type(y))

# print(X_encoded.shape)
# print(y_encoded.shape)

      class_acc  class_good  class_unacc  class_vgood
0         False       False         True        False
1         False       False         True        False
2         False       False         True        False
3         False       False         True        False
4         False       False         True        False
...         ...         ...          ...          ...
1723      False        True        False        False
1724      False       False        False         True
1725      False       False         True        False
1726      False        True        False        False
1727      False       False        False         True

[1728 rows x 4 columns]


In [129]:
from sklearn.model_selection import train_test_split

# data split, 70% training and 30% temp (temp = validation + test)
X_train_encoded, X_temp_encoded, y_train_encoded, y_temp_encoded = train_test_split(X_encoded, y_encoded, test_size=0.3, random_state=42)

# 30% temp data into 15% validation and 15% test
X_val_encoded, X_test_encoded, y_val_encoded, y_test_encoded = train_test_split(X_temp_encoded, y_temp_encoded, test_size=0.5, random_state=42)

In [130]:
def calculate_metrics(y_true, y_pred):
    # Check if y_true is a pandas DataFrame and convert it to a NumPy array for compatibility
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.to_numpy()  # Convert y_true to a NumPy array

    # Check if y_pred is a pandas DataFrame and convert it to a NumPy array for compatibility
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.to_numpy()  # Convert y_pred to a NumPy array

    # Check if y_true is already flattened (1D)
    if y_true.ndim > 1:
        y_true = y_true.flatten()  # Flatten if it has more than 1 dimension

    # Check if y_pred is already flattened (1D)
    if y_pred.ndim > 1:
        y_pred = y_pred.flatten()  # Flatten if it has more than 1 dimension

    # Find unique class names in y_true and determine the number of unique classes
    class_names = np.unique(y_true)  # Get unique class names from y_true
    unique_classes = class_names.size  # Count the number of unique classes

    # Initialize a confusion matrix with zeros, sized based on the number of unique classes
    confusion_matrix = np.zeros((unique_classes, unique_classes), dtype=int)

    # Map class names to indices for easy lookup
    class_name_to_index = {class_name: idx for idx, class_name in enumerate(class_names)}

    # Count occurrences of actual vs predicted labels
    for actual, predicted in zip(y_true, y_pred):
        # Ensure actual and predicted are scalar values, not arrays
        actual = actual.item() if isinstance(actual, np.ndarray) else actual
        predicted = predicted.item() if isinstance(predicted, np.ndarray) else predicted
        
        # Find the index for the actual and predicted class
        actual_index = class_name_to_index[actual]
        predicted_index = class_name_to_index[predicted]

        # Increment the appropriate cell in the confusion matrix
        confusion_matrix[actual_index, predicted_index] += 1

    # Print the confusion matrix row by row
    print("\nConfusion matrix:")
    for row in confusion_matrix:
        print(" ".join(map(str, row)))  # Print each row of the confusion matrix

    # --- --- --- --- --- ---
    
    # Accuracy Calculation
    
    # Sum of the diagonal elements (correct predictions)
    correct_predictions = np.trace(confusion_matrix)  # np.trace() gives the sum of diagonal elements

    # Total number of predictions (sum of all elements in the matrix)
    total_predictions = np.sum(confusion_matrix)

    # Calculate accuracy
    accuracy = correct_predictions / total_predictions
    print(f"\nAccuracy: {accuracy}")

    # --- --- --- --- --- ---
        
    # Precision Calculation

    def calculate_precision(confusion_matrix, class_names):
        precision = {}
        
        # Iterate over each class to calculate its precision
        for i, class_name in enumerate(class_names):
            # True Positive (TP) is the value in the diagonal for that class
            true_positive = confusion_matrix[i, i]
            
            # False Positive (FP) is the sum of the column (excluding the diagonal)
            false_positive = np.sum(confusion_matrix[:, i]) - true_positive
            
            # Precision for the current class
            precision[class_name] = true_positive / (true_positive + false_positive) if (true_positive + false_positive) != 0 else 0        
        
        return precision
    
    precision = calculate_precision(confusion_matrix, class_names)
    print("\nPrecision for each class:")
    for class_name, precision_value in precision.items():
        print(f"{class_name}: {precision_value:}")

    total_precision = sum(precision.values())
    print(f"\nMacro precision: {total_precision / unique_classes}")

    # --- --- --- --- --- ---
    
    # Recall Calculation
    
    def calculate_recall(confusion_matrix, class_names):
        recall = {}
        
        # Iterate over each class to calculate its recall
        for i, class_name in enumerate(class_names):
            # True Positive (TP) is the value in the diagonal for that class
            true_positive = confusion_matrix[i, i]
            
            # False Negative (FN) is the sum of the row (excluding the diagonal)
            false_negative = np.sum(confusion_matrix[i, :]) - true_positive
            
            # Recall for the current class
            recall[class_name] = true_positive / (true_positive + false_negative) if (true_positive + false_negative) != 0 else 0        
        
        return recall
    
    recall = calculate_recall(confusion_matrix, class_names)
    print("\nRecall for each class:")
    for class_name, recall_value in recall.items():
        print(f"{class_name}: {recall_value:.4f}")

    total_recall = sum(recall.values())
    print(f"\nMacro recall: {total_recall / unique_classes}")

    # --- --- --- --- --- ---

    # F1 Score Calculation
    
    def calculate_f1_score(precision, recall):
        f1_scores = {}
    
        # Calculate F1 score for each class
        for class_name in precision.keys():
            p = precision[class_name]
            r = recall[class_name]
            
            # Calculate F1 score for the class, handling cases where p + r = 0
            f1_scores[class_name] = (2 * p * r) / (p + r) if (p + r) != 0 else 0
        
        return f1_scores
    
    f1_scores = calculate_f1_score(precision, recall)
    print("\nF1 Score for each class:")
    for class_name, f1_value in f1_scores.items():
        print(f"{class_name}: {f1_value:}")
        
    # Macro F1 Score Calculation
    total_f1 = sum(f1_scores.values())  # Sum of F1 scores for each class
    macro_f1 = total_f1 / len(f1_scores)  # Average F1 score across all classes
    
    print(f"\nMacro F1 score: {macro_f1:}")

In [131]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_encoded, y_train_encoded)

y_train_pred_knn = knn.predict(X_train_encoded)
y_val_pred_knn = knn.predict(X_val_encoded)
y_test_pred_knn = knn.predict(X_test_encoded)

print("------------------------------")
print("Training Set Metrics:")
calculate_metrics(y_train_encoded, y_train_pred_knn)

print("------------------------------")
print("Validation Set Metrics:")
calculate_metrics(y_val_encoded, y_val_pred_knn)

print("------------------------------")
print("Test Set Metrics:")
calculate_metrics(y_test_encoded, y_test_pred_knn)

------------------------------
Training Set Metrics:

Confusion matrix:
3575 52
106 1103

Accuracy: 0.967328370554177

Precision for each class:
False: 0.9712034773159468
True: 0.9549783549783549

Macro precision: 0.9630909161471508

Recall for each class:
False: 0.9857
True: 0.9123

Macro recall: 0.948993658671078

F1 Score for each class:
False: 0.9783798576902026
True: 0.9331641285956006

Macro F1 score: 0.9557719931429016
------------------------------
Validation Set Metrics:

Confusion matrix:
759 18
41 218

Accuracy: 0.943050193050193

Precision for each class:
False: 0.94875
True: 0.923728813559322

Macro precision: 0.936239406779661

Recall for each class:
False: 0.9768
True: 0.8417

Macro recall: 0.9092664092664093

F1 Score for each class:
False: 0.9625871908687381
True: 0.8808080808080808

Macro F1 score: 0.9216976358384095
------------------------------
Test Set Metrics:

Confusion matrix:
758 22
41 219

Accuracy: 0.9394230769230769

Precision for each class:
False: 0.94868

In [132]:
y_train_numeric = y_train_encoded.idxmax(axis=1).map({
    'class_acc': 0,
    'class_good': 1,
    'class_unacc': 2,
    'class_vgood': 3
})

y_val_numeric = y_val_encoded.idxmax(axis=1).map({
    'class_acc': 0,
    'class_good': 1,
    'class_unacc': 2,
    'class_vgood': 3
})

y_test_numeric = y_test_encoded.idxmax(axis=1).map({
    'class_acc': 0,
    'class_good': 1,
    'class_unacc': 2,
    'class_vgood': 3
})

In [133]:
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_encoded, y_train_numeric)

y_train_pred_nb = nb_classifier.predict(X_train_encoded)
y_val_pred_nb = nb_classifier.predict(X_val_encoded)
y_test_pred_nb = nb_classifier.predict(X_test_encoded)

print("------------------------------")
print("Training Set Metrics:")
calculate_metrics(y_train_numeric, y_train_pred_nb)

print("------------------------------")
print("Validation Set Metrics:")
calculate_metrics(y_val_numeric, y_val_pred_nb)

print("------------------------------")
print("Test Set Metrics:")
calculate_metrics(y_test_numeric, y_test_pred_nb)

------------------------------
Training Set Metrics:

Confusion matrix:
192 6 68 0
35 15 0 0
31 1 820 0
17 1 0 23

Accuracy: 0.8684863523573201

Precision for each class:
0: 0.6981818181818182
1: 0.6521739130434783
2: 0.9234234234234234
3: 1.0

Macro precision: 0.81844478866218

Recall for each class:
0: 0.7218
1: 0.3000
2: 0.9624
3: 0.5610

Macro recall: 0.636305358897071

F1 Score for each class:
0: 0.7097966728280962
1: 0.410958904109589
2: 0.942528735632184
3: 0.71875

Macro F1 score: 0.6955085781424672
------------------------------
Validation Set Metrics:

Confusion matrix:
38 2 17 0
8 2 0 0
4 0 174 0
9 0 0 5

Accuracy: 0.8455598455598455

Precision for each class:
0: 0.6440677966101694
1: 0.5
2: 0.9109947643979057
3: 1.0

Macro precision: 0.7637656402520188

Recall for each class:
0: 0.6667
1: 0.2000
2: 0.9775
3: 0.3571

Macro recall: 0.5503344034242911

F1 Score for each class:
0: 0.6551724137931034
1: 0.28571428571428575
2: 0.943089430894309
3: 0.5263157894736842

Macro F1 sco

In [134]:
from sklearn.svm import SVC

svm_classifier = SVC(kernel='linear', random_state=42)  # 'linear', 'rbf'
svm_classifier.fit(X_train_encoded, y_train_numeric)

y_train_pred_svm = svm_classifier.predict(X_train_encoded)
y_val_pred_svm = svm_classifier.predict(X_val_encoded)
y_test_pred_svm = svm_classifier.predict(X_test_encoded)

print("------------------------------")
print("Training Set Metrics:")
calculate_metrics(y_train_numeric, y_train_pred_svm)

print("------------------------------")
print("Validation Set Metrics:")
calculate_metrics(y_val_numeric, y_val_pred_svm)

print("------------------------------")
print("Test Set Metrics:")
calculate_metrics(y_test_numeric, y_test_pred_svm)

------------------------------
Training Set Metrics:

Confusion matrix:
237 7 21 1
0 49 0 1
26 3 823 0
0 0 0 41

Accuracy: 0.9511993382961125

Precision for each class:
0: 0.9011406844106464
1: 0.8305084745762712
2: 0.9751184834123223
3: 0.9534883720930233

Macro precision: 0.9150640036230657

Recall for each class:
0: 0.8910
1: 0.9800
2: 0.9660
3: 1.0000

Macro recall: 0.9592349712308941

F1 Score for each class:
0: 0.8960302457466919
1: 0.8990825688073395
2: 0.9705188679245284
3: 0.9761904761904763

Macro F1 score: 0.935455539667259
------------------------------
Validation Set Metrics:

Confusion matrix:
45 3 8 1
0 8 0 2
5 0 173 0
0 0 0 14

Accuracy: 0.9266409266409267

Precision for each class:
0: 0.9
1: 0.7272727272727273
2: 0.9558011049723757
3: 0.8235294117647058

Macro precision: 0.8516508110024523

Recall for each class:
0: 0.7895
1: 0.8000
2: 0.9719
3: 1.0000

Macro recall: 0.8903459491425192

F1 Score for each class:
0: 0.8411214953271027
1: 0.761904761904762
2: 0.9637883008

In [135]:
from sklearn.dummy import DummyClassifier
from sklearn import datasets

zero_r_classifier = DummyClassifier(strategy="most_frequent")
zero_r_classifier.fit(X_train_encoded, y_train_numeric)

y_train_pred_zero_r = zero_r_classifier.predict(X_train_encoded)
y_val_pred_zero_r = zero_r_classifier.predict(X_val_encoded)
y_test_pred_zero_r = zero_r_classifier.predict(X_test_encoded)

print("------------------------------")
print("ZeroR Classifier - Training Set Metrics:")
calculate_metrics(y_train_numeric, y_train_pred_zero_r)

print("------------------------------")
print("ZeroR Classifier - Validation Set Metrics:")
calculate_metrics(y_val_numeric, y_val_pred_zero_r)

print("------------------------------")
print("ZeroR Classifier - Test Set Metrics:")
calculate_metrics(y_test_numeric, y_test_pred_zero_r)

------------------------------
ZeroR Classifier - Training Set Metrics:

Confusion matrix:
0 0 266 0
0 0 50 0
0 0 852 0
0 0 41 0

Accuracy: 0.7047146401985112

Precision for each class:
0: 0
1: 0
2: 0.7047146401985112
3: 0

Macro precision: 0.1761786600496278

Recall for each class:
0: 0.0000
1: 0.0000
2: 1.0000
3: 0.0000

Macro recall: 0.25

F1 Score for each class:
0: 0
1: 0
2: 0.826783114992722
3: 0

Macro F1 score: 0.2066957787481805
------------------------------
ZeroR Classifier - Validation Set Metrics:

Confusion matrix:
0 0 57 0
0 0 10 0
0 0 178 0
0 0 14 0

Accuracy: 0.6872586872586872

Precision for each class:
0: 0
1: 0
2: 0.6872586872586872
3: 0

Macro precision: 0.1718146718146718

Recall for each class:
0: 0.0000
1: 0.0000
2: 1.0000
3: 0.0000

Macro recall: 0.25

F1 Score for each class:
0: 0
1: 0
2: 0.8146453089244851
3: 0

Macro F1 score: 0.20366132723112126
------------------------------
ZeroR Classifier - Test Set Metrics:

Confusion matrix:
0 0 61 0
0 0 9 0
0 0 180 0

In [136]:
from sklearn.base import BaseEstimator

class OneRClassifier(BaseEstimator):
    def __init__(self):
        self.rules = {}

    def fit(self, X, y):
        X = X.to_numpy() if isinstance(X, pd.DataFrame) else X
        self.rules = {}
        for feature_index in range(X.shape[1]):
            feature_values = X[:, feature_index]
            rule_accuracy = {}
            for value in np.unique(feature_values):
                predicted_class = y[feature_values == value].mode()[0]
                rule_accuracy[value] = (y[feature_values == value] == predicted_class).mean()
            best_value = max(rule_accuracy, key=rule_accuracy.get)
            self.rules[feature_index] = best_value
        return self
        
    def predict(self, X):
        X = X.to_numpy() if isinstance(X, pd.DataFrame) else X
        predictions = []
        for row in X:
            row_predictions = [self.rules.get(i, None) for i in range(len(row))]
            most_frequent_class = max(set(row_predictions), key=row_predictions.count)  # majority vote for the row
            predictions.append(most_frequent_class)
        return np.array(predictions)

one_r_classifier = OneRClassifier()
one_r_classifier.fit(X_train_encoded, y_train_numeric)

y_train_pred_one_r = one_r_classifier.predict(X_train_encoded)
y_val_pred_one_r = one_r_classifier.predict(X_val_encoded)
y_test_pred_one_r = one_r_classifier.predict(X_test_encoded)

print("------------------------------")
print("OneR Classifier - Training Set Metrics:")
calculate_metrics(y_train_numeric, y_train_pred_one_r)

print("------------------------------")
print("OneR Classifier - Validation Set Metrics:")
calculate_metrics(y_val_numeric, y_val_pred_one_r)

print("------------------------------")
print("OneR Classifier - Test Set Metrics:")
calculate_metrics(y_test_numeric, y_test_pred_one_r)

------------------------------
OneR Classifier - Training Set Metrics:

Confusion matrix:
266 0 0 0
50 0 0 0
852 0 0 0
41 0 0 0

Accuracy: 0.22001654259718775

Precision for each class:
0: 0.22001654259718775
1: 0
2: 0
3: 0

Macro precision: 0.05500413564929694

Recall for each class:
0: 1.0000
1: 0.0000
2: 0.0000
3: 0.0000

Macro recall: 0.25

F1 Score for each class:
0: 0.3606779661016949
1: 0
2: 0
3: 0

Macro F1 score: 0.09016949152542372
------------------------------
OneR Classifier - Validation Set Metrics:

Confusion matrix:
57 0 0 0
10 0 0 0
178 0 0 0
14 0 0 0

Accuracy: 0.22007722007722008

Precision for each class:
0: 0.22007722007722008
1: 0
2: 0
3: 0

Macro precision: 0.05501930501930502

Recall for each class:
0: 1.0000
1: 0.0000
2: 0.0000
3: 0.0000

Macro recall: 0.25

F1 Score for each class:
0: 0.36075949367088606
1: 0
2: 0
3: 0

Macro F1 score: 0.09018987341772151
------------------------------
OneR Classifier - Test Set Metrics:

Confusion matrix:
61 0 0 0
9 0 0 0
180