In [None]:
#EX 1
#a)
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Seed shared by every randomized step so that results are reproducible.
RANDOM_STATE = 0
data = pd.read_csv('heart-disease.csv', delimiter=',')

# Features are every column except 'target'; labels are flattened to 1-D.
X, y = data.drop('target', axis=1), np.ravel(data['target'])

# Define models
knn = KNeighborsClassifier(n_neighbors=5)
nb = GaussianNB()

# Stratified 5-Fold Cross-Validation with Shuffling
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Evaluating models: one accuracy value per fold for each classifier.
knn_accuracies = cross_val_score(knn, X, y, cv=skf, scoring='accuracy')
nb_accuracies = cross_val_score(nb, X, y, cv=skf, scoring='accuracy')

# Boxplot of the per-fold accuracies, one box per classifier.
boxprops = dict(facecolor='lightblue')
medianprops = dict(linewidth=2, color='darkblue')

plt.boxplot([knn_accuracies, nb_accuracies],
            tick_labels=['kNN', 'Naive Bayes'],
            patch_artist=True,  # This allows fill color
            boxprops=boxprops,
            medianprops=medianprops)
plt.ylabel('Accuracy')
# Fixed y-range (lower 0.5, upper 0.9); widen it if any fold accuracy falls
# outside this interval, otherwise part of the box is clipped.
plt.ylim(0.5, 0.9)

plt.title('Comparison of Classifiers')
plt.show()



## Explanation:

Analyzing both boxplots, we can observe that the Naive Bayes classifier shows greater stability compared to kNN. This stability can be attributed to the assumptions of Naive Bayes: each feature is assumed to follow a Gaussian distribution within each class, and all features are assumed to be conditionally independent given the class. Additionally, it is a probabilistic, parametric model that only has to estimate a mean and a variance per feature and class, which makes it less sensitive to variations in the dataset. As a result, it tends to produce consistent performance across different folds in cross-validation, contributing to its stability.

On the other hand, the kNN classifier is a non-parametric method, meaning it relies only on the instances it observes. Thus, its performance is highly dependent on the composition of the training data, which can vary for each fold, since it calculates distances between data points to make predictions. Because of this, kNN is more sensitive to small changes in the dataset, leading to greater variability in performance between different folds.

In [None]:
#b)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Min-Max scaled copy of the full feature matrix. It is kept available for
# inspection, but it is NOT used for cross-validation below: a scaler fitted
# on all rows has already seen the rows of every held-out fold (data leakage).
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Evaluating models
# The scaler is placed inside a pipeline so that cross_val_score refits it on
# the training part of each fold and only applies it to the held-out part.
knn_scaled_acc = cross_val_score(make_pipeline(MinMaxScaler(), knn), X, y,
                                 cv=skf, scoring='accuracy')
nb_scaled_acc = cross_val_score(make_pipeline(MinMaxScaler(), nb), X, y,
                                cv=skf, scoring='accuracy')

print('kNN Scaled Accuracy:', knn_scaled_acc)
print('Naive Bayes Scaled Accuracy:', nb_scaled_acc)

# Boxplot of the per-fold accuracies, one box per classifier.
boxprops = dict(facecolor='lightblue')
medianprops = dict(linewidth=2, color='darkblue')

plt.boxplot([knn_scaled_acc, nb_scaled_acc],
            tick_labels=['kNN', 'Naive Bayes'],
            patch_artist=True,  # This allows fill color
            boxprops=boxprops,
            medianprops=medianprops)
plt.ylabel('Accuracy')
# Fixed y-range (lower 0.5, upper 0.9); widen it if any fold accuracy falls
# outside this interval, otherwise part of the box is clipped.
plt.ylim(0.5, 0.9)

plt.title('Comparison of Classifiers')
plt.show()


## Explanation:
When using the Min-Max scaler, the performance of the kNN model improves substantially and becomes more stable, approaching the accuracy values of Naïve Bayes, which does not show significant variation. This improvement is due to the fact that kNN is based on computing distances between the feature vectors of the instances, making it sensitive to the magnitude of the features. For example, a feature that ranges from 0 to 100 will have a greater impact on the distance than a feature that ranges from 0 to 1, which leads to some features erroneously having more importance than others. Min-Max normalization solves this problem by rescaling every feature to a common range (usually [0, 1]), ensuring that each feature contributes to the distance measure on a comparable scale. This leads to more accurate and consistent predictions across different folds, improving both performance and stability.
The insignificant variation observed in Naïve Bayes after feature normalization is due to the nature of the model: it fits a Gaussian distribution to each feature within each class, and a linear rescaling of a feature simply rescales the estimated mean and variance accordingly, so the magnitude of the features does not significantly affect its performance.

In [None]:
#c)
from scipy import stats

# Perform a one-sided paired t-test on the per-fold accuracies.
# H0 (null): the mean accuracy of kNN is not greater than that of Naive Bayes.
# H1 (alternative): the mean accuracy of kNN is greater (alternative='greater').
t_stat, p_value = stats.ttest_rel(knn_scaled_acc, nb_scaled_acc, alternative='greater')

print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

# A p-value below the significance level means H0 is rejected in favour of H1.
if p_value < 0.05: # alpha(significance level) = 0.05
    print("kNN is statistically superior to Naive Bayes (the null hypothesis is rejected), so the statement is true.")
else:
    print("There is no evidence to conclude that the kNN model is statistically superior (the null hypothesis is not rejected), so the statement is not true.")


In [None]:
#EX 2

import matplotlib.pyplot as plt
from sklearn import metrics, datasets, tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Values of k (number of neighbors) to evaluate.
list_number_neighbors = [1, 5, 10, 20, 30]

# Single 80/20 hold-out split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# One train/test accuracy per value of k, for each weighting scheme.
test_accuracies_uniform = []
train_accuracies_uniform = []
test_accuracies_distance = []
train_accuracies_distance = []

for nn in list_number_neighbors:
    # Uniform weights: every one of the k neighbors has the same vote.
    knn_uniform = KNeighborsClassifier(n_neighbors=nn, weights='uniform')
    knn_uniform.fit(X_train, y_train)
    train_accuracies_uniform.append(knn_uniform.score(X_train, y_train))
    test_accuracies_uniform.append(knn_uniform.score(X_test, y_test))

    # Distance weights: closer neighbors have a larger vote.
    knn_distance = KNeighborsClassifier(n_neighbors=nn, weights='distance')
    knn_distance.fit(X_train, y_train)
    train_accuracies_distance.append(knn_distance.score(X_train, y_train))
    test_accuracies_distance.append(knn_distance.score(X_test, y_test))

plt.figure(figsize=(14, 6))

# Left panel: uniform weights.
plt.subplot(1, 2, 1)
plt.plot(list_number_neighbors, train_accuracies_uniform, label='Train (Uniform)', marker='o')
plt.plot(list_number_neighbors, test_accuracies_uniform, label='Test (Uniform)', marker='o')
plt.grid(True)  # explicit True: a bare plt.grid() toggles the current state
plt.title('Accuracies for different k values and uniform weights')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.legend()

# Right panel: distance weights.
plt.subplot(1, 2, 2)
plt.plot(list_number_neighbors, train_accuracies_distance, label='Train (Distance)', marker='s')
plt.plot(list_number_neighbors, test_accuracies_distance, label='Test (Distance)', marker='s')
plt.grid(True)
plt.title('Accuracies for different k values and weighted by distance')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.legend()

plt.show()


**2 b)** Initially, by increasing the number of neighbors k, we smooth the model and consequently reduce overfitting, increasing its generalization ability. This is because we consider more observations when making a decision, which makes the model less sensitive to noise and outliers in the data. Thus, the model starts to capture global trends in the data, which improves test data performance, although it may reduce training accuracy, as the model is no longer fitted to very specific details.

In the case of distance weighting, higher values of k do not change the model as much, maintaining high training accuracy, since the closest instances have a greater weight. In particular, when predicting a training instance, that instance is its own nearest neighbor at distance zero and therefore dominates the vote, whatever the value of k. This approach ensures that even with a high number of neighbors, the model continues to benefit from some sensitivity to local data, resulting in stable accuracy for both training and test data.


**3.** The heart-disease.csv dataset contains variables, such as 'sex', that are binary/categorical, so the assumption that all features in the model follow a Gaussian distribution is a limitation of the Naive Bayes model. Likewise, the assumption that all features are conditionally independent does not accurately reflect the characteristics of this dataset, since variables such as 'age', 'blood pressure', 'cholesterol levels', among others, may be correlated. Ignoring these dependencies can lead to erroneous probability estimates, causing inferior performance.