In [None]:
#EX1

from scipy.io.arff import loadarff
from sklearn.feature_selection import f_classif
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the ARFF dataset into a DataFrame. The class label is decoded from
# bytes to str so it can be used directly as a hue / target column.
diabetes_data = loadarff('diabetes.arff')
df = pd.DataFrame(diabetes_data[0])
df['Outcome'] = df['Outcome'].str.decode('utf-8')

# Split into input features and class label.
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# ANOVA F-test between each feature and the class label: a higher F-score
# means the feature separates the classes better.
f_score, p_values = f_classif(X, y)

best_score = f_score.max()
worst_score = f_score.min()

# argmax/argmin return the position of the first extreme value, which maps
# one-to-one onto the column order of X.
best_feature = X.columns[f_score.argmax()]
worst_feature = X.columns[f_score.argmin()]
print('The best feature is', best_feature)
print('The worst feature is', worst_feature)

# Class-conditional densities of the most and least discriminative features,
# side by side. `data=` is passed by keyword so the call does not depend on
# the positional order of kdeplot's parameters.
fig, (ax_best, ax_worst) = plt.subplots(1, 2, figsize=(14, 6))

plot_best = sns.kdeplot(data=df, x=best_feature, hue='Outcome', label='Best_score', ax=ax_best)
ax_best.set_title("Best Discriminative Power Feature Class-Conditional PDF")

plot_worst = sns.kdeplot(data=df, x=worst_feature, hue='Outcome', label='Worst_score', ax=ax_worst)
ax_worst.set_title("Worst Discriminative Power Feature Class-Conditional PDF")

plt.show()


In [None]:
#EX2

import matplotlib.pyplot as plt
from sklearn import metrics, datasets, tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

RANDOM_STATE = 1
MINIMUM_SAMPLE_SPLIT = [2, 5, 10, 20, 30, 50, 100]
N_RUNS = 10  # number of tree fits averaged per parameter value

X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Single stratified 80/20 split with a fixed seed, shared by every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

test_accuracies = []
train_accuracies = []

for sample in MINIMUM_SAMPLE_SPLIT:
    test_scores = []
    train_scores = []

    for run in range(N_RUNS):
        # Use a different (but reproducible) seed for each run. With a single
        # fixed seed and a fixed split, all runs would build the exact same
        # tree and the average would be meaningless.
        predictor = tree.DecisionTreeClassifier(
            random_state=RANDOM_STATE + run, min_samples_split=sample
        )
        predictor.fit(X_train, y_train)

        y_test_pred = predictor.predict(X_test)
        y_train_pred = predictor.predict(X_train)

        test_scores.append(metrics.accuracy_score(y_test, y_test_pred))
        train_scores.append(metrics.accuracy_score(y_train, y_train_pred))

    # Mean accuracy over the runs for this min_samples_split value.
    test_accuracies.append(sum(test_scores) / N_RUNS)
    train_accuracies.append(sum(train_scores) / N_RUNS)

# Markers are drawn by plot() itself so they share the colour of their line.
fig, ax = plt.subplots()
ax.plot(MINIMUM_SAMPLE_SPLIT, test_accuracies, marker='o', label='Testing Accuracy')
ax.plot(MINIMUM_SAMPLE_SPLIT, train_accuracies, marker='o', label='Training Accuracy')

ax.legend()
ax.set_xlabel('Minimum Sample Split')
ax.set_ylabel('Accuracy')
ax.set_title('Testing and Training Accuracy vs Minimum Sample Split')
ax.grid(True)
plt.show()




**3.** When the minimum number of samples required to split a node is low, the training accuracy is higher while the test accuracy is comparatively low. This gap shows that the model overfits the training data: because it is allowed to split nodes that contain very few samples, the decision tree becomes very deep and complex and generalizes poorly to new data.

As the minimum number of samples required to split a node increases, the training data accuracy decreases and the test data accuracy increases, which means the model generalizes better to new data and avoids overfitting. The difference between training and test accuracy decreases, reaching its lowest value when the minimum number of samples is 30.

Analyzing the graph, we notice that from 30 samples onwards the accuracy starts to decrease on both the training and test data, which indicates that the model starts to underfit: the decision tree becomes too simple to correctly predict the data.

In [None]:
#4.i
MAX_DEPTH = 3

X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Fit a depth-limited tree on the full dataset; this cell is used to read off
# decision rules, not to evaluate generalization.
predictor = tree.DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=MAX_DEPTH)
predictor.fit(X, y)

plt.figure(figsize=(12, 6))
# feature_names is passed as a plain list: some scikit-learn releases reject
# a pandas Index here. class_names must follow the sorted order of the labels
# in predictor.classes_ -- assumes the negative class sorts first; verify
# against the dataset's label values.
tree.plot_tree(
    predictor,
    filled=True,
    feature_names=list(X.columns),
    class_names=["No Diabetes", "Diabetes"],
    impurity=False,
)
plt.show()


**4.ii.** The created decision tree has a maximum depth of 3, which means there are at most 3 decision levels between the root and any leaf. This keeps the tree simple and easy to interpret and reduces the risk of overfitting. The tree predicts the presence of diabetes from the provided dataset, and each root-to-leaf path can be read as an association rule that links conditions on the input variables to the probability of having diabetes.

Thus, we have the following association rules, each with its posterior probability, i.e. the probability of having diabetes given that the conditions of the rule hold:

Rule 1: If the glucose level is less than or equal to 127.5, age less than or equal to 28.5, and BMI is greater than 45.4, the probability of having diabetes is 75%, with 4 training samples. -> Leaf 2

Rule 2: If the glucose level is greater than 127.5, BMI is less than or equal to 29.95, and then glucose is greater than 145.5, the probability of having diabetes is 50%, with 35 training samples. -> Leaf 5

Rule 3: If the glucose level is greater than 127.5, BMI is greater than 29.95, and glucose is less than or equal to 157.5, the probability of having diabetes is 61%, with 115 training samples. -> Leaf 6

Rule 4: If the glucose level is greater than 127.5, BMI is greater than 29.95, and glucose is greater than 157.5, the probability of having diabetes is 87%, with 92 training samples. -> Leaf 7

In conclusion, based on our dataset, diabetes is mainly characterized by a high glucose level, especially when it is associated with a high BMI, as we can see from association rules 3 and 4 (leaves 6 and 7). Rule 2 (leaf 5) shows that with a lower BMI an even higher glucose level is needed, and even then the probability of having diabetes is only 50%. Regarding rule 1, the sample is too small to draw conclusions and may just be noise in the data.