# **Module 6: Descriptive and Predictive Modeling**
## **Exercise 1:** K-Nearest Neighbors in depth
### **Submitted by:** Vitor Oliveira de Souza (Z0963220P), Jorge De La Torre (DNI), Phoebe (DNI), Miguel Galán (DNI)
### **Date:** 13/02/2024

#### **A)** "from sklearn.datasets import load_breast_cancer" (basic)

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import load_breast_cancer

In [2]:
# Load dataset from sklearn datasets
dataset = load_breast_cancer()

In [3]:
# Feature matrix: one row per sample, columns named after the dataset's features
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
# Class codes, kept as a one-column frame so more label columns can be added later
y = pd.DataFrame({'target': dataset.target})

#### 1) Describe the dataset in dimensions such as number of features, number of categories, and number of samples per category using Python.

In [4]:
# Take a look in input data
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
# Analyse input and output data shapes
X.shape, y.shape

((569, 30), (569, 1))

In [6]:
# Translate the numeric class codes into their names and count the samples per class
target_names = dataset.target_names
y['labels'] = y['target'].map(dict(enumerate(target_names)))
y.groupby('labels').size()

labels
benign       357
malignant    212
dtype: int64

In [7]:
# Analyse input data distribution
X.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [12]:
# Visualise the data in 2D using PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from matplotlib.lines import Line2D
%matplotlib inline


# Create a function because we'll use it later to check the model's results
def print_pca(data, 
              y, 
              highlight_index=None,
              highlight_points=None,
              text='',
              dim_reducer=None,
              n_components=2,
              scaler=None
             ):
    """Scatter-plot a 2D projection of `data`, coloured by `y['target']`.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix; it is scaled and then reduced before plotting.
    y : pandas.DataFrame
        Frame with a 'target' column holding one class code per row of `data`
        (0 is labelled "malign", anything else "benign").
    highlight_index : sequence of index labels, optional
        Rows of `data` to mark with a green cross and annotate with their label.
    highlight_points : pandas.DataFrame, optional
        Extra samples (same columns as `data`) that are not part of `data`. They are
        scaled and projected with the transformers fitted on `data` and drawn in gold.
    text : str
        Prefix prepended to the annotation of every highlighted row.
    dim_reducer : estimator with `fit_transform`, optional
        Dimensionality reducer. Defaults to a fresh `PCA(n_components=n_components)`.
    n_components : int
        Number of components of the default reducer; only the first two are plotted.
    scaler : transformer with `fit_transform`, optional
        Feature scaler. Defaults to a fresh `MinMaxScaler()`.

    Raises
    ------
    ValueError
        If the projection has fewer than two components, or if `highlight_points`
        is given together with a reducer that cannot project unseen samples.

    Notes
    -----
    The legend reads the class names from the notebook-level `dataset` object.
    """
    # Create the default transformers per call: instances placed in the signature
    # would be built once at definition time and shared by every call.
    if dim_reducer is None:
        dim_reducer = PCA(n_components=n_components)
    if scaler is None:
        scaler = MinMaxScaler()

    # An empty frame (e.g. no misclassified samples) has nothing to draw
    has_extra_points = highlight_points is not None and len(highlight_points) > 0
    if has_extra_points and not hasattr(dim_reducer, 'transform'):
        # Fail early with a clear message instead of an AttributeError after plotting
        raise ValueError(
            f"{type(dim_reducer).__name__} cannot project new samples (no transform method); "
            "use highlight_index or a reducer such as PCA"
        )

    X_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)
    
    # Reduce the dimensionality and keep the first two components for plotting
    projected = dim_reducer.fit_transform(X_scaled)
    if projected.shape[1] < 2:
        raise ValueError("the dimensionality reducer must return at least 2 components")
    X_pca = pd.DataFrame(projected[:, :2], columns=['PC1', 'PC2'], index=X_scaled.index)
    X_pca['labels'] = [text+'malign' if target == 0 else text+'benign' for target in y['target']]

    # Plot the data
    fig, ax = plt.subplots(figsize=(15, 8))
    ax.scatter(x=X_pca['PC1'],
               y=X_pca['PC2'],
               c=y['target'],
               cmap='bwr',
               s=60,alpha=0.3)

    # Plot highlighted points based on the index list provided
    if highlight_index is not None:

        print(X_pca.loc[highlight_index].index)
        
        ax.scatter(x=X_pca['PC1'].loc[highlight_index],
                   y=X_pca['PC2'].loc[highlight_index],
                   c='lime', marker='x', s=60, alpha=1)
        # Annotate highlighted points
        for i in highlight_index:
            ax.annotate(X_pca['labels'].loc[i], 
                        (X_pca['PC1'].loc[i], X_pca['PC2'].loc[i]),
                        textcoords="offset points", # Positioning of the text
                        xytext=(0,10), # Distance from the point to the text
                        ha='center') # Horizontal alignment
    
    # Highlight additional points specified directly that are not in the main dataset
    if has_extra_points:
        
        # Reuse the scaler and the reducer fitted on `data` so the points share its coordinates
        highlight_points_scaled = pd.DataFrame(scaler.transform(highlight_points), columns=highlight_points.columns, index=highlight_points.index)
        highlight_points_pca = dim_reducer.transform(highlight_points_scaled)
        
        # Plot all the highlighted points with one call
        ax.scatter(highlight_points_pca[:, 0], highlight_points_pca[:, 1],
                   c='gold', edgecolor='black', marker='o', s=60, alpha=1)

    
    # Create custom legend; the 'bwr' colormap draws class 0 in blue and class 1 in red
    classes = dataset.target_names
    legend_elements = [Line2D([0], [0], marker='o', color='w', label=classes[i],
                              markerfacecolor='b' if i==0 else 'r', markersize=10) for i in range(len(classes))]
    ax.legend(handles=legend_elements, loc="lower left", title="Classes")
        
    # Set titles and labels; the title names the reducer that was actually used
    ax.set_title(f'{type(dim_reducer).__name__} of Dataset')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')

print_pca(X,y)

TypeError: PCA.fit_transform() missing 1 required positional argument: 'self'

In [9]:
def print_pca_3d(data,y):
    """Scatter-plot the first three principal components of `data`, coloured by `y['target']`.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix; it is min-max scaled before PCA is applied.
    y : pandas.DataFrame
        Frame with a 'target' column holding one class code per row of `data`.
    """
    # Scale data, then project it onto three principal components
    X_scaled = MinMaxScaler().fit_transform(data)
    X_pca = PCA(n_components=3).fit_transform(X_scaled)
    
    # Create the figure with a single 3D axes. plt.subplots() would also add a
    # 2D axes that stays visible underneath the 3D one.
    fig = plt.figure(figsize=(15, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X_pca[:, 0],
               X_pca[:, 1],
               X_pca[:, 2],
               c=y['target'],
               cmap='bwr',
               s=60,alpha=0.5)
    ax.set_title('PCA of Dataset (3 components)')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_zlabel('Principal Component 3')

# Switch to the interactive Qt backend so the 3D plot can be rotated
%matplotlib qt
print_pca_3d(X,y)

#### Answer 1:
##### **Number of features**: The dataset comprises 30 features. These features might include various measurements or characteristics relevant to the study of breast cancer.
##### **Number of categories**: There are two categories within the dataset: Malignant and Benign. These categories represent the classification targets for our analysis, indicating whether samples are indicative of malignant (cancerous) or benign (non-cancerous) conditions.
##### **Number of samples in each category**: The dataset contains 212 samples classified as malignant. These represent cases where the condition is potentially harmful and requires detailed examination. Conversely, there are 357 samples identified as benign, indicating conditions that are generally not harmful.
##### **Imbalanced data**: An initial observation suggests that the dataset is somewhat imbalanced, with a greater number of benign samples compared to malignant ones. This imbalance could influence the performance of machine learning models, necessitating techniques such as weighted evaluation metrics to ensure model reliability across both categories.
##### **Presence of mixed datapoints/outliers**: Visual inspection, particularly through 2D and 3D projections with PCA, reveals that there is some degree of overlap between the categories. Points that are labeled as malignant are found within clusters of the benign category, indicating the potential presence of **outliers** or **challenging cases**.
# ------------------------------------------------------------------------------------------

#### 2) Represent the statistical support of every feature graphically, resorting to Matplotlib’s boxplot function. Are there any outliers that can be detected by simple visual inspection? If so, devise a handcrafted method to detect and isolate such examples.

In [None]:
# Let's check all the boxplots together
%matplotlib inline
scaler = StandardScaler()
# Keep the feature names and the row index so that each box can be identified
X_standard_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
fig, ax = plt.subplots(figsize=(15, 8))
ax.boxplot(X_standard_scaled.to_numpy())
# One box per feature: label each box with its feature name instead of 1..30
ax.set_xticklabels(X_standard_scaled.columns, rotation=90)
ax.set_title('Distribution of the standardized features')
ax.set_ylabel('Standardized value (z-score)');

In [None]:
# Although the data is widely distributed, some points seem too far away from the median value
# Let's clean the outliers, using 5 times the IQR as the threshold
def get_outliers_indices(data, columns_outliers, iqr_factor=5):
    """Return the index labels of the rows that are extreme in at least one column.

    A value is treated as an outlier when it lies below ``Q1 - iqr_factor * IQR``
    or above ``Q3 + iqr_factor * IQR`` of its own column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the columns to inspect.
    columns_outliers : iterable of column labels
        Columns of `data` to check.
    iqr_factor : float, default 5
        Multiple of the interquartile range that defines the acceptable band.

    Returns
    -------
    list
        Sorted index labels with no duplicates; empty when nothing is flagged.
    """
    outlier_labels = set()  # a set removes rows flagged by several columns
    for column in columns_outliers:
        values = data[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        margin = iqr_factor * (q3 - q1)
        # Rows outside the acceptable band of this column
        is_outlier = (values < q1 - margin) | (values > q3 + margin)
        outlier_labels.update(values[is_outlier].index)
    # Sort so that the result does not depend on set iteration order
    return sorted(outlier_labels)
    
# Put features and labels in one frame so a removed row disappears from both
df = X.join(y)
# Every column except the label columns is checked for outliers
columns_outliers = df.columns.drop(['target', 'labels'], errors='ignore').tolist()
outliers_index = get_outliers_indices(df, columns_outliers)
df_cleaned = df.drop(index=outliers_index)
# Class balance after the removal
df_cleaned.groupby('target').size()

In [None]:
# Let's print the outliers that were removed
print_pca(X,y,outliers_index,text='outlier: ')

#### Answer 2:
##### The boxplot visualization indicates a wide distribution across almost all features, with numerous data points lying outside the whiskers. These points are typically considered outliers in a conventional analysis context. However, our assessment suggests that these aren't outliers in the traditional sense but rather indicate the presence of subgroups within the data, each with distinct distributions.
##### This heterogeneity within the data is not necessarily detrimental; on the contrary, it could enrich our analysis by revealing underlying patterns that assist in classifying the dataset accurately. It highlights the complexity of the data and suggests that multiple factors may influence the outcomes we are trying to predict.

### Approach to Outlier Detection:
##### Despite recognizing the value of the wide data distribution, we identified certain points as extreme outliers, significantly distanced from the majority of data points in specific features. These extreme values could potentially skew our analysis and model training.
##### To address outliers, we applied a method focusing on the interquartile range (IQR). Specifically, we set a threshold at 5 times the IQR from the first and third quartiles. This approach aims to retain the inherent data structure and variability while excluding extreme outliers that could adversely affect our analysis.
##### We implemented a Python function to calculate the IQR for each feature and identify indices of data points lying beyond the acceptable range (5 times the IQR from the Q1 and Q3 quartiles). These indices represent the outliers we decided to remove.
##### **Important:** The threshold of 5 times the IQR was set by visual inspection of the feature distributions.
# ------------------------------------------------------------------------------------------

#### 3) Repeat each of the experiments seen in class with the K-Nearest Neighbors model, providing arguments for each of the steps taken along the process, and commenting on the partial results obtained with the selected dataset. Please use as many performance metrics as needed to illustrate the particularities of the selected dataset (e.g. imbalanced classes).



In [None]:
# Split the cleaned frame back into features and labels (after outlier removal)
label_columns = ['target', 'labels']
y_cleaned = df_cleaned[label_columns]
X_cleaned = df_cleaned.drop(columns=label_columns)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Now we create a splitter to split the data considering the classes
# If we simply split the data randomly, the unbalanced classes could be split unevenly in train and test
sss = StratifiedShuffleSplit(n_splits=1,test_size=0.2, random_state=0)

# Stratify on the class codes only; the 'labels' column carries the same information as text.
# A single split is requested, so take it directly instead of looping.
train_index, test_index = next(sss.split(X_cleaned, y_cleaned['target']))
Xtrain, ytrain = X_cleaned.iloc[train_index], y_cleaned.iloc[train_index]
Xtest, ytest = X_cleaned.iloc[test_index], y_cleaned.iloc[test_index]

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

# First, we construct a Pipeline comprising two key components: a Scaler and a KNN Classifier Model.
# Utilizing a Pipeline streamlines the preprocessing and modeling process. Specifically, after the Scaler is fitted to the training data,
# it can be directly applied to the test data without the need for re-fitting. This approach ensures consistency in data preprocessing
# between training and testing phases, which is crucial for evaluating the model's performance on new, unseen data.
# Adhering to this practice prevents data leakage and ensures that our model's performance metrics accurately reflect its ability to generalize to new data.
estimator = Pipeline([('Scaler',MinMaxScaler()),('KNN',KNeighborsClassifier())])

# Then we define a parameter grid to search over with grid search techniques
param_grid = {'KNN__n_neighbors':[3,5,7,9,11,13,15]}

# We create our model selector using grid search with 3-fold cross-validation
# NOTE(review): per the scikit-learn docs, an integer `cv` with a classifier uses stratified folds - confirm
gscv = GridSearchCV(
    estimator,
    param_grid,
    scoring="accuracy",
    cv=3,
    verbose=5,
    n_jobs=-1
)

# Then we fit our models using the train dataset and keep track of the computational time
start_time = datetime.now()
gscv.fit(Xtrain,ytrain['target'])
total_time = datetime.now() - start_time

In [None]:
# Now let's check what was considered the best set of parameters:
gscv.best_params_

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Evaluate the tuned model on the held-out test set: accuracy, F1 score, ROC AUC and the confusion matrix.

# Wall-clock time of the grid search; total_time is a datetime.timedelta, so convert it to seconds
print(f'Total Time: \t{total_time.total_seconds():.2f} seconds')

# Mean cross-validated accuracy of the best parameter combination
print(f"Avg acc (CV):\t{gscv.best_score_:.4f}")

# GridSearchCV refits the best pipeline on the full training set by default (refit=True),
# so best_estimator_ is ready to predict and does not need to be fitted again
best_estimator = gscv.best_estimator_

# Predicted class labels for the test set
ypred = best_estimator.predict(Xtest)

# Predicted probability of class 1, used for the ROC AUC
yprob = best_estimator.predict_proba(Xtest)[:, 1]

# Accuracy and F1 on the test set (f1_score scores class 1 by default)
print(f'Acc (Test): \t{accuracy_score(ytest["target"], ypred):.4f}')
print(f'F1 (Test): \t{f1_score(ytest["target"], ypred):.4f}')

# ROC AUC from the class-1 probabilities
roc_auc = roc_auc_score(ytest["target"], yprob)
print(f'AUC: \t\t{roc_auc:.4f}')

# Confusion matrix; the display object is not named `display`, so that
# IPython's built-in display() stays available in later cells
cm = confusion_matrix(ytest["target"], ypred)
cm_display = ConfusionMatrixDisplay(cm)
cm_display.plot()
plt.show()

In [None]:
# Show on the 2D projection which test samples the model failed to classify correctly
misclassified = ytest['target'].to_numpy() != ypred
errors = ytest.index[misclassified]
print_pca(X_cleaned, y_cleaned, highlight_index=errors, text='Errors: ')

#### Answer 3:
##### As shown by the results, the accuracy and F1 scores are around 0.97 on the test dataset, which can be considered good.
##### We check the F1 score to evaluate the model on imbalanced data, as accuracy can be biased towards predictions of the most common class.
##### We also checked the ROC AUC and the confusion matrix, and all metrics seem to be good for a prediction model.
# ------------------------------------------------------------------------------------------

#### 4) Read the Scikit-learn library documentation and configure the automated validation script so that the GridSearchCV() function uses leave-one-out cross-validation instead of k-fold. Which conclusions can be drawn from the mean cross-validation scores and the test scores using a k-neighbor model with optimized k?

In [None]:
from sklearn.model_selection import LeaveOneOut

# Grid search where every training sample is held out once (leave-one-out cross-validation).
# verbose=1 prints only a summary: leave-one-out runs one fit per sample and per candidate,
# so logging every fit would flood the cell output.
gscv_loo = GridSearchCV(
    estimator,
    param_grid,
    scoring='accuracy',
    cv=LeaveOneOut(),
    verbose=1,
    n_jobs=-1,
    return_train_score = True
)

# Fit on the training set and keep track of the computational time
start_time = datetime.now()
gscv_loo.fit(Xtrain,ytrain['target'])
total_time = datetime.now() - start_time

In [None]:
gscv_loo.best_params_

In [None]:
results = pd.DataFrame(gscv_loo.cv_results_)
results['mean_test_score']
#results.head(10)

In [None]:
# Evaluate the leave-one-out tuned model on the held-out test set: accuracy, F1 score, ROC AUC and the confusion matrix.

# Wall-clock time of the grid search; total_time is a datetime.timedelta, so convert it to seconds
print(f'Total Time: \t{total_time.total_seconds():.2f} seconds')

# Mean cross-validated accuracy of the best parameter combination
print(f"Avg acc (CV):\t{gscv_loo.best_score_:.4f}")

# GridSearchCV refits the best pipeline on the full training set by default (refit=True),
# so best_estimator_ is ready to predict and does not need to be fitted again
best_estimator = gscv_loo.best_estimator_

# Predicted class labels for the test set
ypred = best_estimator.predict(Xtest)

# Predicted probability of class 1, used for the ROC AUC
yprob = best_estimator.predict_proba(Xtest)[:, 1]

# Accuracy and F1 on the test set (f1_score scores class 1 by default)
print(f'Acc (Test): \t{accuracy_score(ytest["target"], ypred):.4f}')
print(f'F1 (Test): \t{f1_score(ytest["target"], ypred):.4f}')

# ROC AUC from the class-1 probabilities
roc_auc = roc_auc_score(ytest["target"], yprob)
print(f'AUC: \t\t{roc_auc:.4f}')

# Confusion matrix; the display object is not named `display`, so that
# IPython's built-in display() stays available in later cells
cm = confusion_matrix(ytest["target"], ypred)
cm_display = ConfusionMatrixDisplay(cm)
cm_display.plot()
plt.show()

In [None]:
# Show on the 2D projection which test samples the model failed to classify correctly
misclassified = ytest["target"].to_numpy() != ypred
errors = ytest.index[misclassified]
print_pca(X_cleaned, y_cleaned, highlight_index=errors, text='Errors: ')

#### Answer 4:
##### answer
# ------------------------------------------------------------------------------------------

#### 5) Elaborate on the need for stratifying the cross-validation process analyzing the distribution of samples by class. If so, please show with empirical evidence what could occur if such a stratification was not performed, specially when decreasing the number of samples of the dataset.

In [None]:
from sklearn.model_selection import StratifiedKFold

# 10 shuffled folds that preserve the class proportions, with a fixed seed
cv_stratified = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Same pipeline and grid as before; only the cross-validation scheme changes
gscv_strat = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv_stratified,
    n_jobs=-1,
    verbose=2,
)

# Time the whole search
start_time = datetime.now()
gscv_strat.fit(Xtrain, ytrain["target"])
total_time = datetime.now() - start_time

In [None]:
gscv_strat.best_params_

In [None]:
# Evaluate the model tuned with stratified folds on the held-out test set: accuracy, F1 score, ROC AUC and the confusion matrix.

# Wall-clock time of the grid search; total_time is a datetime.timedelta, so convert it to seconds
print(f'Total Time: \t{total_time.total_seconds():.2f} seconds')

# Mean cross-validated accuracy of the best parameter combination
print(f"Avg acc (CV):\t{gscv_strat.best_score_:.4f}")

# GridSearchCV refits the best pipeline on the full training set by default (refit=True),
# so best_estimator_ is ready to predict and does not need to be fitted again
best_estimator = gscv_strat.best_estimator_

# Predicted class labels for the test set
ypred = best_estimator.predict(Xtest)

# Predicted probability of class 1, used for the ROC AUC
yprob = best_estimator.predict_proba(Xtest)[:, 1]

# Accuracy and F1 on the test set (f1_score scores class 1 by default)
print(f'Acc (Test): \t{accuracy_score(ytest["target"], ypred):.4f}')
print(f'F1 (Test): \t{f1_score(ytest["target"], ypred):.4f}')

# ROC AUC from the class-1 probabilities
roc_auc = roc_auc_score(ytest["target"], yprob)
print(f'AUC: \t\t{roc_auc:.4f}')

# Confusion matrix; the display object is not named `display`, so that
# IPython's built-in display() stays available in later cells
cm = confusion_matrix(ytest["target"], ypred)
cm_display = ConfusionMatrixDisplay(cm)
cm_display.plot()
plt.show()

In [None]:
# Show on the 2D projection which test samples the model failed to classify correctly
misclassified = ytest["target"].to_numpy() != ypred
errors = ytest.index[misclassified]
print_pca(X_cleaned, y_cleaned, highlight_index=errors, text='Errors: ')

#### Answer 5:
##### In imbalanced datasets like this one, it is important to take stratification into account during the cross-validation process.
##### Cross-validation is a process that repeatedly selects a portion of the data for training and leaves the rest to evaluate the model.
##### With imbalanced data, a purely random selection is likely to produce a training set in which some class is under-represented or even missing.
##### In a binary classification problem like this one, it is even possible to end up with a validation set that contains only one class, so the model would be evaluated on that class alone.
##### Such a validation set does not correspond to real-world data, and the best model selected during cross-validation will not perform well in production.
# ------------------------------------------------------------------------------------------

#### 6) Include in the set of hyper-parameters adjusted via cross-validation process the weights of the distance metric between samples according to the “weights” parameter of the model in Scikit-learn. Compute the model’s performance when distance metric weights are fine-tuned within cross-validation with respect to only tuning the number of neighbors (K).

In [None]:
# Candidate neighbourhood sizes shared by both searches
k_values = [3, 5, 7, 9, 11]

# Grid that tunes only K
param_grid_k = {'KNN__n_neighbors': list(k_values)}

# Grid that tunes K together with the neighbour weighting scheme
param_grid = {
    'KNN__weights': ['uniform', 'distance'],
    'KNN__n_neighbors': list(k_values),
}

In [None]:
# Baseline for the comparison: tune only the number of neighbours

# 10 shuffled folds that preserve the class proportions, with a fixed seed
cv_stratified = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

gscv_only_k = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid_k,
    scoring="accuracy",
    cv=cv_stratified,
    n_jobs=-1,
    verbose=2,
)

# Time the whole search
start_time = datetime.now()
gscv_only_k.fit(Xtrain, ytrain["target"])
total_time = datetime.now() - start_time

In [None]:
gscv_only_k.best_params_

In [None]:
# Evaluate the model tuned only on K on the held-out test set: accuracy, F1 score, ROC AUC and the confusion matrix.

# Wall-clock time of the grid search; total_time is a datetime.timedelta, so convert it to seconds
print(f'Total Time: \t{total_time.total_seconds():.2f} seconds')

# Mean cross-validated accuracy of the best parameter combination
print(f"Avg acc (CV):\t{gscv_only_k.best_score_:.4f}")

# GridSearchCV refits the best pipeline on the full training set by default (refit=True),
# so best_estimator_ is ready to predict and does not need to be fitted again
best_estimator = gscv_only_k.best_estimator_

# Predicted class labels for the test set
ypred = best_estimator.predict(Xtest)

# Predicted probability of class 1, used for the ROC AUC
yprob = best_estimator.predict_proba(Xtest)[:, 1]

# Accuracy and F1 on the test set (f1_score scores class 1 by default)
print(f'Acc (Test): \t{accuracy_score(ytest["target"], ypred):.4f}')
print(f'F1 (Test): \t{f1_score(ytest["target"], ypred):.4f}')

# ROC AUC from the class-1 probabilities
roc_auc = roc_auc_score(ytest["target"], yprob)
print(f'AUC: \t\t{roc_auc:.4f}')

# Confusion matrix; the display object is not named `display`, so that
# IPython's built-in display() stays available in later cells
cm = confusion_matrix(ytest["target"], ypred)
cm_display = ConfusionMatrixDisplay(cm)
cm_display.plot()
plt.show()

In [None]:
# Tune K and the neighbour weighting scheme together

# 10 shuffled folds that preserve the class proportions, with a fixed seed
cv_stratified = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

gscv_with_weights = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv_stratified,
    n_jobs=-1,
    verbose=2,
)

# Time the whole search
start_time = datetime.now()
gscv_with_weights.fit(Xtrain, ytrain["target"])
total_time = datetime.now() - start_time

In [None]:
gscv_with_weights.best_params_

In [None]:
# Evaluate the model tuned on K and weights on the held-out test set: accuracy, F1 score, ROC AUC and the confusion matrix.

# Wall-clock time of the grid search; total_time is a datetime.timedelta, so convert it to seconds
print(f'Total Time: \t{total_time.total_seconds():.2f} seconds')

# Mean cross-validated accuracy of the best parameter combination
print(f"Avg acc (CV):\t{gscv_with_weights.best_score_:.4f}")

# GridSearchCV refits the best pipeline on the full training set by default (refit=True),
# so best_estimator_ is ready to predict and does not need to be fitted again
best_estimator = gscv_with_weights.best_estimator_

# Predicted class labels for the test set
ypred = best_estimator.predict(Xtest)

# Predicted probability of class 1, used for the ROC AUC
yprob = best_estimator.predict_proba(Xtest)[:, 1]

# Accuracy and F1 on the test set (f1_score scores class 1 by default)
print(f'Acc (Test): \t{accuracy_score(ytest["target"], ypred):.4f}')
print(f'F1 (Test): \t{f1_score(ytest["target"], ypred):.4f}')

# ROC AUC from the class-1 probabilities
roc_auc = roc_auc_score(ytest["target"], yprob)
print(f'AUC: \t\t{roc_auc:.4f}')

# Confusion matrix; the display object is not named `display`, so that
# IPython's built-in display() stays available in later cells
cm = confusion_matrix(ytest["target"], ypred)
cm_display = ConfusionMatrixDisplay(cm)
cm_display.plot()
plt.show()

In [None]:
# Show on the 2D projection which test samples the model failed to classify correctly
misclassified = ytest["target"].to_numpy() != ypred
errors = ytest.index[misclassified]
print_pca(X_cleaned, y_cleaned, list(errors), text='Errors: ')

#### Answer 6:
##### answer
# ------------------------------------------------------------------------------------------

#### 7) Following the same approach as in the last section, enter the type of distance metric (“metric” parameter) within the cross-validation process. Evaluates the results and gains / losses of generalizability of the model.

In [None]:
# The Minkowski exponent `p` only matters when metric='minkowski'. Crossing it with
# 'cityblock' and 'cosine' would refit identical models six times each, so the grid
# is split in two: one part without `p`, one part for the Minkowski family.
_common_grid = {'KNN__weights':['uniform','distance'],
                'KNN__n_neighbors':[3,5,7,9,11,13,15]}

param_grid = [
    {**_common_grid, 'KNN__metric':['cityblock','cosine']},
    {**_common_grid, 'KNN__metric':['minkowski'], 'KNN__p':[2,3,4,5,6,7]},
]

In [None]:
# Tune K, the weighting scheme and the distance metric together

# 10 shuffled folds that preserve the class proportions, with a fixed seed
cv_stratified = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

gscv_complete = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv_stratified,
    n_jobs=-1,
    verbose=2,
)

# Time the whole search
start_time = datetime.now()
gscv_complete.fit(Xtrain, ytrain['target'])
total_time = datetime.now() - start_time

In [None]:
gscv_complete.best_params_

In [None]:
# Evaluate the model tuned on K, weights and metric on the held-out test set: accuracy, F1 score, ROC AUC and the confusion matrix.

# Wall-clock time of the grid search; total_time is a datetime.timedelta, so convert it to seconds
print(f'Total Time: \t{total_time.total_seconds():.2f} seconds')

# Mean cross-validated accuracy of the best parameter combination
print(f"Avg acc (CV):\t{gscv_complete.best_score_:.4f}")

# GridSearchCV refits the best pipeline on the full training set by default (refit=True),
# so best_estimator_ is ready to predict and does not need to be fitted again
best_estimator = gscv_complete.best_estimator_

# Predicted class labels for the test set
ypred = best_estimator.predict(Xtest)

# Predicted probability of class 1, used for the ROC AUC
yprob = best_estimator.predict_proba(Xtest)[:, 1]

# Accuracy and F1 on the test set (f1_score scores class 1 by default)
print(f'Acc (Test): \t{accuracy_score(ytest["target"], ypred):.4f}')
print(f'F1 (Test): \t{f1_score(ytest["target"], ypred):.4f}')

# ROC AUC from the class-1 probabilities
roc_auc = roc_auc_score(ytest["target"], yprob)
print(f'AUC: \t\t{roc_auc:.4f}')

# Confusion matrix; the display object is not named `display`, so that
# IPython's built-in display() stays available in later cells
cm = confusion_matrix(ytest["target"], ypred)
cm_display = ConfusionMatrixDisplay(cm)
cm_display.plot()
plt.show()

In [None]:
# Show on the 2D projection which test samples the model failed to classify correctly
misclassified = ytest["target"].to_numpy() != ypred
errors = ytest.index[misclassified]
print_pca(X_cleaned, y_cleaned, list(errors), text='Errors: ')

#### Answer 7:
##### answer
# ------------------------------------------------------------------------------------------

#### Now let's plot the train dataset and add the prediction errors to the plot, to analyse whether the train dataset has some different pattern that is causing these errors in test

In [None]:
# Plot the TRAIN dataset and overlay the misclassified TEST samples on top of it.
# The scaler and the reducer are fitted on the train data only; the test errors are
# projected with them through `highlight_points`.
errors = ytest["target"][ytest["target"] != ypred].index
errors_points = Xtest.loc[errors]
print_pca(Xtrain,ytrain,highlight_points=errors_points)

#### Now let's check another dimension reducer to see if we have different conclusions

In [None]:
# Let's see using StandardScaler()
print_pca(X_cleaned,y_cleaned,highlight_index=errors,scaler=StandardScaler())

In [None]:
from sklearn.manifold import TSNE
# Now let's check with the TSNE transformer
# t-SNE is stochastic: fix the seed so the figure is the same on every run
print_pca(X_cleaned,y_cleaned,highlight_index=errors,dim_reducer=TSNE(n_components=2, random_state=0))

In [None]:
# Now let's see with StandardScaler and TSNE
# t-SNE is stochastic: fix the seed so the figure is the same on every run
print_pca(X_cleaned,y_cleaned,highlight_index=errors,dim_reducer=TSNE(n_components=2, random_state=0),scaler=StandardScaler())

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# TSNE on MaxAbs-scaled features; the seed is fixed so the figure is reproducible
print_pca(X_cleaned,y_cleaned,highlight_index=errors,dim_reducer=TSNE(n_components=2, random_state=0),scaler=MaxAbsScaler())