## Classification

In [None]:
# Importing the necessary libraries
from sklearn.datasets import load_breast_cancer  # Importing the dataset module from scikit-learn
# Importing the module for splitting the dataset into training and testing sets
from sklearn.model_selection import train_test_split  
import plotly.graph_objects as go
from sklearn.metrics import roc_curve

# Loading the breast cancer dataset into a variable called 'cancer'
# The returned object is used below through its 'data' (features), 'target' (labels)
# and 'feature_names' attributes
cancer = load_breast_cancer()

In [None]:
# Displaying the names of the features stored in the dataset
cancer.feature_names

In [None]:
# Splitting the dataset into a training part and a held-out testing part
# Arguments passed to 'train_test_split':
# - 'cancer.data': The feature matrix of the breast cancer dataset
# - 'cancer.target': The class labels of the breast cancer dataset
# - 'random_state=1': Fixes the random shuffling so the split is reproducible
# - 'stratify=cancer.target': Keeps the class proportions of the full dataset
#   (approximately) the same in both the training and the testing set
# No 'test_size' is given, so scikit-learn's default split size is used

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=1,
    stratify=cancer.target,
)


In [None]:
# Importing the 'SVC' (support vector classifier) class from the 'sklearn.svm' module
from sklearn.svm import SVC

# Creating an instance of the 'SVC' class
# - 'probability=True': Enables 'predict_proba', which is needed later for the ROC curve
#   NOTE: The probabilities come from an internal cross-validated calibration step
#   that shuffles the training data, so they are random unless a seed is fixed
# - 'random_state=1': Seeds that shuffling so the predicted probabilities
#   (and everything derived from them) are reproducible across kernel restarts
svc = SVC(probability=True, random_state=1)

# Training the support vector machine model with the training data
# The 'fit' method takes two arguments:
# - 'X_train': The input features of the training set
# - 'y_train': The target variable (labels) of the training set
svc.fit(X_train, y_train)

# Evaluating the trained model on the testing data
# The 'score' method calculates the accuracy of the model on the test set
# It takes two arguments:
# - 'X_test': The input features of the test set
# - 'y_test': The target variable (labels) of the test set
svc.score(X_test, y_test)


In [None]:
# 'GridSearchCV' lives in 'sklearn.model_selection'
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates for the SVM, given as a list of sub-grids.
# Each dictionary is expanded on its own, so 'gamma' is only tried together
# with the RBF kernel:
# - RBF kernel:    2 gamma values x 4 C values = 8 combinations
# - linear kernel: 4 C values                  = 4 combinations
param_grid = [
    dict(kernel=['rbf'], gamma=[1e-3, 1e-4], C=[1, 10, 100, 1000]),
    dict(kernel=['linear'], C=[1, 10, 100, 1000]),
]

# Exhaustive search over every combination above, each one evaluated
# with 10-fold cross-validation ('cv=10') on a fresh 'SVC' estimator
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=10)

# Running the search on the training data only:
# - 'X_train': The input features of the training set
# - 'y_train': The target variable (labels) of the training set
grid.fit(X_train, y_train)

# Scoring the best model found by the search on the held-out test data:
# - 'X_test': The input features of the test set
# - 'y_test': The target variable (labels) of the test set
# For an 'SVC' estimator this score is the classification accuracy
grid.score(X_test, y_test)


The following cells calculate the false positive rate (FPR), true positive rate (TPR), and thresholds for the receiver operating characteristic (ROC) curve. The ROC curve is a graphical representation of the performance of a binary classification model at different classification thresholds.

The `roc_curve` function takes two arguments:
- `y_test`: The true class labels for the test set.
- `y_pred_probs`: The predicted probabilities of the positive class (class 1) for each sample in the test set.

The function returns three arrays:
- `fpr` (false positive rate): An array of shape (num_thresholds,) containing the false positive rates corresponding to different classification thresholds.
- `tpr` (true positive rate or sensitivity): An array of shape (num_thresholds,) containing the true positive rates corresponding to different classification thresholds.
- `thresholds`: An array of shape (num_thresholds,) containing the decreasing classification thresholds used to compute the FPR and TPR; it has the same length as `fpr` and `tpr`.

These values can be used to plot the ROC curve and evaluate the performance of the classification model based on its ability to discriminate between the positive and negative classes.

In [None]:
# Hard class predictions (0 or 1) for every sample in the test set
y_pred = svc.predict(X_test)

# Class probabilities for every sample in the test set;
# only the second column (index 1, the positive class) is kept
y_pred_probs = svc.predict_proba(X_test)[:, 1]

In [None]:
# ROC curve points: false positive rate and true positive rate at each
# decision threshold, computed from the true labels and the positive-class scores
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred_probs)

In [None]:
# Drawing the ROC curve as a line: false positive rate on the x-axis,
# true positive rate on the y-axis
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines'))

# Title and axis labels; the legend is hidden because there is a single trace
fig.update_layout(
    title='Receiver Operating Characteristic (ROC)',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    showlegend=False,
)

# Rendering the figure
fig.show()



In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


# Class labels, in the order used for the rows and columns of the matrix
class_labels = [0, 1]

# Get the confusion matrix: rows are true labels, columns are predicted labels
# Passing 'labels' pins the row/column order and keeps the matrix 2x2
# even if one class never appears in the predictions
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Create a heatmap from the confusion matrix using Plotly
fig = go.Figure(data=go.Heatmap(z=cm, x=class_labels, y=class_labels))
fig.update_layout(
    title='Confusion Matrix',
    # 'category' axes show exactly one tick per class instead of a numeric scale
    xaxis=dict(title='Predicted label', type='category'),
    # Plotly draws y values bottom-to-top by default; reversing the axis puts
    # true label 0 on the top row, matching the usual confusion-matrix layout
    yaxis=dict(title='True label', type='category', autorange='reversed'),
)
fig.show()

## Exercise: Use Another Classifier