<a href="https://colab.research.google.com/github/yagnikta/HandsOnML/blob/main/LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np #imported numpy
from sklearn.datasets import load_breast_cancer #imported dataset
from sklearn.model_selection import train_test_split #imported train test split class
from sklearn.preprocessing import StandardScaler #imported standarscalar class for standardization of data
from sklearn.pipeline import make_pipeline #imported make_pipeline class for making a pipeline of process
from sklearn.linear_model import LogisticRegression #imported base model for prediction
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix #imported various matrics classes for measuring the performance of the trained model.
import matplotlib.pyplot as plt #imported matplotlib for ploting the results
import seaborn as sns #imported advanced library for plotting

In [None]:
# 1. Load the dataset
# This is a real-world dataset for breast cancer classification.
# The features are various medical measurements of a tumor, and the target is
# its diagnosis.
# NOTE: scikit-learn encodes the target as 0 = malignant and 1 = benign (the
# order of breast_cancer_data.target_names). Label 1 -- the class scikit-learn's
# binary metrics treat as "positive" by default -- is therefore the BENIGN class.
breast_cancer_data = load_breast_cancer() # Bunch object holding data, target, target_names, ...
X = breast_cancer_data.data # feature matrix (one row per sample)
y = breast_cancer_data.target # class labels (0 = malignant, 1 = benign)

In [None]:
# 2. Split the data into training and testing sets
# We use a 70/30 split: the model is trained on 70% of the samples and
# evaluated on the remaining, unseen 30%.
# random_state fixes the shuffle so the split (and therefore the reported
# metrics) is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
) # test_size=0.3 -> 70:30 ratio for training:testing

In [None]:
# 3. Build the preprocessing + model pipeline
# Logistic regression works best when its inputs are on a comparable scale, so
# the features are standardized first: StandardScaler rescales each feature to
# zero mean and unit standard deviation.
# Chaining both steps in a single pipeline means the scaler is fitted as part
# of pipeline.fit() and the same fitted scaling is then re-applied whenever the
# pipeline predicts, so training and testing data are transformed consistently.
pipeline = make_pipeline(
    StandardScaler(),      # step 1: standardize the features
    LogisticRegression(),  # step 2: the classifier, with its default settings
)

In [None]:
# 4. Train the model
# Fitting the pipeline fits each of its steps in order on the training split:
# first the scaler, then the logistic regression on the scaled features.
print("Training the Logistic Regression model...")
pipeline.fit(X=X_train, y=y_train) # only the training data is used here; the test split stays unseen

In [None]:
# 5. Make predictions on the test set
# .predict_proba() returns one probability column per class; column index 1 is
# the probability of class label 1, i.e. the "positive" class in scikit-learn's
# default binary-metric convention.
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
# .predict() returns the hard class label the trained model assigns to each
# new, unseen test sample.
y_pred = pipeline.predict(X_test)

In [None]:
# 6. Evaluate the model's performance
# No single number tells the whole story, so several classification metrics
# are computed from the true test labels and the predicted labels.
accuracy = accuracy_score(y_test, y_pred)  # share of test samples labelled correctly
# pos_label=1 is scikit-learn's default; it is spelled out so it is clear that
# precision, recall and F1 are all measured with class label 1 as "positive".
precision = precision_score(y_test, y_pred, pos_label=1)  # of predicted positives, how many are right
recall = recall_score(y_test, y_pred, pos_label=1)  # of actual positives, how many were found
f1 = f1_score(y_test, y_pred, pos_label=1)  # harmonic mean of precision and recall
conf_matrix = confusion_matrix(y_test, y_pred)  # rows: true labels, columns: predicted labels

print("\n--- Model Evaluation ---")
# One print call with a newline separator emits the four metrics, one per line.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:")
print(conf_matrix)

In [None]:
# 7. Visualize the Confusion Matrix
# The heatmap shows where the model is right and where it makes errors.
# Rows are the actual classes and columns are the predicted classes; each cell
# counts the test samples that fall into that (actual, predicted) pair.
# Rows/columns are ordered by class label, and the tick labels come from
# target_names in that same order. With label 1 as the positive class,
# A[0,0] = True Negatives and A[1,1] = True Positives; the off-diagonal cells
# are the misclassifications.
plt.figure(figsize=(8, 6))  # new 8x6 inch figure for the heatmap
sns.heatmap(
    conf_matrix,
    annot=True,   # write the count inside every cell
    fmt='d',      # format the counts as integers
    cmap='Blues',
    cbar=False,   # the annotated counts make a colour bar redundant
    xticklabels=breast_cancer_data.target_names,
    yticklabels=breast_cancer_data.target_names,
)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()