In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

##**Q1 :Loading and Preprocessing:**

> ● Load the breast cancer dataset from sklearn.

> ● Preprocess the data to handle any missing values and perform necessary feature scaling.

> ● Explain the preprocessing steps you performed and justify why they are
necessary for this dataset.

###**Step 1: Loading Dataset**

In [2]:
breastcancer = load_breast_cancer()
'''
Since the breast cancer dataset from sklearn is in disctionary format,
we need to convert it into Pandas DataFrame format to make it easier
for viewing and perform preprocessing techniques.
'''
df=pd.DataFrame(data=breastcancer.data,columns=breastcancer.feature_names)
df['target']=breastcancer.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


###**Step 2 : Overall Information about the dataset**

In [3]:
'''
Display the overall informations about the dataset such as shape,columns etc and statistical measurements using describe() method.
This will help us to identify whether the dataset loaded correctly and an overall idea about the columns.The method describe()
provides a quick numerical summary of the dataset.
'''

print("Shape of dataset : ",df.shape)
print("\nColumns of dataset : ",df.columns)
df.describe()

Shape of dataset :  (569, 31)

Columns of dataset :  Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


###**Step 3 : Handling Missing Values, Duplicates and Outliers**

In [4]:
'''
Check for missing values
'''

df.isna().sum()

Unnamed: 0,0
mean radius,0
mean texture,0
mean perimeter,0
mean area,0
mean smoothness,0
mean compactness,0
mean concavity,0
mean concave points,0
mean symmetry,0
mean fractal dimension,0


In [5]:
#Check for duplicate

duplicate_rows=df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

Number of duplicate rows: 0


In [6]:
# Finding outliers and handle it
num_cols=df.select_dtypes(include=['int64','float64']).columns

# find the Q1, Q2, Q3 and IQR for numerical columns
Q1=df[num_cols].quantile(0.25)
Q2=df[num_cols].quantile(0.50)
Q3=df[num_cols].quantile(0.75)
IQR=Q3-Q1

outlier_counts = {}
for col in num_cols:
    lower_bound = Q1[col] - 1.5 * IQR[col]
    upper_bound = Q3[col] + 1.5 * IQR[col]
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    outlier_counts[col] = len(outliers)



# Convert dictionary to DataFrame
outlier_summary = pd.DataFrame(list(outlier_counts.items()), columns=['Feature', 'Outlier Count'])


# Display table
outlier_summary

Unnamed: 0,Feature,Outlier Count
0,mean radius,14
1,mean texture,7
2,mean perimeter,13
3,mean area,25
4,mean smoothness,6
5,mean compactness,16
6,mean concavity,18
7,mean concave points,10
8,mean symmetry,15
9,mean fractal dimension,15


In [7]:
outlier_aftercapping_counts = {}
for col in num_cols:
    lower_limit = Q1[col] - 1.5 * IQR[col]
    upper_limit = Q3[col] + 1.5 * IQR[col]
    df[col] = df[col].clip(lower_limit, upper_limit)
    outlier_aftercapping_counts[col] = len(outliers)

# Convert dictionary to DataFrame
outlier_aftercapping_summary = pd.DataFrame(list(outlier_aftercapping_counts.items()), columns=['Feature', 'Outlier Count'])


# Display table
outlier_aftercapping_summary

Unnamed: 0,Feature,Outlier Count
0,mean radius,0
1,mean texture,0
2,mean perimeter,0
3,mean area,0
4,mean smoothness,0
5,mean compactness,0
6,mean concavity,0
7,mean concave points,0
8,mean symmetry,0
9,mean fractal dimension,0


###**Step 4 : Encoding and Scaling**

In [8]:
#checking for categorical value to check whether encoding is needed. There is no need for encoding as there is no categorical values
cat_cols=df.select_dtypes(include=['object']).columns
print(cat_cols)

Index([], dtype='object')


In [9]:
#Performing feature scaling for numerical values. Here we use standard scalar because the dataset have only few outliers.

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[num_cols])
scaled_df=pd.DataFrame(scaled_features,columns=num_cols)
scaled_df['target']=df['target']
print("\nScaled DataFrame:\n")
scaled_df


Scaled DataFrame:



Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,1.176800,-2.121200,1.357375,1.184085,1.618861,2.541404,2.647422,2.620973,2.348535,2.511708,...,-1.375159,2.439568,2.287627,1.344848,2.641905,2.246192,2.296076,2.443918,2.225247,0
1,1.949929,-0.354875,1.795991,2.249396,-0.842995,-0.498189,-0.000497,0.574944,0.017882,-0.925449,...,-0.370048,1.631542,2.287627,-0.377098,-0.443388,-0.137634,1.087084,-0.234408,0.355314,0
2,1.686226,0.476899,1.670052,1.846217,0.975239,1.148680,1.496076,2.110330,1.004666,-0.407692,...,-0.019582,1.434234,1.807751,0.546654,1.223448,0.920718,1.955000,1.369057,0.265197,0
3,-0.791983,0.268955,-0.606410,-0.831485,2.737521,2.541404,2.091997,1.506601,2.521318,2.517947,...,0.140773,-0.245395,-0.593838,2.595949,2.641905,2.119474,2.175786,2.443918,2.482456,0
4,1.866023,-1.174698,1.891531,2.154338,0.295047,0.599453,1.504202,1.482665,0.006363,-0.588595,...,-1.484267,1.424838,1.525780,0.232758,-0.314469,0.665254,0.729259,-0.951602,-0.410683,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,2.246595,0.749377,2.191180,2.250380,1.077633,0.256947,2.125856,2.402886,-0.312326,-0.994068,...,0.124241,1.860170,2.287627,0.394190,-0.270329,0.719489,1.629151,-1.516438,-0.763016,0
565,1.818077,2.150007,1.722165,2.036334,0.112199,0.003614,0.772845,1.312717,-0.212495,-1.134424,...,2.083216,1.512531,1.854045,-0.699963,-0.404152,0.267358,0.733827,-0.565135,-1.062155,0
566,0.760269,2.109375,0.727678,0.715676,-0.856891,-0.018680,0.075483,0.118824,-0.834515,-0.955080,...,1.400468,0.626208,0.576760,-0.821037,0.417005,0.362650,0.414069,-1.222872,-0.321817,0
567,1.958919,2.408144,2.108668,2.049446,1.574978,2.541404,2.647422,2.751293,2.264063,1.178328,...,2.276634,2.439568,2.043428,1.470406,2.641905,2.619251,2.289985,2.249756,2.482456,0


For Loading and Preprocessing the following steps are performed,

**Step 1:** Loaded the load_breast_cancer from sklearn library which is an inbuilt dataset provided by sklearn for Machine Learning practice. The breast cancer dataset is a classic and very easy binary classification dataset. Since the loaded dataset is in dictionary format, we need to convert them into Pandas DataFrame format to make it easier for viewing and perform preprocessing techniques. Then add the target column 'MedHouseVal' to the DataFrame to make the features and target together.

**Step 2 :** Display the overall informations about the dataset such as shape,columns etc and statistical measurements using describe() method. This will help us to identify whether the dataset loaded correctly and an overall idea about the columns.The method describe() provides a quick numerical summary of the dataset.

**Step 3 :** Checking for the duplicate value using isna() method. If there is larger amount of missing data , it can cause dataset bias which lead to low performance. Here there is no missing datas found.Duplicates also not found. After missing data handling, next step will be the handling of outliers. Here I used the Inter Quartile Range(IQR) method to check and manage outliers. Outliers will affect the model perfoemance and lead to bias. Capping keeps the data within reasonable limits without losing valuable samples.

**Step 4 :** Here Step 1 - 3 performing the data cleaning peocess in preprocessing. Next we need to perform data tranformation to convert the data into a suitable format for machine learning algorithms. In transformation categorical values and numerical values are treated separately. Encoding is used for categorical data.It convert categorical data to numerical values. So here first check if categorical data is available, here it is zero. So no need to encode Scaling is applied on numerical features inorder to make them in a unique format.Scaling brings all features to a similar scale, improving model stability. Here I performed standardscalar for scaling.

##**Q2 : Classification Algorithm Implementation:**

Implement the following five classification algorithms:

> 1. Logistic Regression

> 2. Decision Tree Classifier

> 3. Random Forest Classifier

> 4. Support Vector Machine (SVM)

> 5. k-Nearest Neighbors (k-NN)

For each algorithm, provide a brief description of how it works and why it might be suitable for this dataset.

In [10]:
# Define x and y from the scaled dataset
x=scaled_df.drop('target',axis=1)
y=scaled_df['target']

# Split the data
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

# Logistic Regression
logistic_regression=LogisticRegression()
logistic_regression.fit(x_train,y_train)
y_pred_logistic=logistic_regression.predict(x_test)


####**1. Logistic Regression**

**Logistic Regression is a statistical method used for predicting categorical outcomes.It models the probability of the target class using the sigmoid function, predicting binary outcomes efficiently. The Breast Cancer dataset is a binary classification problem where the goal is to classify tumors as malignant (0) or benign (1). Logistic Regression is well-suited for such binary outcomes.**

In [11]:
# DecisionTree Classifier
decision_tree=DecisionTreeClassifier()
decision_tree.fit(x_train,y_train)
y_pred_decision=decision_tree.predict(x_test)

####**2. Decision Tree**

**A Decision Tree is a supervised learning algorithm used for both classification and regression tasks. It splits the data based on feature values to form a tree-like structure of decisions. Decision Trees handle non-linear relationships and are easy to interpret. They are suitable for the Breast Cancer dataset because they can capture complex, non-linear relationships between features and the target variable.**

In [12]:
# Random Forest Classifier
random_forest=RandomForestClassifier()
random_forest.fit(x_train,y_train)
y_pred_random=random_forest.predict(x_test)

####**3. Random Forest Classifier**

**Random Forest is an ensemble of decision trees that improves performance and reduces overfitting by averaging predictions.Random Forest Classifier is well-suited for the Breast Cancer dataset because it reduces overfitting, handles complex relationships between features, and provides high accuracy in predicting whether a tumor is malignant or benign.**

In [13]:
# Support Vector Machine
svm=SVC()
svm.fit(x_train,y_train)
y_pred_svm=svm.predict(x_test)

####**4. Support Vector Machine**

**SVM finds the optimal hyperplane that best separates classes in high-dimensional space. It performs well on complex but small to medium-sized datasets.Support Vector Machine (SVM) is well-suited for the Breast Cancer dataset because it effectively separates the two classes—malignant and benign—by finding the optimal boundary between them. Its ability to handle high-dimensional data and maintain strong performance even with many features makes it ideal for medical classification tasks like this.**

In [14]:
# k-Nearest Neighbors (k-NN)
knn=KNeighborsClassifier()
knn.fit(x_train,y_train)
y_pred_knn=knn.predict(x_test)

####**5. k-Nearest Neighbors (k-NN)**

**k-NN classifies a sample based on the majority label among its nearest neighbors. It's simple and effective but sensitive to scaling.k-Nearest Neighbors (k-NN) is suitable for the Breast Cancer dataset because it classifies tumors based on similarity, meaning samples with similar feature values are likely to have the same diagnosis. Its simplicity and ability to capture local patterns make it effective for distinguishing between malignant and benign cases.**

##**Q3 : Model Comparison :**


> ● Compare the performance of the five classification algorithms.

> ● Which algorithm performed the best and which one performed the worst?








In [15]:
results = {
    "Logistic Regression": accuracy_score(y_test, y_pred_logistic),
    "Decision Tree": accuracy_score(y_test, y_pred_decision),
    "Random Forest": accuracy_score(y_test, y_pred_random),
    "SVM": accuracy_score(y_test, y_pred_svm),
    "k-NN": accuracy_score(y_test, y_pred_knn)
}

results_df = pd.DataFrame(list(results.items()), columns=["Model", "Accuracy"])
results_df

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.982456
1,Decision Tree,0.929825
2,Random Forest,0.964912
3,SVM,0.973684
4,k-NN,0.95614


**Among all the models tested, the Logistic Regression achieved the highest accuracy, making it the best-performing model for this dataset. Its ability to find the optimal boundary between malignant and benign cases helped it generalize well to unseen data. On the other hand, the Decision Tree performed the worst, showing slightly lower accuracy due to its tendency to overfit the training data. While it captured detailed patterns, it did not generalize as effectively as the other models.**