In [1]:
                                   #Formative Assessment: Supervised Learning
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [9]:
                                              #Loading and Preprocessing

# Fetch the scikit-learn breast cancer Bunch and flatten it into one DataFrame:
# one column per feature, plus the class label appended as the last column
# ('target').
data = load_breast_cancer()
df = pd.DataFrame(data=data.data, columns=data.feature_names).assign(target=data.target)

# Keep the first and last five rows around for inspection.
head, tail = df.head(), df.tail()

# Print each preview under its own caption.
for caption, preview in (
    ("Head of the dataset:", head),
    ("\nTail of the dataset:", tail),
):
    print(caption)
    print(preview)


Head of the dataset:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst a

In [10]:

# Print a structural summary of the frame (index range, column names,
# non-null counts and dtypes) to confirm the load worked as expected.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [11]:
# Count null entries per column; any non-zero count would need imputation or
# removal before scaling and modelling.
missing_values = df.isnull().sum()

# Print the caption and the Series separately. Passing both to a single
# print() call puts the default separator (a space) right after the "\n",
# which shifts the first row of the Series one column to the right.
print("Missing values in each column:")
print(missing_values)


Missing values in each column:
 mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [None]:
Handling Missing Values:

Step taken: Checking for missing values.
Reason: Missing values can negatively impact the performance of machine learning models.
However, the breast cancer dataset from sklearn does not contain any missing values
(every column reports 0 nulls above), so no further action is needed for this step.
    
Feature Scaling:

Step taken: Standardizing the features using StandardScaler.
Reason: Many machine learning algorithms work best when each feature is centered around zero and has unit variance.
If features are on different scales, algorithms like SVM and k-NN may perform poorly because they rely on distance metrics
that are sensitive to the scale of the features. Standardizing the features ensures that each feature contributes comparably to
the model.

In [12]:
# Separate the predictors from the label, then standardize every predictor to
# zero mean and unit variance.
# Note: the mean/variance here are estimated from all rows of `df`, so this
# output is suited to inspection; anything evaluated on a held-out split should
# use a scaler fitted on the training rows only.
features = df.drop('target', axis=1)
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)


In [13]:
# Wrap the scaled array back into a labelled DataFrame (same feature names as
# the unscaled frame) and re-attach the untouched class label as 'target'.
scaled_df = pd.DataFrame(
    data=scaled_features,
    columns=features.columns,
).assign(target=df['target'])


In [14]:
# Show the first five standardized rows as a quick sanity check of the scaling.
scaled_preview = scaled_df.head()
print(scaled_preview)

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0     1.097064     -2.073335        1.269934   0.984375         1.568466   
1     1.829821     -0.353632        1.685955   1.908708        -0.826962   
2     1.579888      0.456187        1.566503   1.558884         0.942210   
3    -0.768909      0.253732       -0.592687  -0.764464         3.283553   
4     1.750297     -1.151816        1.776573   1.826229         0.280372   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0          3.283515        2.652874             2.532475       2.217515   
1         -0.487072       -0.023846             0.548144       0.001392   
2          1.052926        1.363478             2.037231       0.939685   
3          3.402909        1.915897             1.451707       2.867383   
4          0.539340        1.371011             1.428493      -0.009560   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [15]:
                 # Classification Algorithm Implementation

#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the *unscaled* features first and only then standardize.
# Fitting StandardScaler on all rows before splitting lets the test rows
# influence the mean/variance applied to the training data (data leakage),
# which makes the held-out accuracy optimistic.
X = df.drop(columns='target')   # raw (unscaled) predictors
y = df['target']                # class label
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both splits. The seed and row count are unchanged, so
# the train/test membership is the same as before; only the scaling differs.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,
)
# NOTE: re-run the model cells below after this change; accuracies recorded
# with full-data scaling may differ slightly.


In [17]:
# Train logistic regression with a generous iteration cap so the solver is not
# cut off before converging.
log_reg = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Evaluate on the held-out split: fraction of test labels predicted correctly.
y_pred_log_reg = log_reg.predict(X_test)
accuracy_log_reg = accuracy_score(y_true=y_test, y_pred=y_pred_log_reg)
print(f'Logistic Regression Accuracy: {accuracy_log_reg}')


Logistic Regression Accuracy: 0.9736842105263158


In [None]:
Logistic Regression is a linear model used for binary classification problems. 
It predicts the probability of the target variable belonging to a particular class. 
The model uses the logistic function to map predicted values to probabilities between 0 and 1.
Logistic Regression is suitable for the breast cancer dataset as it is a binary classification problem (malignant or benign).
It is simple, interpretable, and performs well when the classes are approximately linearly separable, i.e., when the log-odds of the target are close to a linear function of the features.

In [19]:
                           # Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# Train a decision tree with default hyperparameters; the fixed seed keeps the
# fitted tree reproducible between runs.
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out split: fraction of test labels predicted correctly.
y_pred_tree = decision_tree.predict(X_test)
accuracy_tree = accuracy_score(y_true=y_test, y_pred=y_pred_tree)
print(f'Decision Tree Accuracy: {accuracy_tree}')




Decision Tree Accuracy: 0.9473684210526315


In [None]:
Decision Trees are non-linear models that split the data into subsets based on the value of the features. 
Each node represents a decision based on a feature, and each branch represents the outcome of that decision.
Decision Trees are suitable for the breast cancer dataset as they can capture complex relationships between the features 
and the target. They are also easy to interpret and can, in principle, handle both numerical and categorical data (all features in this dataset are numerical).

In [20]:
                        #Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# Train a random forest with default hyperparameters; the fixed seed makes the
# randomized tree construction reproducible.
random_forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out split: fraction of test labels predicted correctly.
y_pred_forest = random_forest.predict(X_test)
accuracy_forest = accuracy_score(y_true=y_test, y_pred=y_pred_forest)
print(f'Random Forest Accuracy: {accuracy_forest}')


Random Forest Accuracy: 0.9649122807017544


In [None]:
Random Forest is an ensemble method that combines multiple decision trees to improve the model's performance. 
It reduces overfitting by averaging the results of several trees.
Random Forest is suitable for the breast cancer dataset because it improves the accuracy and robustness of the model by 
combining multiple decision trees, thus capturing more complex relationships in the data.

In [22]:
                              #kNN
from sklearn.neighbors import KNeighborsClassifier

# Train a k-nearest-neighbours classifier that votes among the 5 closest
# training samples.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Evaluate on the held-out split: fraction of test labels predicted correctly.
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print(f'k-NN Accuracy: {accuracy_knn}')

k-NN Accuracy: 0.9473684210526315


In [None]:
(This cell describes the Support Vector Machine model, which is trained in the code cell that follows.)
SVM is a powerful classification method that finds the hyperplane that best separates the data into different classes. 
It works well for both linear and non-linear data by using kernel functions.
SVM is suitable for the breast cancer dataset because it can handle high-dimensional spaces and complex decision 
boundaries, making it effective for datasets with many features like this one.

In [21]:
                      #Support Vector Machine (SVM)

from sklearn.svm import SVC

# Train a support vector classifier with a linear kernel. The seed is passed
# for reproducibility (NOTE(review): per the scikit-learn docs, SVC ignores
# random_state when probability estimates are off, as here; confirm it is
# still wanted).
svm = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Evaluate on the held-out split: fraction of test labels predicted correctly.
y_pred_svm = svm.predict(X_test)
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(f'SVM Accuracy: {accuracy_svm}')

SVM Accuracy: 0.956140350877193


In [None]:
(This cell describes the k-NN model trained in the earlier k-NN code cell, above the SVM section.)
k-NN is a simple, instance-based learning algorithm that classifies a sample based on the majority class of its k-nearest neighbors. 
It uses distance metrics (e.g., Euclidean) to find the nearest neighbors.
k-NN is suitable for the breast cancer dataset because it is a simple and intuitive method that can capture local 
structures in the data. However, because it relies on distances, it is sensitive to feature scale (addressed here by 
standardizing the features) and may degrade on high-dimensional data.

In [None]:
Logistic Regression: Achieved the highest accuracy of 0.9736842105263158.
Random Forest: The second-best performer with an accuracy of 0.9649122807017544.
SVM: Close behind with an accuracy of 0.956140350877193.
Decision Tree: Achieved an accuracy of 0.9473684210526315.
k-NN: Also achieved an accuracy of 0.9473684210526315.
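Best and Worst Performing Algorithms:
Best Performing Algorithm: Logistic Regression with an accuracy of 0.9736842105263158.
Worst Performing Algorithms: Decision Tree and k-NN, both with an accuracy of 0.9473684210526315.
Note: the test set holds 114 samples (20% of 569), so each prediction is worth about 0.0088 accuracy; the gap between the best and worst models is only three test samples, and the ranking should be read with caution.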