# Breast Cancer Classification

Classify whether the cancer is benign or malignant

Dataset source : <https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data>

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler


Import Dataset

In [None]:
# Read the breast-cancer CSV from the local dataset folder into `df`,
# then preview its first rows as the cell output.
df = pd.read_csv("dataset/wdbc.csv")
df.head()

In [None]:
# Summarize the frame (column names, dtypes, non-null counts) to spot non-numeric columns.
df.info()

All columns are numeric except `diagnosis`, so the next step is to encode that column as numbers.

Encode the diagnosis column using Label Encoder

In [None]:
# Learn the diagnosis classes first, then overwrite the column with their integer codes.
encoder = LabelEncoder()
encoder.fit(df["diagnosis"])

df["diagnosis"] = encoder.transform(df["diagnosis"])
df.head(10)

Drop the `id` column because it doesn't provide any useful information

In [None]:
# Features: every column except the row identifier and the target.
X = df.drop(columns=["id", "diagnosis"])
# Target: kept as a one-column DataFrame (an independent copy of the encoded diagnosis).
y = df[["diagnosis"]].copy()

Split the data into training and testing sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Scale the data using Standard Scaler

In [None]:
# Unfitted standardizer; it is fitted on the training features in the cells below.
scaler = StandardScaler()

In [None]:
# Two principal components are enough for a 2-D scatter plot of the training data.
pca = PCA(n_components=2)

# Standardize the training features, then project them onto the two components.
train_scaled = scaler.fit_transform(X_train)
train_components = pca.fit_transform(train_scaled)

# Wrap the projected array in a labelled frame for plotting.
X_train_pca = pd.DataFrame(train_components, columns=["PC1", "PC2"])

In [None]:
# Attach each training label to its PCA coordinates *by position*.
#
# X_train_pca is created without an explicit index, so it carries a default
# 0..n-1 index, whereas y_train still carries the shuffled row labels it had in
# the full data frame. An index-based inner merge would therefore drop every
# training row whose original label is >= n and pair the remaining labels with
# unrelated PCA points. Both frames are in the same row order (the order of the
# training split), so resetting the indexes and concatenating column-wise gives
# the correct one-to-one pairing.
y_train_pca = pd.concat(
    [y_train.reset_index(drop=True), X_train_pca.reset_index(drop=True)],
    axis=1,
)

# Sanity check: no training sample may be lost or duplicated by the join.
assert len(y_train_pca) == len(y_train) == len(X_train_pca)

We can now visualize the training data after PCA has reduced it to two dimensions

In [None]:
# Colour used for each class in the scatter plot.
mydict = {"benign": "red", "malignant": "blue"}

fig, ax = plt.subplots(figsize=(15, 6))
for class_code, class_points in y_train_pca.groupby("diagnosis"):
    # Encoded class 0 is labelled benign; any other code is labelled malignant.
    class_name = "benign" if class_code == 0 else "malignant"
    class_points.plot.scatter(
        x="PC1",
        y="PC2",
        s=50,
        label=class_name,
        c=mydict[class_name],
        ax=ax,
    )
plt.legend()
plt.show()

In [None]:
# Gaussian Naive Bayes estimator; used as the final step of the pipeline below.
naive_bayes = GaussianNB()

In [None]:
# Chain the preprocessing and the classifier so that they are always fitted and
# applied together: standardize -> PCA -> Gaussian Naive Bayes.
pipeline_steps = [
    ("scaler", scaler),
    ("pca", pca),
    ("clf", naive_bayes),
]
clf = Pipeline(steps=pipeline_steps)

# Flatten the single-column label frame into a 1-D vector before fitting.
clf.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Score of the fitted pipeline on the training split, printed to three decimals.
train_accuracy = clf.score(X_train, y_train)
print(format(train_accuracy, ".3f"))

In [None]:
# Score of the fitted pipeline on the held-out test split, printed to three decimals.
test_accuracy = clf.score(X_test, y_test)
print(format(test_accuracy, ".3f"))