In [76]:
import numpy as np
import pandas as pd
import tensorflow.keras as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score

### Importing the data

Importing the Breast Cancer dataset and performing a little exploration to see what we are dealing with.

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [5]:
df.duplicated().sum()

0

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [7]:
df.describe()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,0.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,


Overall, the dataset is quite clean. There is, however, a mysterious feature called "Unnamed", which I will drop. We will also have to encode the target feature, as it is a categorical variable.

### Encoding the Target variable

Since the target variable 'Diagnosis' is of categorical nature, we have to represent the categorical values as numbers. Here, I have used 1 to represent 'M', and used 0 to represent 'B'.

In [19]:
def encode(x):
    if x == 'M':
        x = 1
    else:
        x = 0
    return x

df['diagnosis'] = df['diagnosis'].apply(encode)

In [21]:
df['diagnosis'].value_counts()

diagnosis
0    357
1    212
Name: count, dtype: int64

### Scaling the Dataset

If features have different range of values that they can have, then finding the optimal parameters can become inefficient as the model can take quite a while to converge especially if you are dealing with a huge dataset. To ensure that the model converges quickly, we scale down the values of the features into comparable ranges.

In [23]:
y = df['diagnosis']
X = df.drop(['id', 'diagnosis', 'Unnamed: 32'], axis=1)

In [26]:
X_scaled = StandardScaler().fit_transform(X)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

### Training the Neural Network

I am using a relatively simple Neural Network with 2 hidden layers and 1 output layer. The hidden layers use ReLU as their activation function, meanwhile the output layer uses sigmoid.

In [65]:
model = tf.models.Sequential()
model.add(tf.layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
model.add(tf.layers.Dense(16, activation='relu'))
model.add(tf.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None


In [66]:
model.fit(X_train, y_train, epochs=12, batch_size=1, verbose=1)

Epoch 1/12
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 946us/step - accuracy: 0.8863 - loss: 0.3372
Epoch 2/12
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 955us/step - accuracy: 0.9506 - loss: 0.1420
Epoch 3/12
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 852us/step - accuracy: 0.9905 - loss: 0.0439
Epoch 4/12
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 886us/step - accuracy: 0.9864 - loss: 0.0557
Epoch 5/12
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 873us/step - accuracy: 0.9890 - loss: 0.0383
Epoch 6/12
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 923us/step - accuracy: 0.9870 - loss: 0.0248
Epoch 7/12
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 872us/step - accuracy: 0.9894 - loss: 0.0389
Epoch 8/12
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 922us/step - accuracy: 0.9893 - loss: 0.0259
Epoch 9/12
[1m398/398[

<keras.src.callbacks.history.History at 0x184a3356b90>

### Evaluating the Model

I will be using some evaluation metrics such as precision, recall, f1-score and accuracy. I will also generate a classification report to see how well the model performs for each category.

In [67]:
pred = model.predict(X_test)
pred = [1 if y>=0.5 else 0 for y in pred]





[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


#### Classification Report

In [68]:
print("Classification Report")
print(classification_report(y_test, pred))

Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       108
           1       0.98      0.98      0.98        63

    accuracy                           0.99       171
   macro avg       0.99      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171



#### Confusion Matrix

In [69]:
print("Confusion Matrix")
print(confusion_matrix(y_test, pred))

Confusion Matrix
[[107   1]
 [  1  62]]


#### Evaluation Metrics

In [70]:
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Accuracy: {accuracy}')

Precision: 0.9841269841269841
Recall: 0.9841269841269841
F1 Score: 0.9841269841269841
Accuracy: 0.9883040935672515


The model performed really well, for both the categories and posted impressive scores.

### Saving the Model

In [74]:
model.save('model.keras')