In [56]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Import dataset



In [57]:
# Load the breast-cancer diagnostic dataset.
# Most loaded column names carry a leading space (they show up as
# ' textureMean', ' perimeterMean', ... in the info() listing), so strip the
# names once at load time; otherwise df['textureMean'] raises a KeyError.
df = pd.read_csv('./data/breast-cancer-diagnostic.shuf.lrn.csv').rename(columns=str.strip)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       285 non-null    int64  
 1   class                    285 non-null    bool   
 2   radiusMean               285 non-null    float64
 3    textureMean             285 non-null    float64
 4    perimeterMean           285 non-null    float64
 5    areaMean                285 non-null    float64
 6    smoothnessMean          285 non-null    float64
 7    compactnessMean         285 non-null    float64
 8    concavityMean           285 non-null    float64
 9    concavePointsMean       285 non-null    float64
 10   symmetryMean            285 non-null    float64
 11   fractalDimensionMean    285 non-null    float64
 12   radiusStdErr            285 non-null    float64
 13   textureStdErr           285 non-null    float64
 14   perimeterStdErr         2

In [58]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,ID,class,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
0,886452,True,13.96,17.05,91.43,602.4,0.1096,0.1279,0.09789,0.05246,...,16.39,22.07,108.1,826.0,0.1512,0.3262,0.3209,0.1374,0.3068,0.07957
1,84348301,True,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
2,9012795,True,21.37,15.1,141.3,1386.0,0.1001,0.1515,0.1932,0.1255,...,22.69,21.84,152.1,1535.0,0.1192,0.284,0.4024,0.1966,0.273,0.08666
3,894326,True,18.22,18.87,118.7,1027.0,0.09746,0.1117,0.113,0.0795,...,21.84,25.0,140.9,1485.0,0.1434,0.2763,0.3853,0.1776,0.2812,0.08198
4,867387,False,15.71,13.93,102.0,761.7,0.09462,0.09462,0.07135,0.05933,...,17.5,19.25,114.3,922.8,0.1223,0.1949,0.1709,0.1374,0.2723,0.07071


# Check for missing values

In [59]:
# Count missing entries per column (isna is the name isnull aliases).
df.isna().sum()

ID                         0
class                      0
radiusMean                 0
 textureMean               0
 perimeterMean             0
 areaMean                  0
 smoothnessMean            0
 compactnessMean           0
 concavityMean             0
 concavePointsMean         0
 symmetryMean              0
 fractalDimensionMean      0
 radiusStdErr              0
 textureStdErr             0
 perimeterStdErr           0
 areaStdErr                0
 smoothnessStdErr          0
 compactnessStdErr         0
 concavityStdErr           0
 concavePointsStdErr       0
 symmetryStdErr            0
 fractalDimensionStdErr    0
 radiusWorst               0
 textureWorst              0
 perimeterWorst            0
 areaWorst                 0
 smoothnessWorst           0
 compactnessWorst          0
 concavityWorst            0
 concavePointsWorst        0
 symmetryWorst             0
 fractalDimensionWorst     0
dtype: int64

In [60]:
# Collapse the per-column counts into a single flag: True if any column has
# at least one missing entry.
print(f'Missing values: {df.isna().sum().any()}')

Missing values: False


# Identify highly correlated attributes


In [61]:
# Pairwise Pearson correlation (the pandas default) between all columns of df.
correlation_matrix = df.corr(method='pearson')
correlation_matrix

Unnamed: 0,ID,class,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
ID,1.0,0.040956,0.072465,0.094344,0.066219,0.068592,-0.026571,-0.029394,-0.015583,0.009041,...,0.075656,0.029582,0.071537,0.0717,-0.043785,-0.037185,-0.039384,-0.00269,0.014748,-0.085256
class,0.040956,1.0,0.717735,0.358188,0.733731,0.712221,0.375871,0.605207,0.740842,0.798805,...,0.773474,0.440994,0.782988,0.747866,0.479113,0.61976,0.714599,0.831813,0.504957,0.385391
radiusMean,0.072465,0.717735,1.0,0.271838,0.997591,0.990831,0.105925,0.477841,0.681792,0.815603,...,0.974157,0.277654,0.967159,0.950971,0.114376,0.405025,0.550551,0.754991,0.232632,0.019499
textureMean,0.094344,0.358188,0.271838,1.0,0.278902,0.271158,-0.075973,0.206442,0.278867,0.23982,...,0.287336,0.907687,0.296806,0.276099,0.067624,0.267215,0.306646,0.265895,0.150946,0.128113
perimeterMean,0.066219,0.733731,0.997591,0.278902,1.0,0.99005,0.146043,0.532711,0.724682,0.847265,...,0.975218,0.285486,0.974047,0.952925,0.147413,0.449759,0.589009,0.783295,0.262964,0.065629
areaMean,0.068592,0.712221,0.990831,0.271158,0.99005,1.0,0.120772,0.485747,0.695692,0.826119,...,0.972201,0.268507,0.966683,0.967764,0.128921,0.398743,0.546514,0.74919,0.229773,0.031199
smoothnessMean,-0.026571,0.375871,0.105925,-0.075973,0.146043,0.120772,1.0,0.658907,0.502683,0.514487,...,0.176125,0.00976,0.204702,0.181194,0.836374,0.502851,0.448547,0.496176,0.474456,0.555582
compactnessMean,-0.029394,0.605207,0.477841,0.206442,0.532711,0.485747,0.658907,1.0,0.897969,0.835257,...,0.523907,0.242741,0.579419,0.515523,0.590983,0.865807,0.816981,0.802274,0.587927,0.706006
concavityMean,-0.015583,0.740842,0.681792,0.278867,0.724682,0.695692,0.502683,0.897969,1.0,0.939265,...,0.712615,0.31846,0.756668,0.708727,0.48594,0.792357,0.896479,0.884722,0.505348,0.55049
concavePointsMean,0.009041,0.798805,0.815603,0.23982,0.847265,0.826119,0.514487,0.835257,0.939265,1.0,...,0.842378,0.278622,0.869964,0.835321,0.472494,0.692407,0.78764,0.922834,0.469854,0.415536


In [62]:
# Absolute correlation above which two attributes are treated as highly correlated.
correlation_drop_threshold = 0.9

# Positional (row, column) indices into correlation_matrix where the absolute
# correlation exceeds the threshold. The result still contains the diagonal
# (every attribute with itself) and each pair twice, as (i, j) and (j, i).
highly_correlated_attrs = np.where(np.abs(correlation_matrix) > correlation_drop_threshold)
highly_correlated_attrs

(array([ 0,  1,  2,  2,  2,  2,  2,  2,  3,  3,  4,  4,  4,  4,  4,  4,  5,
         5,  5,  5,  5,  5,  6,  7,  8,  8,  9,  9,  9, 10, 11, 12, 12, 12,
        13, 14, 14, 14, 15, 15, 15, 16, 17, 18, 19, 20, 21, 22, 22, 22, 22,
        22, 22, 23, 23, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 26,
        27, 27, 28, 28, 29, 29, 30, 31]),
 array([ 0,  1,  2,  4,  5, 22, 24, 25,  3, 23,  2,  4,  5, 22, 24, 25,  2,
         4,  5, 22, 24, 25,  6,  7,  8,  9,  8,  9, 29, 10, 11, 12, 14, 15,
        13, 12, 14, 15, 12, 14, 15, 16, 17, 18, 19, 20, 21,  2,  4,  5, 22,
        24, 25,  3, 23,  2,  4,  5, 22, 24, 25,  2,  4,  5, 22, 24, 25, 26,
        27, 28, 27, 28,  9, 29, 30, 31]))

# Training-Test dataset split

We assign 80% of the samples to the training set and the remaining 20% to the test set, stratified by `class`.

In [63]:
# Fixed seed so the split is reproducible from run to run.
random_seed = 42

# Hold out 20% of the rows for testing; stratify on the label column so both
# partitions are split with respect to 'class'.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['class'],
    random_state=random_seed,
)

# Drop ID and target attribute

In [64]:
# Features: every column except the record identifier and the label.
x_train = train_df.drop(['ID', 'class'], axis='columns')
x_test = test_df.drop(['ID', 'class'], axis='columns')

# Targets: the 'class' column of each partition.
y_train = train_df['class']
y_test = test_df['class']

# Run MLPClassifier


In [66]:
# TODO: scaling
# TODO: removing highly correlated values
# TODO: attempt different parameters for MLPClassifier

# Baseline multi-layer perceptron: max_iter=1000 and a fixed seed, every other
# hyperparameter left at the library default.
mlp_seed = 42
mlp = MLPClassifier(max_iter=1000, random_state=mlp_seed)
mlp.fit(x_train, y_train)

y_pred = mlp.predict(x_test)

# classification_report returns a plain string; print it so the table renders
# with real line breaks instead of an escaped single-line repr.
print(classification_report(y_test, y_pred))

'              precision    recall  f1-score   support\n\n       False       0.95      0.92      0.93        38\n        True       0.85      0.89      0.87        19\n\n    accuracy                           0.91        57\n   macro avg       0.90      0.91      0.90        57\nweighted avg       0.91      0.91      0.91        57\n'