In [81]:
import numpy as np
from sklearn import svm, discriminant_analysis
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from itertools import combinations
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Ignoring warning
import warnings
warnings.filterwarnings("ignore")

In [82]:
# Loading the dataset
data = pd.read_csv("./breast-cancer.csv")
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [83]:
# dropping unnecessary columns
data.drop('id',axis=1,inplace=True)

In [84]:
data['diagnosis'] = (data['diagnosis'] == 'M').astype(int) #encode the label into 1/0

In [86]:
data.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [87]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    int64  
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [88]:
# Separating the features and class label
class_label = 'diagnosis'  # the label column is named 'diagnosis'
features = data.columns[1:]  # All columns except the first one (diagnosis)
i1 = 0  # Class 0
i2 = 1  # Class 1


In [89]:
# Extracting classes
C1 = data[data[class_label] == i1].loc[:, features].values
C2 = data[data[class_label] == i2].loc[:, features].values

In [90]:
G = C1  # Class G
F = C2  # Class F

In [91]:
print('************   seeds  ***************')
print('SAMPLE SIZE : ')
print(f'  # OBJECTS : {len(G) + len(F)}')
print(f'  # FEATURES: {G.shape[1]}')

print(f' BALANCE : {len(G) / len(F)}')


************   seeds  ***************
SAMPLE SIZE : 
  # OBJECTS : 569
  # FEATURES: 30
 BALANCE : 1.6839622641509433


In [92]:
# Random permutation of objects in data
PP = 1
if PP == 1:
    G = G[np.random.permutation(len(G))]
    F = F[np.random.permutation(len(F))]
    print('within random permutation of objects')
else:
    print('without random permutation of objects')

within random permutation of objects


In [93]:
# Normalization to mean=0 and standard deviation=1 with respect to class G
NN = 0
if NN == 1:
    print('with normalisation')
    scaler = StandardScaler().fit(G)
    G = scaler.transform(G)
    F = scaler.transform(F)
else:
    print('without normalisation')

without normalisation


In [94]:
# Function to calculate error rate
def ERRS(G, F, classifier):
    G_pred = classifier.predict(G)
    F_pred = classifier.predict(F)
    return (1 - accuracy_score([0] * len(G), G_pred)) * 100, (1 - accuracy_score([1] * len(F), F_pred)) * 100


In [95]:
# # Function to train the classifier
# def parameters(G, F, type='SVM'):
#     if type == 'SVM':
#         classifier = svm.SVC(kernel='linear')
#         X = np.vstack((G, F))
#         y = np.array([0] * len(G) + [1] * len(F))
#         classifier.fit(X, y)
#         return classifier

In [96]:
def parameters(train_G, train_F, model_type):
    if model_type == 'SVM':
        classifier = svm.SVC(kernel='linear')
    elif model_type == 'LDA':
        classifier = LDA()
    else:
        raise ValueError("Unsupported model type")

    X_train = np.vstack((train_G, train_F))
    y_train = np.hstack((np.zeros(train_G.shape[0]), np.ones(train_F.shape[0])))
    classifier.fit(X_train, y_train)
    
    return classifier

In [97]:
print('Refinement in d-1 dimensional space')
type = 'SVM'
print(f'Initial classifier: {type}')
classifier_ini = parameters(G, F, type)
initial_accuracy = ERRS(G, F, classifier_ini)[0]
print(f'Accuracy: {initial_accuracy}%')

Refinement in d-1 dimensional space
Initial classifier: SVM
Accuracy: 2.2408963585434205%


In [98]:
# Generating all possible combinations of k elements from features
k = 2
P = list(combinations(range(G.shape[1]), k))

In [99]:
type = 'SVM'
print(f'Refinement classifier: {type}')
RESULTS = []

Refinement classifier: SVM


In [100]:
for combo in P[5:15]:
    I1 = list(combo)
    I2 = [i for i in range(G.shape[1]) if i not in I1]

    # Dimensionality reduction
    REF_G = np.hstack((np.sum(G[:, I1] * classifier_ini.coef_[0, I1], axis=1).reshape(-1, 1), G[:, I2]))
    REF_F = np.hstack((np.sum(F[:, I1] * classifier_ini.coef_[0, I1], axis=1).reshape(-1, 1), F[:, I2]))

    classifier = parameters(REF_G, REF_F, type)
    err_G, err_F = ERRS(REF_G, REF_F, classifier)
    RESULTS.append((*combo, err_G, err_F))


In [101]:
# Converting RESULTS to DataFrame for better visualization
# Set option to display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

results_df = pd.DataFrame(RESULTS, columns=['Feature 1', 'Feature 2', 'Error G', 'Error F'])
print(results_df)

   Feature 1  Feature 2   Error G   Error F
0          0          6  1.960784  5.660377
1          0          7  1.680672  5.660377
2          0          8  1.960784  6.603774
3          0          9  2.240896  5.660377
4          0         10  2.240896  5.660377
5          0         11  1.960784  7.075472
6          0         12  1.960784  5.188679
7          0         13  1.680672  5.660377
8          0         14  1.960784  6.603774
9          0         15  2.240896  5.660377


## **Comparing the generalization abilities of initial classifier and refinement** 

In [102]:
# Holdout Method 50/50%
n = 2  # 50/50 split
i1 = round(len(G) / n)
i2 = round(len(F) / n)

test_G = G[:i1, :]
train_G = G[i1:, :]
test_F = F[:i2, :]
train_F = F[i2:, :]

In [103]:
# Initial classifier
type_initial = 'SVM'
print(f'Initial classifier: {type_initial}')
classifier = parameters(train_G, train_F, type_initial)
print(f'Error of TEST (initial): {ERRS(test_G, test_F, classifier)}%')

Initial classifier: SVM
Error of TEST (initial): (5.0561797752809, 5.660377358490565)%


In [104]:
# Fusion and Refinement
feature_dimension = G.shape[1]

P = np.array(list(combinations(range(feature_dimension), 2)))
FUSION = 4

In [105]:
print(f'Fusion of features: {P[FUSION, :]}')
I1 = P[FUSION, :]
I2 = np.setdiff1d(np.arange(G.shape[1]), I1)

Fusion of features: [0 5]


In [106]:
REF_G = np.hstack((np.sum(G[:, I1] * classifier.coef_[0, I1], axis=1).reshape(-1, 1), G[:, I2]))
REF_F = np.hstack((np.sum(F[:, I1] * classifier.coef_[0, I1], axis=1).reshape(-1, 1), F[:, I2]))


In [107]:
test_REF_G = REF_G[:i1, :]
train_REF_G = REF_G[i1:, :]
test_REF_F = REF_F[:i2, :]
train_REF_F = REF_F[i2:, :]

In [108]:
type_refinement = 'LDA'
print(f'Refinement classifier: {type_refinement}')
classifier = parameters(train_REF_G, train_REF_F, type_refinement)
print(f'Error of TEST (refinement): {ERRS(test_REF_G, test_REF_F, classifier)}%')


Refinement classifier: LDA
Error of TEST (refinement): (1.6853932584269704, 10.377358490566035)%


## **K-fold cross validation**

In [109]:
# k-fold Cross Validation
k = 5
print(f'k = {k}')

k = 5


In [110]:
kf_G = KFold(n_splits=k, shuffle=True, random_state=42)
kf_F = KFold(n_splits=k, shuffle=True, random_state=42)


In [111]:
# Initial classifier using k-fold
type_initial = 'SVM'
print(f'Initial classifier: {type_initial}')
results = []

Initial classifier: SVM


In [112]:
for (train_index_G, test_index_G), (train_index_F, test_index_F) in zip(kf_G.split(G), kf_F.split(F)):
    train_G, test_G = G[train_index_G], G[test_index_G]
    train_F, test_F = F[train_index_F], F[test_index_F]
    
    classifier = parameters(train_G, train_F, type_initial)
    results.append(ERRS(test_G, test_F, classifier))

In [113]:
print(f'ACCURACY: {np.mean(results)}%')
print(f'DEVIATION: {np.std(results)}%')


ACCURACY: 6.0206523377994055%
DEVIATION: 5.903274106334933%


In [114]:
# Refinement classifier using k-fold
type_refinement = 'LDA'
print(f'Refinement classifier: {type_refinement}')
results = []

Refinement classifier: LDA


In [115]:
for (train_index_G, test_index_G), (train_index_F, test_index_F) in zip(kf_G.split(G), kf_F.split(F)):
    train_G, test_G = G[train_index_G], G[test_index_G]
    train_F, test_F = F[train_index_F], F[test_index_F]
    
    classifier = parameters(train_G, train_F, type_refinement)
    I1 = P[FUSION, :]
    I2 = np.setdiff1d(np.arange(G.shape[1]), I1)

    train_REF_G = np.hstack((np.sum(train_G[:, I1] * classifier.coef_[0, I1], axis=1).reshape(-1, 1), train_G[:, I2]))
    train_REF_F = np.hstack((np.sum(train_F[:, I1] * classifier.coef_[0, I1], axis=1).reshape(-1, 1), train_F[:, I2]))

    test_REF_G = np.hstack((np.sum(test_G[:, I1] * classifier.coef_[0, I1], axis=1).reshape(-1, 1), test_G[:, I2]))
    test_REF_F = np.hstack((np.sum(test_F[:, I1] * classifier.coef_[0, I1], axis=1).reshape(-1, 1), test_F[:, I2]))

    classifier = parameters(train_REF_G, train_REF_F, type_refinement)
    results.append(ERRS(test_REF_G, test_REF_F, classifier))


In [116]:
print(f'ACCURACY: {np.mean(results)}%')
print(f'DEVIATION: {np.std(results)}%')

ACCURACY: 5.702509631432003%
DEVIATION: 6.472988941729058%
