In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score,confusion_matrix,ConfusionMatrixDisplay,accuracy_score, roc_curve, auc, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.metrics.pairwise import euclidean_distances

In [3]:
columns = ['Id', 'Diagnosis']
features = ['radius', 'texture', 'perimeter', 'area', 'smoothness', 'compactness', 'concavity', 'concave points', 'symmetry','fractal dimension']
measure = ['mean', 'std', 'worst']

In [4]:
for i in measure:
    for j in features:
        columns.append(i+'_'+j)
len(columns)

32

In [5]:
df = pd.read_csv("../data/wdbc.data",names = columns, header = None, sep = ',')
df.head(3)

Unnamed: 0,Id,Diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave points,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave points,worst_symmetry,worst_fractal dimension
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [6]:
df.shape

(569, 32)

In [7]:
finalfeatures = columns.copy()
finalfeatures.remove('Id')
finalfeatures.remove('Diagnosis')

In [8]:
ms = preprocessing.MinMaxScaler()
newdf = ms.fit_transform(df[finalfeatures])

In [9]:
newdf = pd.DataFrame(newdf, columns=finalfeatures)
newdf = pd.concat([newdf, df['Diagnosis']], axis=1)

In [10]:
newdf = newdf.replace("B", 0) 
newdf = newdf.replace("M", 1)

In [11]:
malignant = newdf[newdf['Diagnosis'] == 1]
benign = newdf[newdf['Diagnosis'] == 0]

In [12]:
import operator

In [13]:
final_report3 = [['Simulation','Train Accuracy','Train Precision','Train Recall', 'Train F1score', 'Train AUC', 'Test Accuracy','Test Precision','Test Recall', 'Test F1score', 'Test AUC']]

In [14]:
for i in range(1,31):
    train_m, test_m = train_test_split(malignant, test_size = 0.2)
    train_b, test_b = train_test_split(benign, test_size = 0.2)
    train = pd.concat([train_m,train_b], axis = 0)
    test = pd.concat([test_m,test_b], axis = 0)
    temp = train.copy()
    new_train = train.copy()
    
    km = KMeans(n_clusters = 2, init = 'random', n_init = 20)
    km.fit(train[finalfeatures])
 
    cluster_distance = euclidean_distances(train[finalfeatures], km.cluster_centers_)
    list_distance = cluster_distance.tolist()
    cl1 = []
    cl2 = []
    for x in list_distance:
        cl1.append(x[0])
        cl2.append(x[1])
        
    temp['cl1'] = cl1
    temp['cl2'] = cl2
    
    newtemp1 = temp.sort_values(by='cl1').iloc[:30]
    newtemp2 = temp.sort_values(by='cl2').iloc[:30]
    label1 = newtemp1['Diagnosis'].value_counts().index[0]
    label2 = newtemp2['Diagnosis'].value_counts().index[0]
    
    new_train['Assigned_Cluster'] = km.labels_
    new_train['Assigned_Diagnosis'] = ""
    for k in range(len(new_train)):
        if new_train['Assigned_Cluster'].iloc[k] == 0:
            new_train['Assigned_Diagnosis'].iloc[k] = label1
        else:
            new_train['Assigned_Diagnosis'].iloc[k] = label2 
    new_train['Assigned_Diagnosis'] = new_train['Assigned_Diagnosis'].astype(str).astype(int)
    
    
    new_test = test.copy()
    cluster_distance = euclidean_distances(test[finalfeatures], km.cluster_centers_)
    list_distance = cluster_distance.tolist()
    cl1 = []
    cl2 = []
    for x in list_distance:
        cl1.append(x[0])
        cl2.append(x[1])
        
    new_test['cl1'] = cl1
    new_test['cl2'] = cl2
    
    new_test['Assigned_Diagnosis'] = ""
    for k in range(len(new_test)):
        if new_test['cl1'].iloc[k] < new_test['cl2'].iloc[k]:
            new_test['Assigned_Diagnosis'].iloc[k] = label1
        else:
            new_test['Assigned_Diagnosis'].iloc[k] = label2
    new_test['Assigned_Diagnosis'] = new_test['Assigned_Diagnosis'].astype(str).astype(int)
    
    
    train_pred_y = new_train['Assigned_Diagnosis']
    test_pred_y = new_test['Assigned_Diagnosis']
    train_y = new_train['Diagnosis']
    test_y = new_test['Diagnosis']
    

    train_acc = accuracy_score(train_y, train_pred_y)
    train_precision = precision_score(train_y, train_pred_y)
    train_recall = recall_score(train_y, train_pred_y)
    train_f1score = f1_score(train_y, train_pred_y)
    train_auc = roc_auc_score(train_y, train_pred_y)
    
    test_acc = accuracy_score(test_y, test_pred_y)
    test_precision = precision_score(test_y, test_pred_y)
    test_recall = recall_score(test_y, test_pred_y)
    test_f1score = f1_score(test_y, test_pred_y)
    test_auc = roc_auc_score(test_y, test_pred_y)
    
    final_report3.append([i,train_acc,train_precision,train_recall,train_f1score,train_auc,test_acc,test_precision ,test_recall,test_f1score,test_auc])
    print("Simulation: ", i)
    print('Train Accuracy: ',train_acc,', Train Precision: ',train_precision,', Train Recall: ',train_recall, ', Train F1score: ', train_f1score,', Train AUC:',train_auc) 
    print('Test Accuracy: ',test_acc,', Test Precision: ',test_precision,', Test Recall: ',test_recall, ', Test F1score: ', test_f1score,', Test AUC:',test_auc) 
    print()
    
    



Simulation:  1
Train Accuracy:  0.920704845814978 , Train Precision:  0.9463087248322147 , Train Recall:  0.834319526627219 , Train F1score:  0.8867924528301887 , Train AUC: 0.9031246755943113
Test Accuracy:  0.9478260869565217 , Test Precision:  0.9512195121951219 , Test Recall:  0.9069767441860465 , Test F1score:  0.9285714285714286 , Test AUC: 0.9395994832041343

Simulation:  2
Train Accuracy:  0.9229074889867841 , Train Precision:  0.9466666666666667 , Train Recall:  0.8402366863905325 , Train F1score:  0.890282131661442 , Train AUC: 0.9060832554759679
Test Accuracy:  0.9478260869565217 , Test Precision:  0.9743589743589743 , Test Recall:  0.8837209302325582 , Test F1score:  0.9268292682926831 , Test AUC: 0.9349160206718348

Simulation:  3
Train Accuracy:  0.9140969162995595 , Train Precision:  0.9452054794520548 , Train Recall:  0.8165680473372781 , Train F1score:  0.8761904761904762 , Train AUC: 0.8942489359493407
Test Accuracy:  0.9565217391304348 , Test Precision:  0.975 , Test

Simulation:  24
Train Accuracy:  0.9317180616740088 , Train Precision:  0.9539473684210527 , Train Recall:  0.8579881656804734 , Train F1score:  0.9034267912772586 , Train AUC: 0.9167133810858509
Test Accuracy:  0.9130434782608695 , Test Precision:  0.9459459459459459 , Test Recall:  0.813953488372093 , Test F1score:  0.875 , Test AUC: 0.8930878552971576

Simulation:  25
Train Accuracy:  0.9317180616740088 , Train Precision:  0.9791666666666666 , Train Recall:  0.834319526627219 , Train F1score:  0.9009584664536742 , Train AUC: 0.9118966054188726
Test Accuracy:  0.9304347826086956 , Test Precision:  0.926829268292683 , Test Recall:  0.8837209302325582 , Test F1score:  0.9047619047619047 , Test AUC: 0.9210271317829458

Simulation:  26
Train Accuracy:  0.9273127753303965 , Train Precision:  0.9657534246575342 , Train Recall:  0.834319526627219 , Train F1score:  0.8952380952380953 , Train AUC: 0.9083878334890481
Test Accuracy:  0.9304347826086956 , Test Precision:  0.9069767441860465 , Te

In [None]:
ConfusionMatrixDisplay(confusion_matrix(train_y,  train_pred_y)).plot()

In [None]:
fpr, tpr, _ = roc_curve(train_y,  train_pred_y)        
plt.plot(fpr,tpr,color ='green', label='ROC with AUC: ' + str(roc_auc_score(train_y, train_pred_y)))
plt.plot([0, 1], [0, 1], color='grey', linestyle = 'dotted')
plt.xlabel('FALSE Positive RATE')
plt.ylabel('TRUE Positive RATE')
plt.title('ROC CURVE FOR TRAINING DATA')
plt.legend()
plt.show()

In [None]:
ConfusionMatrixDisplay(confusion_matrix(test_y,  test_pred_y)).plot()

In [None]:
fpr, tpr, _ = roc_curve(test_y,  test_pred_y)        
plt.plot(fpr,tpr,color ='green', label='ROC with AUC: ' + str(roc_auc_score(test_y, test_pred_y)))
plt.plot([0, 1], [0, 1], color='grey', linestyle = 'dotted')
plt.xlabel('FALSE Positive RATE')
plt.ylabel('TRUE Positive RATE')
plt.title('ROC CURVE FOR Testing data')
plt.legend()
plt.show()

In [None]:
reportdf3 = pd.DataFrame(final_report3[1:], columns = final_report3[0])
reportdf3

In [None]:
reportdf3.mean()

In [16]:
final_report4 = [['Simulation','Train Accuracy','Train Precision','Train Recall', 'Train F1score', 'Train AUC', 'Test Accuracy','Test Precision','Test Recall', 'Test F1score', 'Test AUC']]

In [19]:
for k in range(1,31):
    train_m, test_m = train_test_split(malignant, test_size = 0.2)
    train_b, test_b = train_test_split(benign, test_size = 0.2)
    train = pd.concat([train_m,train_b], axis = 0)
    test = pd.concat([test_m,test_b], axis = 0)
    new_train = train.copy()
    new_test = test.copy()
    
    sc = SpectralClustering(n_clusters=2, n_init=20, gamma=1.0, affinity='rbf')   
    new_train['Assigned_Cluster'] = sc.fit_predict(train[finalfeatures])
    
    for i in range (0,2):
        diagnosis = new_train[new_train.Assigned_Cluster == i]['Diagnosis'].value_counts().index[0]
        new_train.loc[new_train.Assigned_Cluster == i, 'Assigned_Diagnosis'] = diagnosis

    new_train['Assigned_Diagnosis'] = new_train['Assigned_Diagnosis'].astype(int)
    
    
    new_test['Assigned_Cluster'] = sc.fit_predict(test[finalfeatures])   
    for i in range (0,2):
        diagnosis = new_test[new_test.Assigned_Cluster == i]['Diagnosis'].value_counts().index[0]
        new_test.loc[new_test.Assigned_Cluster == i, 'Assigned_Diagnosis'] = diagnosis
    new_test['Assigned_Diagnosis'] = new_test['Assigned_Diagnosis'].astype(int)
    
    
    train_pred_y = new_train['Assigned_Diagnosis']
    test_pred_y = new_test['Assigned_Diagnosis']
    train_y = new_train['Diagnosis']
    test_y = new_test['Diagnosis']
    

    train_acc = accuracy_score(train_y, train_pred_y)
    train_precision = precision_score(train_y, train_pred_y)
    train_recall = recall_score(train_y, train_pred_y)
    train_f1score = f1_score(train_y, train_pred_y)
    train_auc = roc_auc_score(train_y, train_pred_y)
    
    test_acc = accuracy_score(test_y, test_pred_y)
    test_precision = precision_score(test_y, test_pred_y)
    test_recall = recall_score(test_y, test_pred_y)
    test_f1score = f1_score(test_y, test_pred_y)
    test_auc = roc_auc_score(test_y, test_pred_y)
    
    final_report4.append([k,train_acc,train_precision,train_recall,train_f1score,train_auc,test_acc,test_precision ,test_recall,test_f1score,test_auc])
    print("Simulation: ", k)
    print('Train Accuracy: ',train_acc,', Train Precision: ',train_precision,', Train Recall: ',train_recall, ', Train F1score: ', train_f1score,', Train AUC:',train_auc) 
    print('Test Accuracy: ',test_acc,', Test Precision: ',test_precision,', Test Recall: ',test_recall, ', Test F1score: ', test_f1score,', Test AUC:',test_auc) 
    print()
    
    

Simulation:  1
Train Accuracy:  0.8678414096916299 , Train Precision:  0.9823008849557522 , Train Recall:  0.6568047337278107 , Train F1score:  0.7872340425531915 , Train AUC: 0.8248935949340808
Test Accuracy:  0.8869565217391304 , Test Precision:  0.96875 , Test Recall:  0.7209302325581395 , Test F1score:  0.8266666666666667 , Test AUC: 0.8535206718346253

Simulation:  2
Train Accuracy:  0.8744493392070485 , Train Precision:  0.9745762711864406 , Train Recall:  0.6804733727810651 , Train F1score:  0.8013937282229965 , Train AUC: 0.8349735284957958
Test Accuracy:  0.782608695652174 , Test Precision:  1.0 , Test Recall:  0.4186046511627907 , Test F1score:  0.5901639344262295 , Test AUC: 0.7093023255813954

Simulation:  3
Train Accuracy:  0.8898678414096917 , Train Precision:  0.976 , Train Recall:  0.7218934911242604 , Train F1score:  0.8299319727891156 , Train AUC: 0.8556835876673934
Test Accuracy:  0.7391304347826086 , Test Precision:  1.0 , Test Recall:  0.3023255813953488 , Test F1s

Simulation:  25
Train Accuracy:  0.8480176211453745 , Train Precision:  0.9901960784313726 , Train Recall:  0.5976331360946746 , Train F1score:  0.7453874538745388 , Train AUC: 0.7970621820824251
Test Accuracy:  0.9304347826086956 , Test Precision:  0.972972972972973 , Test Recall:  0.8372093023255814 , Test F1score:  0.9 , Test AUC: 0.9116602067183464

Simulation:  26
Train Accuracy:  0.8524229074889867 , Train Precision:  0.9811320754716981 , Train Recall:  0.6153846153846154 , Train F1score:  0.7563636363636365 , Train AUC: 0.8041835357624831
Test Accuracy:  0.8782608695652174 , Test Precision:  0.967741935483871 , Test Recall:  0.6976744186046512 , Test F1score:  0.810810810810811 , Test AUC: 0.8418927648578811

Simulation:  27
Train Accuracy:  0.8524229074889867 , Train Precision:  0.9722222222222222 , Train Recall:  0.621301775147929 , Train F1score:  0.7581227436823104 , Train AUC: 0.8053877296792276
Test Accuracy:  0.8434782608695652 , Test Precision:  1.0 , Test Recall:  0.581