The main aim of this notebook is to find the best model which can predict seperability of a given plot.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score 
from sklearn.utils import resample
from sklearn.metrics import roc_curve,auc
from sklearn.utils import resample

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("RESULTS_EUROVIS2015.csv");

AUC_scores=df.tail(1);

main_df=df[:-1];
number_of_rows=main_df.shape[0];
print("Total number of rows in original dataframe:"+str(len(df.index)));
print("Total number of rows in modified dataframe:"+str(len(main_df.index)));


Total number of rows in original dataframe:829
Total number of rows in modified dataframe:828


In [3]:
main_df.describe()

Unnamed: 0,indexFile,indexDR,classNum,scoreA,scoreM,scoreDes,DSC,CAL,HM,LDA,...,DC 20%,HDM 80,DC 2%,DUNN,DC 1%,DC 0.5%,CDM 1,DC 0.2%,DC 0.1%,AWTN
count,828.0,828.0,828.0,828.0,828.0,828.0,828.0,828.0,828.0,828.0,...,828.0,828.0,828.0,828.0,828.0,828.0,828.0,828.0,828.0,828.0
mean,37.751208,2.450483,3.52657,2.979469,2.949275,0.507246,77.429402,80.298799,82.342098,0.594322,...,63.634851,98.403195,96.553164,0.027152,99.459394,99.935704,87.92773,99.996095,99.999158,270.766188
std,20.766822,1.115446,2.54403,1.631259,1.749717,0.50025,14.363301,118.300177,10.342507,1.481028,...,19.729777,3.748677,4.95379,0.059874,1.121827,0.185837,7.215781,0.019771,0.005084,78.268813
min,1.0,1.0,1.0,0.0,0.0,0.0,29.010239,0.007398,47.360794,2e-06,...,5.374185,53.901868,57.287289,0.000891,87.332331,97.799476,39.832298,99.711761,99.925354,38.993885
25%,16.0,1.0,2.0,1.0,1.0,0.0,66.65,3.798646,75.515331,0.012565,...,52.270991,98.247322,95.165531,0.003489,99.374314,99.944297,84.619306,99.999112,100.0,235.497898
50%,40.0,3.0,3.0,3.0,3.0,1.0,79.33526,24.730365,83.765962,0.09118,...,64.001399,100.0,98.675758,0.007414,99.87535,99.992992,89.100684,100.0,100.0,286.045767
75%,55.0,3.0,5.0,5.0,5.0,1.0,89.45,113.261177,90.789427,0.533361,...,76.557142,100.0,99.852436,0.026669,100.0,100.0,92.874167,100.0,100.0,324.972774
max,70.0,4.0,13.0,5.0,5.0,1.0,100.0,1038.602336,99.005716,25.837629,...,100.0,100.0,100.0,0.707247,100.0,100.0,99.310559,100.0,100.0,457.396744


In [4]:
main_df.columns

Index(['fileName', 'indexFile', 'indexDR', 'classNum', 'scoreA', 'scoreM',
       'scoreDes', 'DSC', 'CAL', 'HM', 'LDA', 'WII', 'SIL', 'GAM', 'ABW', 'CS',
       'HDM 10', 'ABTN', 'HDM 5', 'CDM 4', 'CDM 3', 'DC 10%', 'CDM 5', 'CDM 2',
       'CDM 6', 'CDM 7', 'DC 5%', 'CDM 8', 'CDM 10', 'CDM 9', 'HDM 20',
       'HDM 40', 'DC 20%', 'HDM 80', 'DC 2%', 'DUNN', 'DC 1%', 'DC 0.5%',
       'CDM 1', 'DC 0.2%', 'DC 0.1%', 'AWTN'],
      dtype='object')

In [5]:
#rerranging the columms in the order given in the paper.
target_var='scoreDes'
file_descriptors=['fileName', 'indexFile', 'indexDR', 'classNum', 'scoreA', 'scoreM']+[target_var]


measure_cols=['ABTN','AWTN','ABW','WII','CAL','LDA','DUNN','GAM','SIL','HM','CS','DSC',
            'CDM 1','CDM 2','CDM 3','CDM 4','CDM 5', 'CDM 6', 'CDM 7', 'CDM 8', 'CDM 9', 'CDM 10',
           'DC 0.1%','DC 0.2%','DC 0.5%','DC 1%','DC 2%','DC 5%','DC 10%','DC 20%',
           'HDM 5','HDM 10','HDM 20','HDM 40','HDM 80']

new_cols=file_descriptors+measure_cols;

X_cols=measure_cols;
Y_cols=target_var;

X=main_df[X_cols];
Y=main_df[target_var];

bootstrap_sample_size=10000

In [6]:
mod_auc_scores=AUC_scores[X_cols];
actual_auc_scores=mod_auc_scores.values[0]
print(actual_auc_scores)

[0.75796  0.524822 0.77543  0.802286 0.812012 0.807314 0.632263 0.791216
 0.798531 0.811283 0.767223 0.825445 0.617904 0.73181  0.754523 0.755598
 0.736352 0.725799 0.719329 0.711536 0.710013 0.711477 0.563681 0.604139
 0.621176 0.625403 0.640985 0.71753  0.748782 0.655484 0.755787 0.763379
 0.707406 0.69294  0.655272]


In [7]:
#preprocessing 

scaler = StandardScaler().fit(X) 
X = scaler.transform(X) 
X=np.hstack((np.ones((X.shape[0], 1)), X))

In [8]:
print(X[:,[0,2]]);

[[1.         0.76738343]
 [1.         0.72497195]
 [1.         0.72684856]
 ...
 [1.         0.61471461]
 [1.         1.44993053]
 [1.         1.40346949]]


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.05)

def get_train_test_accuracy(model):
    #model = LogisticRegression()
    model.fit(X_train, Y_train)
    predicted_classes = model.predict(X_train)
    train_accuracy = accuracy_score(Y_train,predicted_classes)
    predicted_classes = model.predict(X_test)
    test_accuracy = accuracy_score(Y_test,predicted_classes)
    return train_accuracy,test_accuracy

print(get_train_test_accuracy(LogisticRegression()))
print(get_train_test_accuracy(RandomForestClassifier()))


(0.8969465648854962, 0.8333333333333334)
(0.9974554707379135, 0.9047619047619048)


In [10]:
# model=LogisticRegression()


def get_auc_scores(main_df,model):
    X_cols=measure_cols;
    Y_cols=target_var;

    X=main_df[X_cols];
    Y=main_df[target_var];
    
    scaler = StandardScaler().fit(X) 
    X = scaler.transform(X) 
    X=np.hstack((np.ones((X.shape[0], 1)), X))
    
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25)
    auc_vals=[]
    for i in range(1,X_train.shape[1]):
        model.fit(X_train[:,[i]],Y_train);
        preds=model.predict(X_test[:,[i]]);
        test_accuracy = accuracy_score(Y_test,preds);
        fpr, tpr, thresholds = roc_curve(Y_test, preds)
        auc_val=auc(fpr, tpr)
        auc_vals.append(auc_val)
    return auc_vals;


def get_bootstrapped_auc_scores(main_df,model):
    auc_scores=[]
    for i in range(bootstrap_sample_size):
        if i%1000==0:
            print("Iteration :"+str(i));
        sample_df=resample(main_df, replace=False, n_samples=main_df.shape[0], random_state=1)
        auc_scores.append(get_auc_scores(sample_df,model));
    return auc_scores;
    


In [11]:
model=LogisticRegression()
auc_scores=get_bootstrapped_auc_scores(main_df,model);
calculated_AUC_scores=np.mean(auc_scores,axis=0);
output = pd.DataFrame({'Measure': X_cols, 'Expected': actual_auc_scores,'Calculated':calculated_AUC_scores})
output.sort_values("Calculated", inplace=True,ascending=False)
print("Calculated bootstrapped scores using Logistic Regression model")
print(output.head(10))

Iteration :0
Iteration :1000
Iteration :2000
Iteration :3000
Iteration :4000
Iteration :5000
Iteration :6000
Iteration :7000
Iteration :8000
Iteration :9000
Calculated bootstrapped scores using Logistic Regression model
   Measure  Expected  Calculated
3      WII  0.802286    0.785617
8      SIL  0.798531    0.773584
9       HM  0.811283    0.769793
7      GAM  0.791216    0.768793
2      ABW  0.775430    0.763625
11     DSC  0.825445    0.760098
5      LDA  0.807314    0.742275
4      CAL  0.812012    0.741578
10      CS  0.767223    0.720586
15   CDM 4  0.755598    0.719959


In [12]:
model=RandomForestClassifier(n_estimators=4, max_depth=2, random_state=0)

auc_scores=get_bootstrapped_auc_scores(main_df,model);
calculated_AUC_scores=np.mean(auc_scores,axis=0);
output = pd.DataFrame({'Measure': X_cols, 'Expected': actual_auc_scores,'Calculated':calculated_AUC_scores})
output.sort_values("Calculated", inplace=True,ascending=False)
print("Calculated bootstrapped scores using Random Forest Classifier model")
print(output.head(10))

Iteration :0
Iteration :1000
Iteration :2000
Iteration :3000
Iteration :4000
Iteration :5000
Iteration :6000
Iteration :7000
Iteration :8000
Iteration :9000
Calculated bootstrapped scores using Random Forest Classifier model
   Measure  Expected  Calculated
11     DSC  0.825445    0.794862
3      WII  0.802286    0.787084
8      SIL  0.798531    0.781127
9       HM  0.811283    0.778428
7      GAM  0.791216    0.771667
2      ABW  0.775430    0.764788
4      CAL  0.812012    0.734563
5      LDA  0.807314    0.733968
16   CDM 5  0.736352    0.727876
14   CDM 3  0.754523    0.724601


In [13]:
#decision tree classifier
model = DecisionTreeClassifier(random_state=0)
auc_scores=get_bootstrapped_auc_scores(main_df,model);
calculated_AUC_scores=np.mean(auc_scores,axis=0);
output = pd.DataFrame({'Measure': X_cols, 'Expected': actual_auc_scores,'Calculated':calculated_AUC_scores})
output.sort_values("Calculated", inplace=True,ascending=False)
print("Calculated bootstrapped scores using Decision Tree classifier model")
print(output.head(10))

Iteration :0
Iteration :1000
Iteration :2000
Iteration :3000
Iteration :4000
Iteration :5000
Iteration :6000
Iteration :7000
Iteration :8000
Iteration :9000
Calculated bootstrapped scores using Decision Tree classifier model
   Measure  Expected  Calculated
11     DSC  0.825445    0.725536
8      SIL  0.798531    0.701893
7      GAM  0.791216    0.686998
3      WII  0.802286    0.679930
5      LDA  0.807314    0.669294
33  HDM 40  0.692940    0.664318
2      ABW  0.775430    0.660609
16   CDM 5  0.736352    0.660011
30   HDM 5  0.755787    0.659796
34  HDM 80  0.655272    0.656517


In [14]:
#support vector machine
model = SVC(gamma='auto')
auc_scores=get_bootstrapped_auc_scores(main_df,model);
calculated_AUC_scores=np.mean(auc_scores,axis=0);
output = pd.DataFrame({'Measure': X_cols, 'Expected': actual_auc_scores,'Calculated':calculated_AUC_scores})
output.sort_values("Calculated", inplace=True,ascending=False)
print("Calculated bootstrapped scores using SVM model")
print(output.head(10))

Iteration :0
Iteration :1000
Iteration :2000
Iteration :3000
Iteration :4000
Iteration :5000
Iteration :6000
Iteration :7000
Iteration :8000
Iteration :9000
Calculated bootstrapped scores using SVM model
   Measure  Expected  Calculated
11     DSC  0.825445    0.793622
3      WII  0.802286    0.789925
8      SIL  0.798531    0.784628
9       HM  0.811283    0.778317
7      GAM  0.791216    0.775892
2      ABW  0.775430    0.768375
5      LDA  0.807314    0.740721
4      CAL  0.812012    0.738146
15   CDM 4  0.755598    0.727242
16   CDM 5  0.736352    0.727137


In [15]:
#knn model
model = KNeighborsClassifier(n_neighbors=3)
auc_scores=get_bootstrapped_auc_scores(main_df,model);
calculated_AUC_scores=np.mean(auc_scores,axis=0);
output = pd.DataFrame({'Measure': X_cols, 'Expected': actual_auc_scores,'Calculated':calculated_AUC_scores})
output.sort_values("Calculated", inplace=True,ascending=False)
print("Calculated bootstrapped scores using K-Nearest Neighbours model")
print(output.head(10))

Iteration :0
Iteration :1000
Iteration :2000
Iteration :3000
Iteration :4000
Iteration :5000
Iteration :6000
Iteration :7000
Iteration :8000
Iteration :9000
Calculated bootstrapped scores using K-Nearest Neighbours model
   Measure  Expected  Calculated
11     DSC  0.825445    0.749985
3      WII  0.802286    0.733166
8      SIL  0.798531    0.730018
7      GAM  0.791216    0.707529
9       HM  0.811283    0.707083
2      ABW  0.775430    0.705599
5      LDA  0.807314    0.682372
19   CDM 8  0.711536    0.680672
4      CAL  0.812012    0.678046
20   CDM 9  0.710013    0.676344


In [16]:
model = AdaBoostClassifier(n_estimators=4,learning_rate=1,random_state=0)
auc_scores=get_bootstrapped_auc_scores(main_df,model);
calculated_AUC_scores=np.mean(auc_scores,axis=0);
output = pd.DataFrame({'Measure': X_cols, 'Expected': actual_auc_scores,'Calculated':calculated_AUC_scores})
output.sort_values("Calculated", inplace=True,ascending=False)
print("Calculated bootstrapped scores using Adaboost Classifier model")
print(output.head(10))

Iteration :0
Iteration :1000
Iteration :2000
Iteration :3000
Iteration :4000
Iteration :5000
Iteration :6000
Iteration :7000
Iteration :8000
Iteration :9000
Calculated bootstrapped scores using Adaboost Classifier model
   Measure  Expected  Calculated
11     DSC  0.825445    0.795037
3      WII  0.802286    0.785710
8      SIL  0.798531    0.782777
9       HM  0.811283    0.780388
7      GAM  0.791216    0.771324
2      ABW  0.775430    0.765793
4      CAL  0.812012    0.731656
14   CDM 3  0.754523    0.729641
5      LDA  0.807314    0.726975
16   CDM 5  0.736352    0.726087
