# Basic Solutions: Testing Decision Trees, GNB, Neural Network, Non-Robust SVM

### Import Data and split into Training and Test sets

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
ds = pd.read_csv('processed_data.csv')
ds

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,...,1.886690,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015
1,1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,...,1.805927,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.243890,0.281190
2,1,1.579888,0.456187,1.566503,1.558884,0.942210,1.052926,1.363478,2.037231,0.939685,...,1.511870,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955000,1.152255,0.201391
3,1,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,...,-0.281464,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.935010
4,1,1.750297,-1.151816,1.776573,1.826229,0.280372,0.539340,1.371011,1.428493,-0.009560,...,1.298575,-1.466770,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.397100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1,2.110995,0.721473,2.060786,2.343856,1.041842,0.219060,1.947285,2.320965,-0.312589,...,1.901185,0.117700,1.752563,2.015301,0.378365,-0.273318,0.664512,1.629151,-1.360158,-0.709091
565,1,1.704854,2.085134,1.615931,1.723842,0.102458,-0.017833,0.693043,1.263669,-0.217664,...,1.536720,2.047399,1.421940,1.494959,-0.691230,-0.394820,0.236573,0.733827,-0.531855,-0.973978
566,1,0.702284,2.045574,0.672676,0.577953,-0.840484,-0.038680,0.046588,0.105777,-0.809117,...,0.561361,1.374854,0.579001,0.427906,-0.809587,0.350735,0.326767,0.414069,-1.104549,-0.318409
567,1,1.838341,2.336457,1.982524,1.735218,1.525767,3.272144,3.296944,2.658866,2.137194,...,1.961239,2.237926,2.303601,1.653171,1.430427,3.904848,3.197605,2.289985,1.919083,2.219635


In [3]:
# split into features and classification
X = ds.drop('diagnosis', axis=1)
y = ds['diagnosis']
print(X.shape, y.shape)

(569, 25) (569,)


In [4]:
# split into training and test set: split value 0.8
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
num_training_items = X_train.shape[0]
num_test_items = X_test.shape[0]

### Method 1: Decision Tree

As there are not many training items, we restrict the minimum number of samples at each leaf to avoid overfitting to 2% of the training items.

In [5]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=round(0.02*num_training_items))
tree.fit(X_train,y_train)
print(f'Decision Tree Accuracy: {tree.score(X_test,y_test)}')

DecisionTreeClassifier(criterion='entropy', min_samples_leaf=9)

Decision Tree Accuracy: 0.9649122807017544


### Method 2: Naive Bayes

The Naive Bayes classifiers that we want to test is the Gaussian, since our data is normalised and real. Since Naive Bayes classifiers have the assumption of independence of features, we need to be careful. 

The features for this problem contain a mixture of means, standard errors, and extremal (worst-case) measurements. For example, the features contain the radius mean, standard deviation, and extremal values. Clearly, these three features will not fall under the NB assumption of independence, as they are three different kinds of measures of the same fundamental feature. 

In [6]:
ds.columns

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'radius_se', 'perimeter_se',
       'area_se', 'compactness_se', 'concavity_se', 'concave points_se',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'smoothness_worst', 'compactness_worst', 'concavity_worst',
       'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

As we have 9 mean, 6 standard-error, and 10 extremal measurements, we will only keep the extremal measurements. That way, we will have observations for each fundamental feature, all of the same kind, and do not lose any fundamental features. 

In [7]:
features_to_keep = ['radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', \
                  'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
X_train_NB = X_train[features_to_keep]
X_test_NB = X_test[features_to_keep]

In [8]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_NB,y_train)
print(f'Gaussian NB Accuracy: {gnb.score(X_test_NB,y_test)}')

GaussianNB()

Gaussian NB Accuracy: 0.9473684210526315


### Method 3: Neural Network

We implement a neural network via the method `MLPClassifier` provided by `sklearn`. The classifier is a Multi-Layered Perceptron, with loss function given by the Cross Entropy loss function. We use a sigmoid (logistic) activation at the output (as our prediction should be 0 or 1), and a tanh activation at the hidden layers. There are two hidden layers in the model: for input of size `num_features`, the first hidden layer is twice the size, and the second is 4 times the size of the input. Finally, we use an adaptive learning rate (i.e. decreases with training), initialised at 0.05. 

In [9]:
from sklearn.neural_network import MLPClassifier

num_features = X_train.shape[1]
nn = MLPClassifier(hidden_layer_sizes=(2*num_features,4*num_features), activation='tanh', \
                   solver='sgd', learning_rate='adaptive', learning_rate_init=0.01)
nn.fit(X_train,y_train)
print(f'Neural Network Accuracy: {nn.score(X_test,y_test)}')



MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 100),
              learning_rate='adaptive', learning_rate_init=0.01, solver='sgd')

Neural Network Accuracy: 0.9649122807017544


### Method 4: Simplified SVM

For our base Support Vector Machine implementation, we have chosen a sigmoidal kernel function, as this is related to the tanh-activated hidden layers in the neural network, and so makes the two models comparable (more precisely a sigmoidal kernel function implements a classifier that is equivalent to a two-layer perceptron with tanh activation). 

First, we need to determine the regularization parameter `C` via a grid search. 

In [22]:
regs = np.arange(0,1.01,0.05)
regs[0] = 0.01
best_acc = 0.0
best_r = 0
for r in regs:
    svm = SVC(C=r, kernel='sigmoid', gamma='scale')
    _ = svm.fit(X_train, y_train)
    acc = svm.score(X_test, y_test)
    if acc > best_acc:
        best_acc = acc
        best_r = r
print(f'Highest Accuracy of {best_acc} achieved by r = {best_r}')

Highest Accuracy of 0.9824561403508771 achieved by r = 0.25


In [13]:
from sklearn.svm import SVC

svm = SVC(C=0.25, kernel='sigmoid', gamma='scale')
svm.fit(X_train, y_train)
print(f'Support Vector Machine Accuracy: {svm.score(X_test, y_test)}')

SVC(C=0.25, kernel='sigmoid')

Support Vector Machine Accuracy: 0.9824561403508771


### Conclusion

From this simple set up, we can see that the SVM performs best out of our four methods. This justifies the further investigation into different formulations for SVMs, and their ability to maintain powerful classification under noisy data and uncertainty. This will be assessed in the following section. 