# Supervised Learning
### Benign and malignant cancer

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from statistics import mean

# Load the raw CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv("../data/Cancer_Data.csv")

# Drop the 'Unnamed: 32' column when it is present; errors='ignore' makes this
# a no-op when it is not.
dataset = dataset.drop(columns='Unnamed: 32', errors='ignore')

# Encode the target as integers: B = 0, M = 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place form
# is deprecated in pandas 2.x and does not update the DataFrame under
# copy-on-write. Any label other than 'B'/'M' would become NaN here.
dataset['diagnosis'] = dataset['diagnosis'].map({'B': 0, 'M': 1})

# Compute the correlation matrix once and reuse it for the heatmap and the printout.
correlations = dataset.corr()

plt.figure(figsize=(20,20))
sns.heatmap(correlations,cbar=True,annot=True,cmap='Oranges')
print(correlations)
plt.show()

dataset

In [None]:
from sklearn.model_selection import train_test_split

# Target vector: the encoded diagnosis column.
y = dataset['diagnosis']
# Feature matrix: every column except the target and the 'id' column.
x = dataset.drop(columns=['diagnosis', 'id'])



### Decision Tree

First, we will analyse the dataset with all information, and for that, we first need to create a decision tree.
Our first decision tree will be a default one, created by the scikit learn library.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every metric derived from it, identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Decision tree with default hyperparameters. It is seeded as well because
# scikit-learn permutes the features at each split, so ties between equally
# good splits would otherwise be broken differently from run to run.
clf = DecisionTreeClassifier(random_state=42)


Now, we send the decision tree our train sample.

In [None]:
# Train the tree on the training split (the fitted estimator is the cell output).
clf.fit(X=x_train, y=y_train)

To make the predictions we call the function:

In [None]:

# Predict the held-out rows, then score those predictions with each metric.
predictions = clf.predict(x_test)

dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    metric(y_test, predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

Next, we define a helper function that plots the confusion matrix together with the metrics above; we will reuse it throughout the notebook.

In [None]:
def show_plot(y_pred, accuracy, precision, recall, f1_score, y_true=None):
    """Plot the confusion matrix of ``y_pred`` with the metrics in the title.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels (0 = benign, 1 = malignant).
    accuracy, precision, recall, f1_score : float
        Already-computed metric values; they are only formatted into the title.
        Note that the ``f1_score`` parameter shadows the scikit-learn function
        of the same name inside this function.
    y_true : array-like, optional
        True class labels. When omitted, the notebook-level ``y_test`` that is
        current at call time is used, which is what existing callers rely on.
    """
    if y_true is None:
        y_true = y_test

    # labels=[0, 1] forces a 2x2 matrix even when one class is missing from
    # both label vectors; the annotation loop below indexes all four cells.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.imshow(cm, cmap=plt.cm.Oranges)
    plt.title("Confusion Matrix\nAccuracy: {:.3f} - Precision: {:.3f} - Recall: {:.3f} - F1 Score: {:.3f}".format(accuracy, precision, recall, f1_score))
    plt.colorbar()
    plt.xlabel("Predicted Class")
    plt.ylabel("True Class")
    plt.xticks([0, 1], ["Benign", "Malignant"])
    plt.yticks([0, 1], ["Benign", "Malignant"])

    # Write the raw count in the centre of every cell (row = true, column = predicted).
    for i in range(2):
        for j in range(2):
            plt.text(j, i, str(cm[i, j]), ha='center', va='center', color='black')

    plt.show()

In [None]:
# Confusion matrix for the tree trained on every feature.
show_plot(predictions, accuracy=dt_accuracy, precision=dt_precision, recall=dt_recall, f1_score=dt_f1)

With this code, we can see the importance given to each feature of the dataset by the decision tree.

In [None]:
# Importance the fitted tree assigned to each feature, highest first.
# Named *_all rather than *_worst: at this point clf was fitted on the full
# feature set, not on the "worst" columns only.
feature_importance_all = (
    pd.Series(clf.feature_importances_, index=x_test.columns, name='%')
    .sort_values(ascending=False)
    .to_frame()
)
feature_importance_all

In our dataset, we have 3 types of data:
 - worst: worst value recorded from that person
 - se: standard error
 - mean: mean of a list of values

To see if we can improve our accuracy and precision, we are going to try to separate these 3 types and we will run it with the same decision tree to see if our results improve.<br>

#### Dataset only with "Worst" values

In [None]:
# Names of the ten "*_worst" columns, in the order used for the sub-frames.
worst_columns = [
    base + '_worst'
    for base in ('fractal_dimension', 'symmetry', 'concave points', 'concavity', 'compactness',
                 'smoothness', 'area', 'perimeter', 'texture', 'radius')
]
x_worst_train = x_train.filter(items=worst_columns)
x_worst_test = x_test.filter(items=worst_columns)

# Refit the same classifier object on the reduced feature set, then score it
# on the matching test columns.
predictions_worst = clf.fit(x_worst_train, y_train).predict(x_worst_test)
dt_worst_accuracy, dt_worst_precision, dt_worst_recall, dt_worst_f1 = (
    metric(y_test, predictions_worst)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Confusion matrix for the tree trained on the "worst" columns only.
show_plot(predictions_worst, accuracy=dt_worst_accuracy, precision=dt_worst_precision, recall=dt_worst_recall, f1_score=dt_worst_f1)

In [None]:
# Importance the refitted tree assigned to each "worst" column, highest first.
feature_importance_worst = (
    pd.Series(clf.feature_importances_, index=x_worst_test.columns, name='%')
    .sort_values(ascending=False)
    .to_frame()
)
feature_importance_worst

#### Dataset only with "Mean" values

In [None]:
# Names of the ten "*_mean" columns, in the order used for the sub-frames.
mean_columns = [
    base + '_mean'
    for base in ('fractal_dimension', 'symmetry', 'concave points', 'concavity', 'compactness',
                 'smoothness', 'area', 'perimeter', 'texture', 'radius')
]
x_mean_train = x_train.filter(items=mean_columns)
x_mean_test = x_test.filter(items=mean_columns)

# Refit the same classifier object on the reduced feature set, then score it
# on the matching test columns.
predictions_mean = clf.fit(x_mean_train, y_train).predict(x_mean_test)
dt_mean_accuracy, dt_mean_precision, dt_mean_recall, dt_mean_f1 = (
    metric(y_test, predictions_mean)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Confusion matrix for the tree trained on the "mean" columns only.
show_plot(predictions_mean, accuracy=dt_mean_accuracy, precision=dt_mean_precision, recall=dt_mean_recall, f1_score=dt_mean_f1)

In [None]:
# Importance the refitted tree assigned to each "mean" column, highest first.
feature_importance_mean = (
    pd.Series(clf.feature_importances_, index=x_mean_test.columns, name='%')
    .sort_values(ascending=False)
    .to_frame()
)
feature_importance_mean

#### Dataset only with "SE" values

In [None]:
# Names of the ten "*_se" columns, in the order used for the sub-frames.
se_columns = [
    base + '_se'
    for base in ('fractal_dimension', 'symmetry', 'concave points', 'concavity', 'compactness',
                 'smoothness', 'area', 'perimeter', 'texture', 'radius')
]
x_se_train = x_train.filter(items=se_columns)
x_se_test = x_test.filter(items=se_columns)

# Refit the same classifier object on the reduced feature set, then score it
# on the matching test columns.
predictions_se = clf.fit(x_se_train, y_train).predict(x_se_test)
dt_se_accuracy, dt_se_precision, dt_se_recall, dt_se_f1 = (
    metric(y_test, predictions_se)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Confusion matrix for the tree trained on the "se" columns only.
show_plot(predictions_se, accuracy=dt_se_accuracy, precision=dt_se_precision, recall=dt_se_recall, f1_score=dt_se_f1)

In [None]:
# Importance the refitted tree assigned to each "se" column, highest first.
feature_importance_se = (
    pd.Series(clf.feature_importances_, index=x_se_test.columns, name='%')
    .sort_values(ascending=False)
    .to_frame()
)
feature_importance_se

To examine which dataset gives the best results, we will run each dataset 100 times and compare its results

In [None]:
# One list per (feature subset, metric). Each list receives one value per run,
# so after the loop every list has 100 entries.
dt_accuracy_mean = []
dt_precision_mean = []
dt_recall_mean = []
dt_f1_mean = []

dt_worst_accuracy_mean = []
dt_worst_precision_mean = []
dt_worst_recall_mean = []
dt_worst_f1_mean = []

dt_mean_accuracy_mean = []
dt_mean_precision_mean = []
dt_mean_recall_mean = []
dt_mean_f1_mean = []

dt_se_accuracy_mean = []
dt_se_precision_mean = []
dt_se_recall_mean = []
dt_se_f1_mean = []

# Base measurement names; the subset columns are "<base>_<suffix>".
base_features = ('fractal_dimension', 'symmetry', 'concave points', 'concavity', 'compactness',
                 'smoothness', 'area', 'perimeter', 'texture', 'radius')

# Metric functions, in the same order as the result lists of each subset below.
run_metrics = (accuracy_score, precision_score, recall_score, f1_score)

# Subset suffix (None = all features) -> its (accuracy, precision, recall, f1) lists.
runs_by_subset = {
    None: (dt_accuracy_mean, dt_precision_mean, dt_recall_mean, dt_f1_mean),
    'worst': (dt_worst_accuracy_mean, dt_worst_precision_mean, dt_worst_recall_mean, dt_worst_f1_mean),
    'mean': (dt_mean_accuracy_mean, dt_mean_precision_mean, dt_mean_recall_mean, dt_mean_f1_mean),
    'se': (dt_se_accuracy_mean, dt_se_precision_mean, dt_se_recall_mean, dt_se_f1_mean),
}

for i in range(100):

    # random_state=i gives a different split on every run but the same 100
    # splits on every re-execution, so the comparison is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=i)
    clf = DecisionTreeClassifier(random_state=i)

    # Every subset is evaluated on the same split, so the runs are directly comparable.
    for suffix, result_lists in runs_by_subset.items():
        if suffix is None:
            train_features, test_features = x_train, x_test
        else:
            subset_columns = [base + '_' + suffix for base in base_features]
            train_features = x_train.filter(items=subset_columns)
            test_features = x_test.filter(items=subset_columns)

        clf.fit(train_features, y_train)
        subset_predictions = clf.predict(test_features)

        for metric, results in zip(run_metrics, result_lists):
            results.append(metric(y_test, subset_predictions))

def show_mean_plot(type,normal,worst,mean_dt,se):
    """Plot four per-run metric series on the current figure.

    ``type`` is the metric name appended to each legend entry (the parameter
    shadows the builtin of the same name inside this function). ``normal``,
    ``worst``, ``mean_dt`` and ``se`` are the sequences of per-run values for
    each feature subset; their averages are shown in the title with four
    decimals.
    """
    labelled_series = (('Normal', normal), ('Worst', worst), ('Mean', mean_dt), ('SE', se))

    averages = [mean(values) for _, values in labelled_series]
    plt.title("Normal: {:.4f} - Worst: {:.4f} - Mean: {:.4f} - SE: {:.4f}".format(*averages))

    for label, values in labelled_series:
        plt.plot(values, label = label + ' ' + type)
    plt.legend(loc = 'lower right')



In [None]:
# Accuracy of each feature subset across the repeated runs.
show_mean_plot(type="accuracy", normal=dt_accuracy_mean, worst=dt_worst_accuracy_mean, mean_dt=dt_mean_accuracy_mean, se=dt_se_accuracy_mean)

In [None]:
# Precision of each feature subset across the repeated runs.
show_mean_plot(type="precision", normal=dt_precision_mean, worst=dt_worst_precision_mean, mean_dt=dt_mean_precision_mean, se=dt_se_precision_mean)

In [None]:
# Recall of each feature subset across the repeated runs.
show_mean_plot(type="recall", normal=dt_recall_mean, worst=dt_worst_recall_mean, mean_dt=dt_mean_recall_mean, se=dt_se_recall_mean)

In [None]:
# F1 score of each feature subset across the repeated runs.
show_mean_plot(type="f1_score", normal=dt_f1_mean, worst=dt_worst_f1_mean, mean_dt=dt_mean_f1_mean, se=dt_se_f1_mean)

As we can see, in general, the worst dataset produces the best results

The fact that the values are all very close might be explained by the fact that the decision tree algorithm is greedy: at each split it picks a locally optimal choice. Since the "worst" values give the best results, all the datasets containing those values are going to produce very similar trees. <br>
A way to improve decision trees is to avoid overfitting. This could be made by limiting maximum depth, pruning etc.
When we prune a tree we are correcting it after it has been fitted to the training dataset. It starts at the leaf nodes and removes those branches that do not affect the overall tree accuracy. It also lowers the complexity of the model.
<br>
In our code we can use sklearn to prune our trees using the ``ccp_alpha`` attribute inside the ``DecisionTreeClassifier`` function.


#### Cross validation

As we said before, our tree could be overfitted. To overcome that, we will try to find a better model using Grid Search Cross Validation. As parameters, we will use: criterion, comparing 'gini' and 'entropy'; max_depth, with 5 different values (4 to 8); min_samples_split, with values from 2 to 9; min_samples_leaf, with values from 1 to 4; ccp_alpha, to prune the tree, with values from 0 to 0.03 in steps of 0.005 to try and find the best ccp_alpha; and finally, max_features, with 'sqrt' and 'log2'.

In [None]:
from sklearn.model_selection import GridSearchCV, KFold

# Fresh 70/30 split, seeded so the search result can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
x_worst_train = x_train.filter(['fractal_dimension_worst','symmetry_worst','concave points_worst','concavity_worst','compactness_worst','smoothness_worst','area_worst','perimeter_worst','texture_worst','radius_worst'])
x_worst_test = x_test.filter(['fractal_dimension_worst','symmetry_worst','concave points_worst','concavity_worst','compactness_worst','smoothness_worst','area_worst','perimeter_worst','texture_worst','radius_worst'])


# Hyperparameter values to try; every combination is cross-validated.
param_grid = {
    "criterion": ['gini','entropy'],
    "max_depth": [4,5,6,7,8],
    "min_samples_split": range(2,10),
    "min_samples_leaf": range(1,5),
    "ccp_alpha": [0,0.005,0.01,0.015,0.02,0.025,0.03],
    "max_features": ['sqrt','log2']
}
# max_features='sqrt'/'log2' makes the tree consider a random subset of the
# features at each split, so the estimator is seeded; otherwise the scores
# (and the selected parameters) would change from run to run.
clf = DecisionTreeClassifier(random_state=42)
cv = KFold(n_splits=10)

# Grid search with 10-fold cross-validation, selecting on recall.
# n_jobs=-1 runs the independent fits on all available cores.
grid_search = GridSearchCV(clf, param_grid, scoring='recall', cv=cv, n_jobs=-1)
grid_search.fit(x_worst_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the selected model on the held-out test rows.
best_prediction = best_model.predict(x_worst_test)

best_accuracy = accuracy_score(y_test, best_prediction)
best_precision = precision_score(y_test, best_prediction)
best_recall = recall_score(y_test, best_prediction)
best_f1 = f1_score(y_test, best_prediction)

show_plot(best_prediction, best_accuracy, best_precision, best_recall, best_f1)

Having our improved model, we can see its parameters using `.get_params()` function

In [None]:
# Show every parameter of the selected tree (deep=True is the default behaviour).
best_model.get_params(deep=True)

A good thing about decision trees is that we can visualize the path followed by the algorithm to predict its values. The next code prints the tree of the best model.

In [None]:
from sklearn import tree
from matplotlib import pyplot as plt

fig = plt.figure(figsize=(25,20))
# class_names must list the classes in ascending numerical order. The diagnosis
# was encoded as B = 0, M = 1, so class 0 is benign and class 1 is malignant.
# feature_names is passed as a plain list, which every scikit-learn version accepts.
_ = tree.plot_tree(best_model,
                   feature_names=list(x_worst_train.columns),
                   class_names=['Benign', 'Malignant'],
                   filled=True,
                   fontsize=12)

### Support Vector Machine

Support vector machines (SVM) attempt to find a hyperplane in a space of N dimensions, where N is the number of classification attributes. <br>This hyperplane is used to separate the data into two distinct classes, in our case, benign and malignant cancer.

We will use the same datasets as before, and start by creating an SVM classifier object and fitting it to our training data. We then make a prediction using the test data and calculate the accuracy and precision, as well as the confusion matrix.

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, accuracy_score

# SVC with scikit-learn's default settings, fitted on the full feature set
# (fit returns the estimator itself).
svm = SVC().fit(x_train, y_train)

# Display the parameters the classifier is using.
svm.get_params()

In [None]:
# Predict the held-out rows with the SVM, then score those predictions.
y_pred = svm.predict(x_test)

svm_accuracy, svm_precision, svm_recall, svm_f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Confusion matrix for the SVM trained on every feature.
show_plot(y_pred, accuracy=svm_accuracy, precision=svm_precision, recall=svm_recall, f1_score=svm_f1)

As we can see in the confusion matrix, the SVM classifier shows promising results. However, we can try to improve it by removing attributes that could muddle the model. This would also reduce the number of dimensions we have to deal with, which simplifies the model.
<br> We'll do this by dividing the features into three groups: worst, mean and standard error. We'll then run the SVM classifier on each group and compare the results.

#### Dataset with only "Worst" features

In [None]:
# Names of the ten "*_worst" columns, in the order used for the sub-frames.
worst_columns = [
    base + '_worst'
    for base in ('fractal_dimension', 'symmetry', 'concave points', 'concavity', 'compactness',
                 'smoothness', 'area', 'perimeter', 'texture', 'radius')
]
x_worst_train = x_train.filter(items=worst_columns)
x_worst_test = x_test.filter(items=worst_columns)

# Refit the same SVC object on the reduced feature set and predict the test rows.
svm_predictions_worst = svm.fit(x_worst_train, y_train).predict(x_worst_test)

svm_accuracy_worst, svm_precision_worst, svm_recall_worst, svm_f1_worst = (
    metric(y_test, svm_predictions_worst)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

show_plot(svm_predictions_worst, accuracy=svm_accuracy_worst, precision=svm_precision_worst, recall=svm_recall_worst, f1_score=svm_f1_worst)

#### Dataset with only "Mean" features

In [None]:
# Names of the ten "*_mean" columns, in the order used for the sub-frames.
mean_columns = [
    base + '_mean'
    for base in ('fractal_dimension', 'symmetry', 'concave points', 'concavity', 'compactness',
                 'smoothness', 'area', 'perimeter', 'texture', 'radius')
]
x_mean_train = x_train.filter(items=mean_columns)
x_mean_test = x_test.filter(items=mean_columns)

# Refit the same SVC object on the reduced feature set and predict the test rows.
svm_predictions_mean = svm.fit(x_mean_train, y_train).predict(x_mean_test)

svm_accuracy_mean, svm_precision_mean, svm_recall_mean, svm_f1_mean = (
    metric(y_test, svm_predictions_mean)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

show_plot(svm_predictions_mean, accuracy=svm_accuracy_mean, precision=svm_precision_mean, recall=svm_recall_mean, f1_score=svm_f1_mean)

#### Dataset with only "Standard Error" features

In [None]:
# Names of the ten "*_se" columns, in the order used for the sub-frames.
se_columns = [
    base + '_se'
    for base in ('fractal_dimension', 'symmetry', 'concave points', 'concavity', 'compactness',
                 'smoothness', 'area', 'perimeter', 'texture', 'radius')
]
x_se_train = x_train.filter(items=se_columns)
x_se_test = x_test.filter(items=se_columns)

# Refit the same SVC object on the reduced feature set and predict the test rows.
svm_predictions_se = svm.fit(x_se_train, y_train).predict(x_se_test)

svm_accuracy_se, svm_precision_se, svm_recall_se, svm_f1_se = (
    metric(y_test, svm_predictions_se)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

show_plot(svm_predictions_se, accuracy=svm_accuracy_se, precision=svm_precision_se, recall=svm_recall_se, f1_score=svm_f1_se)

From the previous tests we can conclude that, just like with the decision tree, the dataset with the worst features has the best results.
<br> We can also see that SVM gets better values all around compared to the decision tree, which is expected since SVM can handle outliers better than the decision tree by allowing some misclassification. 

We are getting good results but we can still try to improve our model. From now on we'll only use the dataset with the "Worst" features, and we'll try to improve the model by tuning the hyperparameters. We'll use the GridSearchCV function from sklearn to find the best hyperparameters for our model.

In [None]:
from sklearn.model_selection import GridSearchCV, KFold


# Values shared by every kernel.
shared_grid = {
    'C': [0.1, 1, 4], #penalty parameter of error term -> increase can lead to over fitting
    'shrinking': [True, False],
}

# One sub-grid per kernel. SVC only reads 'degree' for the poly kernel and does
# not use 'gamma' for the linear kernel, so crossing them with every kernel
# would cross-validate identical models several times.
# 'verbose' is not searched: it is a logging switch, not a hyperparameter, and
# enabling it on the estimator prints solver output for every single fit.
param_grid = [
    dict(shared_grid, kernel=['linear']),
    dict(shared_grid, kernel=['rbf'],
         gamma=['scale', 'auto']), #the higher the gamma, the more influence closer points have
    dict(shared_grid, kernel=['poly'],
         gamma=['scale', 'auto'],
         degree=[2, 3, 4]), #1 == linear
]

cv = KFold(n_splits=10)

# Grid search with 10-fold cross-validation, selecting on recall.
# verbose=1 prints a one-line summary of the number of fits; n_jobs=-1 runs the
# independent fits on all available cores.
grid_search = GridSearchCV(svm, param_grid, scoring='recall', cv=cv, n_jobs=-1, verbose=1)
grid_search.fit(x_worst_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the selected model on the held-out test rows.
best_prediction = best_model.predict(x_worst_test)

best_accuracy = accuracy_score(y_test, best_prediction)
best_precision = precision_score(y_test, best_prediction)
best_recall = recall_score(y_test, best_prediction)
best_f1 = f1_score(y_test, best_prediction)

show_plot(best_prediction, best_accuracy, best_precision, best_recall, best_f1)