# Finding the best model to fit your data

In [None]:
import pandas as pd
# The CSV is read with header=None, so its first row is treated as data
# and the columns are addressed by position.
df = pd.read_csv('sonar.all-data.csv', header=None)
# Features: every column except the last one. Class labels: the last column.
sonar, sonar_class = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split
# test_size is the fraction of the complete dataset held out for testing
# (0.25 -> 25% of the rows).
# random_state seeds the pseudorandom shuffling so that the same split is
# produced every time the script runs.
sonar_train, sonar_test, sonar_class_train, sonar_class_test = train_test_split(
    sonar,
    sonar_class,
    test_size=0.25,
    random_state=42,
)

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame
# Fit the scaler on the training features only, so that no statistics of
# the test set influence the transformation.
scaler = StandardScaler().fit(sonar_train)
# Apply the transformation learned from the training set to both sets
# (each result is an array, not a DataFrame).
sonar_train_scaled = scaler.transform(sonar_train)
sonar_test_scaled = scaler.transform(sonar_test)

# SVM classification algorithm

In [None]:
from sklearn.svm import SVC
# Linear-kernel SVM. probability=True is needed because predict_proba is
# called further down to build the ROC curve.
classifier = SVC(
    kernel='linear',
    probability=True,
    random_state=0,
)
# Train on the scaled training features and their class labels.
classifier.fit(sonar_train_scaled, sonar_class_train)

## The prediction of the class, M or R

In [None]:
# Predict a class label for every test sample from its scaled features.
sonar_test_pred = classifier.predict(sonar_test_scaled)

In [None]:
# Display the labels predicted for the test set.
sonar_test_pred

## The test set responses

In [None]:
# Display the true labels of the test set, for comparison with the predictions.
sonar_class_test

## Confusion matrix calculation

In [None]:
# Confusion matrix (rows: true classes, columns: predicted classes)
# followed by the overall accuracy on the test set.
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_true=sonar_class_test, y_pred=sonar_test_pred)
print(cm)
accuracy_score(y_true=sonar_class_test, y_pred=sonar_test_pred)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
# Draw the confusion matrix computed above, labelling the axes with the
# class names stored on the fitted classifier.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=classifier.classes_,
)
disp.plot()
plt.show()

In [None]:
from sklearn.metrics import classification_report
# Text summary of the per-class metrics for the test-set predictions.
print(classification_report(y_true=sonar_class_test, y_pred=sonar_test_pred))

## Plotting the ROC curve

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score

# Predicted class probabilities for the test set. The classifier was trained
# on standardized features, so it must be given the scaled test matrix, not
# the raw one. Column i holds the probability of classifier.classes_[i].
y_pred_prob_svm = classifier.predict_proba(sonar_test_scaled)

# Encode the true labels as 0/1 such that 1 marks classifier.classes_[1],
# i.e. the class whose probability is in column 1 of y_pred_prob_svm.
# (pd.factorize numbers labels by order of appearance in the test set, which
# does not necessarily agree with the classifier's class order.)
positive_class = classifier.classes_[1]
sonar_class_test_roc = (sonar_class_test == positive_class).astype(int).tolist()

fpr, tpr, _ = roc_curve(sonar_class_test_roc, y_pred_prob_svm[:,1])

plt.plot(fpr, tpr, label="svm")

# A ROC curve plots the true positive rate against the false positive rate.
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.legend(fontsize=15)

## Calculating the Area Under the Curve (AUC)

In [None]:
# Area under the ROC curve, computed from the true test labels and the
# probabilities in column 1 of y_pred_prob_svm.
svm_auc = roc_auc_score(sonar_class_test, y_pred_prob_svm[:,1])
print(f'model 1 AUC score: {svm_auc}')

## Cross validation

Cross-validation is a technique for evaluating how well a model performs on new data: the
model is trained on a subset of the input (training) data and validated (tested) on the
remaining, unseen data. Unlike a single train-test split, this procedure is typically
repeated over several different partitions of the data.
Cross-validation methods fall into two categories: exhaustive and non-exhaustive.
Only the most common methods are listed here.

### Exhaustive
Exhaustive methods test the model on every possible way of splitting the data
into a training part and a validation part.

1. Leave one out cross validation

2. Leave p-out cross validation

### Non-exhaustive
Non-exhaustive methods do not evaluate every possible split of the original data set;
only a selection of the possible splits is used.

1. The single validation (hold-out test)

2. k-fold cross validation

#### k-fold cross validation

Divide the input dataset into k groups of samples of (approximately) equal
size. These groups are called folds. In each of the k rounds, the model is trained
on k-1 folds and the one remaining fold is used as the test set. The estimate of the
performance of the model is the mean of the accuracies obtained over the k rounds.

### Grid Search

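The majority of machine learning algorithms have parameters that can be adjusted to vary how the model learns.
These parameters are called hyperparameters.
Grid search tries every combination of a given set of hyperparameter values and keeps the combination with the best cross-validation score.
In the case of the SVC that we are using here, the hyperparameters searched are `C` and `kernel`.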

## K-fold plus grid search application in Python

In [None]:
## Cross validation (here K-fold) combined with a grid search
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# 5 folds, shuffled before splitting, with a fixed seed (42) so the folds
# are the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

## Hyperparameter grid (see the SVC() help for the meaning of each one):
## 10 values of C x 4 kernels = 40 candidate settings.
grid = {
    'C': [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

model = SVC()

## Score every candidate by accuracy on the folds defined above, using the
## scaled training data only.
SVC_cv_grid = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=kfold,
)
SVC_cv_grid.fit(sonar_train_scaled, sonar_class_train)

In [None]:
# Print the best hyperparameter combination found by the grid search and
# its mean cross-validated score (accuracy, as set by scoring='accuracy').
print("Best parameters found: ", SVC_cv_grid.best_params_)
print("Best cross-validation score: ", SVC_cv_grid.best_score_)

In [None]:
# Predict on the scaled test set using the best estimator, i.e. the SVC
# configured with the best hyperparameters found by the grid search.
sonar_test_prediction = SVC_cv_grid.best_estimator_.predict(sonar_test_scaled)

In [None]:
# Confusion matrix of the tuned model on the test set.
confusion_matrix(y_true=sonar_class_test, y_pred=sonar_test_prediction)

In [None]:
# Accuracy of the tuned model on the test set.
accuracy_score(y_true=sonar_class_test, y_pred=sonar_test_prediction)

In [None]:
# Print the classification report of the tuned model on the test set
print(classification_report(y_true=sonar_class_test, y_pred=sonar_test_prediction))

In [None]:

# Mean accuracy of the best estimator on the scaled test set, obtained
# through the estimator's own score() method.
accuracy = SVC_cv_grid.best_estimator_.score(sonar_test_scaled,sonar_class_test)
print("Accuracy on test set:", accuracy)