In [5]:
import numpy as np  # for handling multi-dimensional array operation
import pandas as pd  # for reading data from csv 
import statsmodels.api as sm  # for finding the p-value 
from sklearn.preprocessing import MinMaxScaler  # for normalization

from sklearn.model_selection import train_test_split as tts # to split our data into train and test samples.
#If you want to see how to implement  this split from scratch you can check out my other project Glass Classification using KNN from Scratch in my profile.

from sklearn.metrics import accuracy_score # for calculating our accuracy in the end 
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

# Data Preparation

In [6]:
data = pd.read_csv('./breast-cancer/data.csv')

In [7]:
data.head() 

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


Since SVM accepts numerical values we need to transform the values in diagnosis feature. 
- M --> Malignant(cancer), 
- B --> Benign(harmless)

Other than that first and last colums are useless for us. So lets drop those columns.

In [8]:
diagnosis_map = {'M':1, 'B':-1}  #We use -1 instead of 0 because of how SVM works. 
data['diagnosis'] = data['diagnosis'].map(diagnosis_map)

data.drop(data.columns[[-1, 0]], axis=1, inplace=True) # axis 1 -> columns, 'inplace = True' means we do the drop operation inplace and return None.

In [9]:
data.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Train-test Split and Normalization

#### Train-test Split: 
_We split our data into two seperate subsets which are train and test subsets. Like the names suggest we use train subset to train our model and test subset to test how is our model performing. Our objective doing this is to estimate the performance of the machine learning model on data not used to train the model so that we can get an objective result how is our model performing._

#### Normalization:
_Normalization is basically scaling the values to use some common scale without losing information. If data we are using has some features that has drastically different scale of numbers then normalization is crucial for algorithms that uses numeric values. If we don't use normalization in these kind of algorithms that huge difference could cause problems when we combine those features during modeling._

In [10]:
y = data.loc[:, 'diagnosis']  # Select diagnosis column.

X = data.iloc[:, 1:]  # Select columns other than diagnosis.

X_normalized = MinMaxScaler().fit_transform(X.values) # Scaling the values in X between (0,1).
X = pd.DataFrame(X_normalized)

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)


# SVM IMPLEMENTATION


“Support Vector Machine” (SVM) is one of the supervised machine learning algorithms. It is generally used for binary classification. Suppose we have a dataset with labeled examples($x_i$, $y_i$); using this dataset algorithms defines a hyperplane that has the following properties:
- Goal of this hyperplane is to create a seperation between two classes with a maximum margin. 
- Our equation for algorithm is $wx + b = 0$. Our equation for margin is $2/{||w||}$
- $wx_i + b >= 1$ cho $y_i = 1$ và $wx_i + b <= 1$ for $y_i = -1$; so our condition for correct classification is equation and class has to have same sign and equation for our condition is $y_i(wx_i + b) >= 1$.

So to define this hyperplane we need come up with optimal values for w and b. For this we will used the cost function and then apply gradient descent. The lost function we will be using is hinge loss. Equation of hinge loss is $l = max(0, 1 - y_i(wx_i + b))$. What this means is,
- If l is 0, that means '1 - $y_i(wx_i + b)$' is a smaller or equal to 0. For this to happen $y_i(wx_i + b)$ has to be bigger or equal to 1. We already know that '$y_i(wx_i + b) >= 1$' is our condition for correct classification. So to sum it up, if l is equal to 0 that means classification we did is correct.
- If l is something other than 0,  that means our classification is wrong.

On top of the hinge loss to we also add a regularization parameter to balance the margin maximization and loss. Our final function when we add this parameter will be 
J = λ\$||w||^2$ + $\frac{1}{n}$  $\sum_{i=1}^{n} max(0, 1 - y_i(wx_i + b)) $.

Now that we have the loss function, we take the partial derivatives with respect to the weights and bias to find gradients.<br/>
If $y_i(w*x_i + b)) >= 1$:
- $\frac{dJ}{w}$ = 2λw
- $\frac{dJ}{b}$ = 0 <br/>
 
Else:
- $\frac{dJ}{w}$ = 2λw - $y_ix_i$
- $\frac{dJ}{b}$ = $y_i$

Our update rule based on those derivatives are:
- w = w - $\alpha * dw$
- b = b + $\alpha * db$




In [11]:
class SVM:
    
    def init(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate # 𝛼 in formula
        self.lambda_param = lambda_param
        self.n_iters = n_iters 
        self.w = None
        self.b = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                self.update(x_i, y[idx])

    def update(self,x,y):
        distance = 1 - (y * (np.dot(x, self.w) + self.b))
        hinge_loss = max(0,distance)
        if(hinge_loss == 0):
            self.w = self.w - self.lr * (2 * self.lambda_param * self.w)
        else: 
            self.w = self.w - self.lr * (2 * self.lambda_param * self.w - np.dot(x,y))
            self.b = self.b + self.lr * y
        
        
    def predict(self, X):
        eq = np.dot(X, self.w) + self.b
        return np.sign(eq)


In [12]:
clf = SVM()
clf.init()
clf.fit(X_train.to_numpy(), y_train.to_numpy())

In [13]:
y_test_predicted = clf.predict(X_test.to_numpy())

In [14]:
print("accuracy on test dataset: {}".format(accuracy_score(y_test.to_numpy(), y_test_predicted)))
print("recall on test dataset: {}".format(recall_score(y_test.to_numpy(), y_test_predicted)))
print("precision on test dataset: {}".format(precision_score(y_test.to_numpy(), y_test_predicted)))


accuracy on test dataset: 0.9649122807017544
recall on test dataset: 0.9069767441860465
precision on test dataset: 1.0
