# Chapter 1

### KNN

```
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Convert the DataFrame to NumPy arrays: `target` is the label column and
# every other column is treated as a feature.
y = df['target'].values
X = df.drop('target', axis=1).values.astype(float)
# Split BEFORE scaling so that no statistics computed from the test rows
# leak into the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Initialize and train model
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='minkowski')
knn.fit(X_train, y_train)
# Predict the test set class with the trained model
predicted_y = knn.predict(X_test)
# Per-class probability estimates for the test set from the trained model
predicted_y_prob = knn.predict_proba(X_test)
# Accuracy on the testing set, printed as a percentage
print(accuracy_score(y_test, predicted_y) * 100)
# Accuracy-vs-K chart with +/- 1 and +/- 3 standard-deviation bands.
# NOTE(review): Ks, mean_acc and std_acc are not defined in this snippet;
# presumably they come from repeating the steps above for each K — confirm.
k_values = range(1, Ks)
plt.plot(k_values, mean_acc, 'g')
for band_width, band_style in ((1, {}), (3, {"color": "green"})):
    band = band_width * std_acc
    plt.fill_between(k_values, mean_acc - band, mean_acc + band, alpha=0.10, **band_style)
# Legend entries follow the drawing order: the line first, then the two bands
plt.legend(('Accuracy ', '+/- 1xstd','+/- 3xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()
# Complexity graph: training and testing accuracy against the same neighbors axis.
# NOTE(review): neighbors, train_accuracies and test_accuracies are defined
# elsewhere; the .values() calls suggest dicts — confirm.
for accuracy_by_k, curve_label in (
    (train_accuracies, "Training Accuracy"),
    (test_accuracies, "Testing Accuracy"),
):
    plt.plot(neighbors, accuracy_by_k.values(), label=curve_label)
```

### Logistic Regression

```
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    jaccard_score,
    log_loss,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# Specify independent and dependent features
X = np.asarray(df[['A', 'B', 'C', 'D', 'E', 'F', 'G']])
y = np.asarray(df['target'])

# Split into train and test set FIRST, so the scaler below never sees the
# test rows (fitting it on the full dataset would leak test statistics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Preprocess dataset: fit the scaler on the training rows only and apply the
# same fitted transformation to both splits
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
LR = LogisticRegression(C=0.01, solver='liblinear')
LR.fit(X_train, y_train)

# Predict the test set
y_pred = LR.predict(X_test)

# See classification report and confusion matrix.
# The results are printed explicitly; as bare expressions they would be
# discarded when this snippet runs as a script.
print(classification_report(y_test, y_pred))
# labels=[1, 0] lists class 1 before class 0 in the matrix
print(confusion_matrix(y_test, y_pred, labels=[1, 0]))

# Predicted probability on test set for positive/target class
y_pred_prob = LR.predict_proba(X_test)[:, 1]

# Evaluate the model
print(jaccard_score(y_test, y_pred, pos_label=0))
print(log_loss(y_test, y_pred_prob))
print(roc_auc_score(y_test, y_pred_prob))
```

### SVM

```
# Method 1 : scikit-learn
# `time`, `roc_auc_score` and `hinge_loss` are imported here so the snippet
# runs on its own instead of failing with NameError.
import time

from sklearn.metrics import hinge_loss, roc_auc_score
from sklearn.svm import LinearSVC
# OR from sklearn.svm import SVC
# instantiate a scikit-learn SVM model
# to indicate the class imbalance at fit time, set class_weight='balanced'
# for reproducible output across multiple function calls, set random_state to a given integer value
svm = LinearSVC(class_weight='balanced', random_state=42, loss="hinge", fit_intercept=False)
# svm = SVC(kernel='linear', gamma=.5, probability=True)  # Another way
# train a linear Support Vector Machine model using Scikit-Learn and time the fit
t0 = time.time()
svm.fit(X_train, y_train)
sklearn_time = time.time() - t0

# Method 2 : Use snapml library
# in contrast to scikit-learn's LinearSVC, Snap ML offers multi-threaded CPU/GPU training of SVMs
from snapml import SupportVectorMachine
# NOTE(review): the GPU variant is only constructed here and never fitted;
# the CPU variant is the one that is trained and timed below.
snapml_svm_gpu = SupportVectorMachine(class_weight='balanced', random_state=42, use_gpu=True, fit_intercept=False)
snapml_svm_cpu = SupportVectorMachine(class_weight='balanced', random_state=42, n_jobs=4, fit_intercept=False)
t0 = time.time()
model = snapml_svm_cpu.fit(X_train, y_train)
snapml_time = time.time() - t0

# Predict hard class labels with the scikit-learn model
y_pred = svm.predict(X_test)

# Get confidence scores from the scikit-learn model
y_pred_conf = svm.decision_function(X_test)

# Evaluate model: ROC AUC is computed from the continuous confidence scores
# rather than the hard labels, so the ranking of samples is taken into account
print(roc_auc_score(y_test, y_pred_conf))

# Evaluate hinge loss on the same confidence scores
print(hinge_loss(y_test, y_pred_conf))


```

### Key Terms / Jargons

- decision boundary: the surface separating different predicted classes
- linear classifier: a classifier that learns linear decision boundaries
- linearly separable: a data set whose classes can be perfectly separated by a linear classifier
- loss function : a function that provides penalty score that determines how poorly the model performs

<center><img src="images/01.01.png" style="width: 400px; height: 300px;"/></center>
<center><img src="images/01.02.png" style="width: 400px; height: 300px;"/></center>

### Overfitting, Underfitting, Bias variance tradeoff

- Overfitting : 
    - Model also memorises / trains on noise that resides within training data. 
    - Model performs well when evaluating on training data but does not perform well on unseen data
    - High variance is responsible for this error because of also capturing noise.
    - Diagnosis: cross-validated prediction error on the test set is much higher than the error on the training set
    - Possible remedy : Decrease model complexity, gather more data
- Underfitting :
    - Model is too simple to catch the pattern, model is not good enough to capture the underlying pattern.
    - Model is bad on both training and unseen data
    - Model is not flexible enough to approximate the prediction values
    - High bias is responsible for this error
    - Diagnosis: cross-validated prediction errors on the train and test sets are roughly equal, but both are undesirably high
    - Possible remedy : Increase model complexity, gather more features
- Bias-Variance trade-off :
    - Generalization error = bias^2 + variance + irreducible error (noise)
    - bias = error term that tells how on average real value is different from predicted value
    - variance = error term that tells how predicted value varies over different training sets
    - When model complexity increases, variance increases and bias decreases
    - When model complexity decreases, variance decreases and bias increases
    - The sweet spot is the minimised generalization error, which gives the optimised model

# Chapter 2

### Prediction

- dot product of features and co-efficients

```
# Under the hood, the raw prediction is the dot product of the features X and
# the coefficients, plus the intercept:
#   - changing the intercept shifts the boundary up or down
#   - changing a coefficient changes the slope of the boundary
# `X @ coef_.T` gives the same result as `coef_ @ X` for a single sample and
# also works when X holds many samples (one per row).
# NOTE(review): assumes coef_ is laid out as (n_classes, n_features) or
# (n_features,), as in scikit-learn linear models — confirm for the model in use.
y = X @ model.coef_.T + model.intercept_
```

### Logistic Regression vs Linear Regression

- Linear regression:
    - Finds a line that fits and aligns tightly with the data
    - goal: line is the trend, any new value will appear *ON* the line
    - Predicts the value itself
    - Predicted value is a continuous value that can fall outside the range 0 to 1
- Logistic regression
    - Finds a line / plane that separates the data by maximizing the distance
    - goal : Line is a no-man's land. New value will appear on *EITHER SIDE* of the line
    - predicts which class will the value fall in (sigmoid of the value).
    - Predicted value is a probability between 0 and 1, which is then thresholded into a discrete class (0 or 1)
    - construction: https://vitalflux.com/wp-content/uploads/2022/03/logistic-regression-model-3.png

### Why cannot we use Linear regression in Linear classification?


- regression loss : 
    - loss is higher when it is further away from true target value. 
    - loss happens both ways (since it is continuous value).
    - squared loss curve perfectly captures this behavior
    - Goal : Capture the closeness of values to the original continuous value on both side (positive or negative)
- logistic loss: 
    - loss is higher only for incorrect classifications. 
    - loss happens in one direction (since it is binary classification.)
    - squared loss captures only one direction correctly; in the other direction it also penalizes predictions that are confidently correct, so even a perfect model accumulates squared loss and would wrongly look like the worst model
    - Goal : Capture the probability of the incorrectly classified values on the incorrect side (sign does not matter as long as the discrete value is an incorrect value; a correct value has 0 loss)
    - we need to eliminate the mistaken side by introducing the logistic function, that only takes range from 0 to 1.
- Hinge loss:
    - Correct prediction has 0 loss
    - Incorrect prediction has linear loss
- 0-1 loss:
    - It counts the number of misclassifications and averages it over the total number of samples.

<center><img src="images/02.01.png" style="width: 400px; height: 300px;"/></center>
<center><img src="images/02.04.png" style="width: 400px; height: 300px;"/></center>


### Optimization (minimization problem)

- This is how gradient descent works
- We need to find the minimum value for a given function (eg:  loss function)


```
import numpy as np
from scipy.optimize import minimize

# Objective to minimize: z = A*x^3 + B*y + C, with x = x[0] and y = x[1]
def cubic_function(x, A, B, C):
    """Evaluate A * x[0]**3 + B * x[1] + C at the point ``x``.

    ``x`` is indexed at positions 0 and 1; ``A``, ``B`` and ``C`` are the
    cubic coefficient, the linear coefficient and the constant offset.
    """
    cubic_term = A * x[0] ** 3
    linear_term = B * x[1]
    return cubic_term + linear_term + C

# Coefficients for the cubic function
A_coefficient = 2.0
B_coefficient = -3.0
C_constant = 5.0

# Initial guess
initial_guess = [1, 1]

# With these coefficients the function is unbounded below (it keeps falling
# as x -> -inf or y -> +inf), so an unconstrained search has no minimum to
# converge to. Restricting the search to a box makes the problem well-posed.
# NOTE: `bounds` with method='Nelder-Mead' requires SciPy >= 1.7.
search_bounds = [(-2.0, 2.0), (-2.0, 2.0)]

# Minimize the cubic function inside the box
result = minimize(
    cubic_function,
    initial_guess,
    args=(A_coefficient, B_coefficient, C_constant),
    method='Nelder-Mead',
    bounds=search_bounds,
)

# Print the result, but only claim a minimum when the optimizer converged
if result.success:
    print("Minimum found at x:", result.x)
    print("Minimum function value (z):", result.fun)
else:
    print("Optimization did not converge:", result.message)

```

### Log loss and hinge loss

```
import numpy as np
import matplotlib.pyplot as plt
# Mathematical functions for logistic and hinge losses
def log_loss(raw_model_output):
    """Return the logistic loss log(1 + exp(-raw_model_output)), element-wise.

    Computed as ``np.logaddexp(0, -z)``, i.e. log(exp(0) + exp(-z)). This is
    the same quantity, but it stays finite when ``z`` is very negative,
    whereas evaluating ``np.exp(-z)`` directly overflows to ``inf``.
    """
    return np.logaddexp(0, -raw_model_output)
def hinge_loss(raw_model_output):
    """Return the hinge loss max(0, 1 - raw_model_output), element-wise."""
    margin_shortfall = 1 - raw_model_output
    return np.maximum(0, margin_shortfall)

# Evaluate both losses on 1000 evenly spaced raw model outputs in [-2, 2]
# and overlay the two curves on one chart
grid = np.linspace(-2, 2, 1000)
for curve_label, loss_fn in (('logistic', log_loss), ('hinge', hinge_loss)):
    plt.plot(grid, loss_fn(grid), label=curve_label)
plt.legend()
plt.show()
```