In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

# Classification 1

The iris flower: https://www.fs.usda.gov/wildflowers/beauty/iris/flower.shtml

In [None]:
# Load the iris dataset bundled with scikit-learn.
data = datasets.load_iris()

# Feature table built from the raw measurements, without the petal length column.
df = pd.DataFrame(data["data"], columns=data["feature_names"])
df = df.drop(columns=["petal length (cm)"])

# Constant column of ones, placed right after the two sepal columns.
df.insert(loc=2, column="const", value=1)

# Text label for every row, plus a boolean "is this row setosa?" column
# positioned just before it.
df["variety"] = data["target_names"][data["target"]]
df.insert(loc=4, column="setosa", value=df["variety"].eq("setosa"))

In [None]:
# Input columns used by the models below (two sepal measurements + constant).
xcols = ["sepal length (cm)", "sepal width (cm)", "const"]

# Hold out exactly 10 rows for testing; the fixed random_state keeps the
# split identical from run to run.
train, test = train_test_split(df, random_state=5, test_size=10)
test

### Model 1: Predict petal width

- regression problem

In [None]:
# Inputs for the regression; the third entry is left blank in this template.
xcols = ["sepal length (cm)", "sepal width (cm)", ???]
# Template blank: name of the column to predict.
ycol = ???

# 1. initialize model
# NOTE(review): the manual `X @ c` check further down adds no intercept term,
# so the model presumably has to be created without a fitted intercept - confirm.
reg_model = LinearRegression(???)
# 2. fit using train data
reg_model.???
# 3. predict for test data and add predictions as a column
# A later cell reads test["pet_width_predictions"], so that is the column
# name this assignment needs to create.
test[???] = reg_model.???
test

Review of the `score` method, which lets us compute the R^2 score.

In [None]:
# Template blank: complete with the `score` call to obtain the R^2 value.
reg_model.???

##### Math behind model 1: how does it predict?

Recall that we can extract coefficients using `<model obj>.coef_`.

In [None]:
# Template blank: complete with the fitted model's `coef_` attribute.
reg_model.???

In [None]:
# converting into a vertical array / vector and assigning values to c
# (template blank: the missing piece must turn coef_ into a column so that
# the `X @ c` product used below is a matrix @ vertical-vector product)
c = reg_model.coef_.???
c

In [None]:
# Feature matrix for the test rows, as a plain NumPy array.
X = test[xcols].to_numpy()
X

Let's use Linear Algebra to do the prediction `y = X @ c`.

In [None]:
# Matrix @ vertical vector: one row-times-coefficients product per test row.
y = X @ c
y

In [None]:
# comparing with model predictions
# (this column is added to `test` by the Model 1 cell; the values are expected
# to agree with the `X @ c` result)
test["pet_width_predictions"]

## LogisticRegression

- classification model
- predict categorical labels

### Model 2 - part a: Predict whether flower is "setosa"

- classification problem, specifically binary classification: True / False

In [None]:
# Same three inputs as before, including the constant column.
xcols = ["sepal length (cm)", "sepal width (cm)", "const"]
# Template blank: the label column to predict.
ycol = ???

# 1. initialize model
# Template blank. NOTE(review): the manual `X @ c` check later in this section
# has no intercept term, so the model presumably needs its intercept disabled - confirm.
cls_model = ???
# 2. fit using train data
cls_model.fit(train[xcols], train[ycol])
# 3. predict for test data and add predictions as a column
# A later cell reads test["setosa_predictions"], so that is the column name needed here.
test[???] = cls_model.predict(test[xcols])
test

What is the accuracy? That is, what percent of the time is it correct?

In [None]:
# Score the classifier on the held-out test rows (the accuracy asked about above).
cls_model.score(test[xcols], test[ycol])

##### Math behind model 2 - part a: how does it predict?

In [None]:
# Reshape the coefficients into a single column (shape (n, 1)).
cls_model.coef_.reshape(-1, 1)

In [None]:
# converting into a vertical array / vector and assigning values to c
# NOTE(review): presumably gives the same column as the reshape(-1, 1) shown
# in the previous cell, since coef_ should have a single row here - confirm.
c = cls_model.coef_.T

In [None]:
# Feature matrix for the test rows, as a plain NumPy array.
X = test[xcols].to_numpy()
X

Let's use Linear Algebra to do the prediction `y = X @ c`.

In [None]:
# Raw linear score for each test row (not yet a True/False label).
X @ c

Negative values => False and positive => True.

In [None]:
# Template blank: turn the scores into booleans using the sign rule stated above.
X @ c ???

In [None]:
# comparing with model predictions
# (this column is added to `test` by the Model 2 - part a cell)
test["setosa_predictions"]

### Model 2 - part b: Predict probability of flower being "setosa"

- classification problem, probability between 0 and 1

#### `<model object>.predict_proba(X)`

- Probability estimates for each class.
- returns a 2-D numpy array with one row per sample and one column per class (same order as `classes_`):
    - [[False probability, True probability], ...]
- documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.predict_proba

In [None]:
# Template blank: complete with the `predict_proba` call described above.
cls_model.???

Extract just the True probabilities.

In [None]:
# Template blank: add indexing that keeps only the True-probability column.
cls_model.predict_proba(test[xcols])???

In [None]:
# Keep column 1 of predict_proba (the True probability, per the
# [[False, True], ...] layout described above) and store it on the test frame.
test["setosa_prob"] = cls_model.predict_proba(test[xcols])[:, 1]
test

##### Math behind model 2 - part b: how does it predict?

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of ``x``.

    ``x`` may be a number or a NumPy array; arrays are handled element-wise.
    Outputs lie between 0 and 1, with ``sigmoid(0) == 0.5``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Evaluate the sigmoid on an evenly spaced grid from -10 up to (not including) 10.
x = np.arange(-10, 10, 0.1)
y = sigmoid(x)

# Draw the curve on an explicit Axes with labels, so the figure can be read
# on its own; the trailing semicolon keeps the cell from echoing a repr.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_xlabel("x")
ax.set_ylabel("sigmoid(x)")
ax.set_title("Sigmoid function")
ax.grid(True);

Let's use Linear Algebra together with `sigmoid` to do the prediction `y = sigmoid(X @ c)`.

In [None]:
# Predict the probability of True
# (template blank: combine the `sigmoid` function defined above with the `X @ c` scores)
???

In [None]:
# comparing with model predictions
# (this column was filled from predict_proba earlier in this section)
test["setosa_prob"]

### Model 3: Predict variety of flower

- classification problem, specifically multi-class classification for `variety`

In [None]:
# PREDICT: which of the 3 varieties is a particular Iris?
xcols = ["sepal length (cm)", "sepal width (cm)", "const"]
# Template blank: the multi-class label column.
ycol = ???

# 1. initialize model
# Template blank. NOTE(review): the later `X @ c` scoring has no intercept term,
# so the model presumably needs its intercept disabled - confirm.
mult_model = ???
# 2. fit using train data
mult_model.fit(train[xcols], train[ycol])
# 3. predict for test data and add predictions as a column
# Later cells read test["variety_predictions"], so that is the column name needed here.
test[???] = mult_model.predict(test[xcols])
test

What is the accuracy?

In [None]:
# Score the multi-class model on the held-out test rows.
mult_model.score(test[xcols], test[ycol])

##### Math behind model 3: how does it predict?

In [None]:
# Coefficient matrix of the fitted multi-class model.
mult_model.coef_

In [None]:
# taking transpose and assigning values to c
# (after the transpose, each column of c holds the coefficients for one variety)
c = mult_model.coef_.T
c

Each column contains coefficients for scoring a different `variety`.

In [None]:
c.shape # 3 varieties

How can we determine the name of each `variety`?

#### `<model object>.classes_`

- ndarray containing label for each class.
- documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
# Template blank: complete with the `classes_` attribute described above.
mult_model.???

In [None]:
# Feature matrix for the test rows, as a plain NumPy array.
X = test[xcols].to_numpy()
X

Extracting "setosa" coefficients.

In [None]:
# we need a 2-D array
# Indexing with the list [0] (rather than the scalar 0) keeps the result as a
# column instead of collapsing it to 1-D; column 0 is taken as "setosa" here.
setosa_c = mult_model.coef_.T[:, [0]]
setosa_c

Let's use Linear Algebra to do the prediction just for "setosa" `y = X @ c`.

In [None]:
# "setosa" score for every test row.
X @ setosa_c

Repeating for other varieties.

In [None]:
# versicolor scores
# (column 1 of the transposed coefficients, kept 2-D by indexing with a list)
versicolor_c = mult_model.coef_.T[:, [1]] 
X @ versicolor_c

In [None]:
# virginica scores
# (column 2 of the transposed coefficients, kept 2-D by indexing with a list)
virginica_c = mult_model.coef_.T[:, [2]] 
X @ virginica_c

#### Review: how does `MATRIX @ vertical vector` work?

- MATRIX (data) @ vertical vector (coef) gets computed using the below LOOP:
    - For each DATA ROW of MATRIX
        - ROW (horizontal) @ c (vertical)

#### How does `MATRIX 1 @ MATRIX 2` work?

- MATRIX 1 (data) @ MATRIX 2 (coef) gets computed using the below NESTED LOOP:
    - For each DATA ROW of MATRIX 1
        - For each COEFFICIENT COLUMN of MATRIX 2
            - compute the vector @ vector
            
Computing all predictions.            

In [None]:
# all varieties
# One column of scores per variety; the largest score in each row
# determines the classification prediction output.
c = mult_model.coef_.T
X @ c

In [None]:
# Labels predicted by the model, for comparison with the scores above.
test["variety_predictions"]

### `max`, `argmax` methods on `np.array` 

- `max`: returns max value
- `argmax`: returns index of the max value
- same idea for min, argmin

In [None]:
# Small example array; its largest value is 9.
a = np.array([1, 2, 9, 8, 7])
a.max()

In [None]:
# Template blank: complete with the method that returns the index of the max value.
a.???

Let's convert `X @ c` to actual label predictions.

In [None]:
# gives the index of the max value
(X @ c).???

In [None]:
# we want max of each row
# (template blank: the call has to work row by row rather than over the whole array)
(X @ c).???

In [None]:
# Template blank: complete with the attribute that holds the class labels.
mult_model.???

`<model obj>.classes_` can return labels given a list of indices.

In [None]:
# Template blank: index `classes_` with a list of indices to get labels back.
mult_model.classes_???

Putting `argmax` and `classes_` together.

In [None]:
# Template blank: combine the row-wise argmax of `X @ c` with `classes_`.
???

In [None]:
# Manual predictions: take the position of the highest score in each row of
# `X @ c` and map it to its class label. Reusing test's index (and naming the
# Series) lines the result up row-for-row with test["variety_predictions"],
# instead of showing an unrelated 0..n-1 index.
pd.Series(
    mult_model.classes_[(X @ c).argmax(axis=1)],
    index=test.index,
    name="manual_variety_predictions",
)

In [None]:
# Model's own label predictions, for comparison with the manual result above.
test["variety_predictions"]