# Scikit-learn

In [6]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy as np

# Read the advertising data set, using the first CSV column as the row index.
df = pd.read_csv("../Data/Advertising.csv", index_col=0)

# Report the dimensions of the frame. One column is subtracted from the
# feature count because the "Sales" column is used as the target further down.
print(
    f"Number of features {df.shape[1]-1}",
    f"Number of samples {df.shape[0]}",
    sep="\n",
)

# Preview the first rows.
df.head()

Number of features 3
Number of samples 200


Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [10]:
# Target vector: the "Sales" column.
y = df["Sales"]
# Feature matrix: every remaining column.
X = df.drop(columns="Sales")

X.shape, y.shape

((200, 3), (200,))

## Scikit-learn "recipe"

**Steps**

1. train|test split or train|validation|test split
2. Scale dataset 
    - many algorithms require scaling, some don't
    - which type of scaling method to use?
    - fit the scaler on the training data only, then use that fitted scaler to transform both the training data and the test data
3. Fit algorithm to training data
4. Predict on test data
5. Evaluation metrics on test data

### Train|test split

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Shapes of the four pieces, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((140, 3), (60, 3), (140,), (60,))

### Feature scaling

Normalization (MinMaxScaling)

$$X' = \frac{X-X_{min}}{X_{max}-X_{min}}$$

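Compute $X_{min}$ and $X_{max}$ from the training data only, and use them to scale both the training and the test data. Scaled test values can therefore fall slightly outside the interval $[0, 1]$.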

Feature standardization (standard score scaling)

$$X' = \frac{X-\mu}{\sigma}$$

$\mu$ and $\sigma$ computed from training data

In [24]:
from sklearn.preprocessing import MinMaxScaler

# create the scaler object and show which class it is an instance of
scaler = MinMaxScaler()
print(type(scaler))

# fit_transform fits the scaler on X_train and transforms X_train in one call;
# the scaler is never fitted on X_test
scaled_X_train = scaler.fit_transform(X_train)

# X_test is only transformed, using what was fitted on X_train
scaled_X_test = scaler.transform(X_test)

# overall min/max of the scaled training data
print(
    f"Min value in X_train: {scaled_X_train.min()}",
    f"Max value in X_train: {scaled_X_train.max()}",
    sep="\n",
)

# overall min/max of the scaled test data
print(
    f"Min value in X_test: {scaled_X_test.min()}",
    f"Max value in X_test: {scaled_X_test.max()}",
    sep="\n",
)

<class 'sklearn.preprocessing._data.MinMaxScaler'>
Min value in X_train: 0.0
Max value in X_train: 1.0
Min value in X_test: 0.005964214711729622
Max value in X_test: 1.1302186878727631


### Algorithm - linear regression

In [31]:
from sklearn.linear_model import LinearRegression

# The "_SVD" suffix refers to Singular Value Decomposition, which is used for
# calculating the pseudoinverse in the OLS normal equation.

# fit() returns the estimator itself, so construction and fitting can be chained
model_SVD = LinearRegression().fit(scaled_X_train, y_train)

# fitted parameters: one weight per feature column, plus the intercept
print(f"Weights (beta_hats) {model_SVD.coef_}")
print(f"Intercept {model_SVD.intercept_}")

Weights (beta_hats) [13.02832938  9.88465985  0.69237469]
Intercept 2.741855324852814


## Stochastic gradient descent (SGD)

In [38]:
from sklearn.linear_model import SGDRegressor

# SGD is sensitive to feature scaling, so the model is trained on the scaled features.
# random_state is fixed because SGD shuffles the training data between epochs;
# without a seed the fitted weights change on every run and the notebook is
# not reproducible. 42 matches the seed used for the train|test split.
model_SGD = SGDRegressor(
    loss="squared_error",
    learning_rate="invscaling",
    max_iter=100000,
    random_state=42,
)
model_SGD.fit(scaled_X_train, y_train)

# fitted parameters: one weight per feature column, plus the intercept
print(f"Weights (beta_hats) {model_SGD.coef_}")
print(f"Intercept {model_SGD.intercept_}")


Weights (beta_hats) [11.95478041  8.99425491  1.33777838]
Intercept [3.58740602]


## Manual test

In [49]:
# sanity check on a single test sample
# slicing with [:1] keeps the 2D shape (1, n_features) that predict() expects
test_sample_features = scaled_X_test[:1]
# the first target value in the test set, selected by position
test_sample_target = y_test.iloc[0]

# both predictions use the weights and intercept learned during fitting;
# displayed as (SGD prediction, SVD prediction, true value)
(
    model_SGD.predict(test_sample_features)[0],
    model_SVD.predict(test_sample_features)[0],
    test_sample_target,
)

(16.590805606878796, 16.565396297434837, 16.9)

## Evaluation

In [51]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# --- LinearRegression model: predict on the scaled test set, then score ---
y_pred_SVD = model_SVD.predict(scaled_X_test)
mae_SVD = mean_absolute_error(y_test, y_pred_SVD)
mse_SVD = mean_squared_error(y_test, y_pred_SVD)
rmse_SVD = np.sqrt(mse_SVD)  # RMSE is the square root of MSE

# --- SGDRegressor model: same test set, same metrics ---
y_pred_SGD = model_SGD.predict(scaled_X_test)
mae_SGD = mean_absolute_error(y_test, y_pred_SGD)
mse_SGD = mean_squared_error(y_test, y_pred_SGD)
rmse_SGD = np.sqrt(mse_SGD)

# one summary line per model, rounded to two decimals
print(f"SVD: MAE {mae_SVD:.2f}, MSE {mse_SVD:.2f}, RMSE {rmse_SVD:.2f}")
print(f"SGD: MAE {mae_SGD:.2f}, MSE {mse_SGD:.2f}, RMSE {rmse_SGD:.2f}")

SVD: MAE 1.51, MSE 3.80, RMSE 1.95
SGD: MAE 1.52, MSE 4.09, RMSE 2.02
