In [1]:


import numpy as np
import pandas as pd

# Loading Data

In [2]:
df = pd.read_csv('Fish.csv')

In [3]:
# Column the models are trained to predict.
TARGET_COL = 'Weight'
# Every remaining column of df is treated as a predictor.
INDEPENDENT_COLS = [col for col in df.columns if col != TARGET_COL]

# Display both so the feature list can be checked at a glance.
TARGET_COL, INDEPENDENT_COLS

('Weight', ['Species', 'Length1', 'Length2', 'Length3', 'Height', 'Width'])

In [4]:
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


### Adding Random Columns: Rand1 and Rand2
Rand1 is drawn from a continuous uniform distribution on [10, 100) and Rand2 from the random integers 1–6, both generated with NumPy's random module.

In [5]:
# Seeded generator so the noise columns (and every fit that uses them) are
# identical after "Restart Kernel -> Run All".
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Rand1: continuous noise, uniform on [10, 100).
df["Rand1"] = rng.uniform(low=10, high=100, size=len(df))
# Rand2: discrete noise, integers 1..6 (`high` is exclusive).
df["Rand2"] = rng.integers(low=1, high=7, size=len(df))
# Refresh the predictor list so it includes the two new columns.
INDEPENDENT_COLS = [col for col in df.columns if col != TARGET_COL]

In [6]:
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width,Rand1,Rand2
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02,99.656459,1
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056,66.824549,2
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961,25.374147,1
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555,47.822137,1
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134,70.540042,1


## Data Preprocessing

The Species column is categorical, and before applying any machine learning algorithms, it must be encoded.

In [7]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Species into a dense array: one 0/1 column per category.
encoder = OneHotEncoder(sparse_output=False)
encoded = pd.DataFrame(
    encoder.fit_transform(df[['Species']]),
    columns=encoder.get_feature_names_out(['Species']),
    # Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a
    # fresh RangeIndex would misalign (and introduce NaNs) if df were ever
    # filtered or re-indexed before this cell.
    index=df.index,
)

In [8]:
# Swap the raw categorical column for its one-hot encoded counterpart.
df = pd.concat([df.drop(columns=['Species']), encoded], axis=1)
# Rebuild the predictor list: Species is gone, the Species_* dummies are in.
INDEPENDENT_COLS = [name for name in df.columns if name != TARGET_COL]

## Helper Function: Training and Model Reporting

In [9]:
from sklearn.linear_model import LinearRegression


def train_report_model(_model, X_train: pd.DataFrame, y_train: pd.Series, params: dict = None):
    """Fit an estimator and print its training R2 score and fitted parameters.

    Parameters
    ----------
    _model : callable
        Estimator class (or factory), invoked as ``_model(**params)``. The
        resulting object must provide ``fit``, ``score``, ``intercept_`` and
        ``coef_``.
    X_train : pd.DataFrame
        Feature matrix; its column names label the printed coefficients.
    y_train : pd.Series
        Target values.
    params : dict, optional
        Keyword arguments for the estimator constructor. ``None`` means no
        arguments.

    Returns
    -------
    None
        Results are printed only. The score is computed on the same data the
        estimator was fitted on, so it is a training score.
    """
    constructor_kwargs = {} if params is None else params

    estimator = _model(**constructor_kwargs)
    estimator.fit(X_train, y_train)

    # One-row frame so the intercept and coefficients print as a labelled column.
    labels = ['Intercept', *X_train.columns]
    values = [estimator.intercept_, *estimator.coef_]
    report = pd.DataFrame(data=[values], columns=labels)

    print(f"R2 Score: {estimator.score(X_train, y_train)}")
    print(report.iloc[0])

In [10]:
train_report_model(_model=LinearRegression, X_train=df[INDEPENDENT_COLS], y_train=df[TARGET_COL])

R2 Score: 0.9372844925332875
Intercept           -821.342683
Length1              -84.128383
Length2               84.355121
Length3               31.105391
Height                 2.877077
Width                  5.785822
Rand1                 -0.117922
Rand2                  7.238685
Species_Bream        -88.204998
Species_Parkki        63.580506
Species_Perch         28.934178
Species_Pike        -310.513831
Species_Roach         -2.651481
Species_Smelt        330.269574
Species_Whitefish    -21.413947
Name: 0, dtype: float64


### Model-2

In [11]:
train_report_model(_model=LinearRegression,
                   X_train=df[INDEPENDENT_COLS].drop(columns=["Rand1"]),
                   y_train=df[TARGET_COL])

R2 Score: 0.9372197487179094
Intercept           -826.538915
Length1              -83.726536
Length2               84.509103
Length3               30.587118
Height                 2.944098
Width                  5.740785
Rand2                  7.184654
Species_Bream        -87.392169
Species_Parkki        63.146911
Species_Perch         27.966599
Species_Pike        -311.110736
Species_Roach         -2.777298
Species_Smelt        329.568928
Species_Whitefish    -19.402236
Name: 0, dtype: float64


### Model-3

In [12]:
train_report_model(_model=LinearRegression,
                   X_train=df[INDEPENDENT_COLS].drop(columns=["Rand1", "Rand2"]),
                   y_train=df[TARGET_COL])

R2 Score: 0.9360849020585845
Intercept           -813.013618
Length1              -80.302952
Length2               79.888631
Length3               32.535381
Height                 5.250988
Width                 -0.515438
Species_Bream       -105.318522
Species_Parkki        59.404139
Species_Perch         32.630388
Species_Pike        -313.747879
Species_Roach         -2.278567
Species_Smelt        340.754796
Species_Whitefish    -11.444354
Name: 0, dtype: float64


### Model-4

In [13]:
train_report_model(_model=LinearRegression,
                   X_train=df[INDEPENDENT_COLS].iloc[: 2],
                   y_train=df[TARGET_COL].iloc[: 2])

R2 Score: 1.0
Intercept            263.950200
Length1                1.485372
Length2                1.305675
Length3                1.348357
Height                 0.180080
Width                  0.048807
Rand1                 -1.329386
Rand2                  0.185473
Species_Bream          0.000000
Species_Parkki         0.000000
Species_Perch          0.000000
Species_Pike           0.000000
Species_Roach          0.000000
Species_Smelt          0.000000
Species_Whitefish      0.000000
Name: 0, dtype: float64



## Brief Report
Excluding *Rand1* and *Rand2* has minimal impact on the R2 score, indicating that these variables contribute little to the model's predictive power. In Model-4, an R2 value of 1 indicates a perfect fit on the training data; because the model was fitted on only two samples, this points to overfitting rather than genuine predictive power. Consequently, Model-4 is expected to exhibit high variance and low bias.

# Optimization Task

In [14]:
import scipy.optimize as opt


def regression_function(params: np.array, X: np.ndarray) -> np.array:
    """Evaluate the linear model ``X · w + b`` for every row of ``X``.

    ``params[0]`` is the intercept ``b`` and ``params[1:]`` are the weights
    ``w``, one per column of ``X``. ``X`` only needs a ``.dot`` method, so a
    NumPy array or a pandas DataFrame both work.
    """
    intercept = params[0]
    weights = params[1:]
    return X.dot(weights) + intercept


def loss_function(params: np.array, X: np.ndarray, y: np.array) -> np.float64:
    """Return the sum of squared residuals of the linear model on ``(X, y)``.

    ``params`` follows the layout expected by ``regression_function``:
    intercept first, then one weight per feature column.
    """
    residuals = regression_function(params, X) - y
    return np.square(residuals).sum()

In [15]:
params = np.ones((len(INDEPENDENT_COLS) + 1))

In [16]:
loss_function(params, df[INDEPENDENT_COLS], df[TARGET_COL])

np.float64(25698318.119129784)

In [17]:
optimized_params = opt.minimize(loss_function, params, args=(df[INDEPENDENT_COLS], df[TARGET_COL]))

In [18]:
# Print the fitted parameters one per line: intercept first, then the
# coefficients in INDEPENDENT_COLS order.
# A named loop variable is used instead of `_`, because binding `_` at the top
# level of a notebook shadows IPython's "last output" variable.
for param_value in optimized_params.x:
    print(param_value)

-719.4098991403183
-84.12866535907818
84.3562651240242
31.104564749479632
2.87744932683751
5.785195300452773
-0.11792117920752919
7.238692396533915
-190.13782889290508
-38.35293358921335
-72.99881432725655
-412.44608574887025
-104.58359433983324
228.33695563830258
-123.34689819079634


In [19]:
print(INDEPENDENT_COLS)

['Length1', 'Length2', 'Length3', 'Height', 'Width', 'Rand1', 'Rand2', 'Species_Bream', 'Species_Parkki', 'Species_Perch', 'Species_Pike', 'Species_Roach', 'Species_Smelt', 'Species_Whitefish']


In [20]:
from sklearn.metrics import r2_score

r2_score(df[TARGET_COL], regression_function(optimized_params.x, df[INDEPENDENT_COLS]))

0.9372844925323277

### Optimization Summary
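Note that the intercept and the coefficients of the one-hot encoded species variables differ from those in Model-1. Because every species gets its own dummy column, the dummies always sum to 1 and are perfectly collinear with the intercept, so the least-squares solution is not unique and the two calculation methods settle on different but equivalent solutions. The sum of the intercept and the coefficient of each one-hot encoded variable is identical across both models (e.g. Intercept + Species_Bream ≈ -909.55 in both). As a result, the predictions of the two models are equivalent.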

# Regularization

### Lasso with Upper Bound Constraint

In [21]:
def lasso_bound(params: np.array, bound: float):
    """Return the slack left in the L1 budget: ``bound - sum(|coefficients|)``.

    The intercept (``params[0]``) is not penalised. The value is non-negative
    exactly when the coefficients satisfy the constraint.
    """
    l1_norm = np.abs(params[1:]).sum()
    return bound - l1_norm


def test_lasso_bound(bound):
    """Fit least squares subject to ``sum(|coef|) <= bound`` and print the fit.

    The optimisation starts from an all-zero parameter vector and uses the
    notebook-level ``df``, ``INDEPENDENT_COLS`` and ``TARGET_COL``. Prints one
    ``name: value`` line per parameter (intercept first) followed by the
    training R2 score. Returns nothing.

    Parameters
    ----------
    bound : float
        Upper limit on the L1 norm of the coefficients (intercept excluded).
    """
    features = df[INDEPENDENT_COLS]
    target = df[TARGET_COL]

    initial_params = np.zeros(len(INDEPENDENT_COLS) + 1)
    # SciPy reads an "ineq" constraint as fun(x) >= 0, i.e. bound - L1 >= 0.
    constraints = [{"type": "ineq", "fun": lasso_bound, "args": (bound,)}]

    result = opt.minimize(loss_function,
                          initial_params,
                          args=(features, target),
                          constraints=constraints)

    coef_names = ["Intercept"] + INDEPENDENT_COLS
    for name, value in zip(coef_names, result.x):
        print(f"{name}: {value: .4f}")

    # Predict once and score once; the value is only needed for the report line.
    predictions = regression_function(result.x, features)
    print(f"R2 Score: {r2_score(target, predictions)}")

In [22]:
test_lasso_bound(1000)

Intercept: -814.9856
Length1: -76.0762
Length2:  82.1759
Length3:  25.9664
Height:  2.3655
Width:  6.1159
Rand1: -0.1021
Rand2:  7.1639
Species_Bream: -63.5831
Species_Parkki:  69.2014
Species_Perch:  29.5192
Species_Pike: -307.5523
Species_Roach:  2.3466
Species_Smelt:  325.1280
Species_Whitefish: -2.7035
R2 Score: 0.9372100968971661


In [23]:
test_lasso_bound(100)

Intercept: -548.8743
Length1:  14.2655
Length2:  11.7576
Length3: -0.0362
Height:  18.8620
Width:  7.5482
Rand1: -0.0987
Rand2:  13.2432
Species_Bream: -9.3277
Species_Parkki: -0.1134
Species_Perch:  0.0549
Species_Pike: -11.4312
Species_Roach:  0.1209
Species_Smelt:  11.8927
Species_Whitefish:  1.3203
R2 Score: 0.8874998944822203


In [24]:
test_lasso_bound(40)

Intercept: -487.1686
Length1:  2.9684
Length2:  3.4999
Length3:  20.1430
Height:  8.0332
Width:  1.1152
Rand1: -0.0794
Rand2:  1.6580
Species_Bream: -0.0604
Species_Parkki: -0.0087
Species_Perch:  1.4782
Species_Pike:  0.0336
Species_Roach:  0.6623
Species_Smelt: -0.0066
Species_Whitefish:  0.2532
R2 Score: 0.865629375972345


### Notes on regularization with upper bound:
With higher upper bounds, the Lasso model behaves similarly to Model-1. However, as the upper bound decreases, the model’s coefficients converge to zero, leading to a reduction in the R2 score.

Notably, at higher degrees of regularization (i.e. lower bounds), the model focuses mainly on the length and height of the fish. It drops Length1 and Length3, as they are strongly correlated with Length2.

In [25]:
def lasso_loss(params: np.array, X: np.ndarray, y: np.array, alpha: float) -> np.float64:
    """Return the penalised objective: squared-error loss plus ``alpha`` times the L1 norm.

    Only the coefficients (``params[1:]``) are penalised; the intercept
    (``params[0]``) is left free.
    """
    squared_error = loss_function(params, X, y)
    l1_penalty = alpha * np.abs(params[1:]).sum()
    return squared_error + l1_penalty


def test_lasso_loss(alpha):
    """Minimise the L1-penalised squared-error loss and print the fit.

    The optimisation starts from an all-zero parameter vector and uses the
    notebook-level ``df``, ``INDEPENDENT_COLS`` and ``TARGET_COL``. Prints one
    ``name: value`` line per parameter (intercept first) followed by the
    training R2 score. Returns nothing.

    Parameters
    ----------
    alpha : float
        Weight of the L1 penalty on the coefficients (intercept excluded).
    """
    features = df[INDEPENDENT_COLS]
    target = df[TARGET_COL]

    initial_params = np.zeros(len(INDEPENDENT_COLS) + 1)
    # NOTE: the L1 term is not differentiable at zero, so a general-purpose
    # optimiser may return small non-zero values where a dedicated Lasso
    # solver would return exactly 0.
    result = opt.minimize(lasso_loss,
                          initial_params,
                          args=(features, target, alpha))

    coef_names = ["Intercept"] + INDEPENDENT_COLS
    for name, value in zip(coef_names, result.x):
        print(f"{name}: {value: .4f}")

    # Predict once and score once; the value is only needed for the report line.
    predictions = regression_function(result.x, features)
    print(f"R2 Score: {r2_score(target, predictions)}")

In [26]:
test_lasso_loss(0.0001)

Intercept: -717.8178
Length1: -84.1286
Length2:  84.3550
Length3:  31.1058
Height:  2.8763
Width:  5.7868
Rand1: -0.1179
Rand2:  7.2387
Species_Bream: -191.7276
Species_Parkki: -39.9426
Species_Perch: -74.5915
Species_Pike: -414.0414
Species_Roach: -106.1772
Species_Smelt:  226.7437
Species_Whitefish: -124.9400
R2 Score: 0.9372844925315286


In [27]:
test_lasso_loss(1)

Intercept: -823.6037
Length1: -83.9105
Length2:  84.3190
Length3:  30.9335
Height:  2.8846
Width:  5.8250
Rand1: -0.1177
Rand2:  7.2398
Species_Bream: -85.0859
Species_Parkki:  66.1579
Species_Perch:  31.4083
Species_Pike: -307.7351
Species_Roach:  0.0000
Species_Smelt:  332.6263
Species_Whitefish: -18.6174
R2 Score: 0.9372844515168789


In [28]:
test_lasso_loss(100)

Intercept: -785.0376
Length1: -63.6655
Length2:  82.0372
Length3:  13.8685
Height:  3.5581
Width:  9.9021
Rand1: -0.0965
Rand2:  7.3577
Species_Bream: -39.6035
Species_Parkki:  58.6352
Species_Perch:  13.0474
Species_Pike: -295.8772
Species_Roach: -0.0000
Species_Smelt:  303.5750
Species_Whitefish: -6.9215
R2 Score: 0.936902559490131


In [29]:
test_lasso_loss(1000)

Intercept: -676.0313
Length1:  0.0001
Length2:  28.9471
Length3: -0.0055
Height:  11.2022
Width:  28.0081
Rand1: -0.0825
Rand2:  8.6599
Species_Bream: -0.0000
Species_Parkki:  0.0001
Species_Perch: -0.0000
Species_Pike: -148.8133
Species_Roach: -0.0011
Species_Smelt:  212.1972
Species_Whitefish:  0.0000
R2 Score: 0.9284930287951928


In [30]:
test_lasso_loss(15000)

Intercept: -483.1866
Length1:  4.8732
Length2:  21.5221
Length3: -0.0000
Height:  16.2139
Width: -0.0005
Rand1: -0.0590
Rand2:  0.0120
Species_Bream:  0.0013
Species_Parkki: -0.0042
Species_Perch:  0.0047
Species_Pike:  0.0008
Species_Roach: -0.0042
Species_Smelt:  0.0012
Species_Whitefish: -0.0110
R2 Score: 0.874404954360701


### Notes on regularization with Lasso loss:
It worked quite similarly to the previous Lasso model. As the alpha value increases, the model's coefficients shrink toward zero, leading to a reduction in the R2 score. At lower alpha values, the model behaves similarly to Model-1.

Both Lasso models yielded comparable results, with the estimated parameters being quite similar. The Lagrangian Lasso model, however, has a lower intercept, compensated by a higher coefficient on the 'Height' variable.

While we can adjust parameters to achieve similar outcomes using different regularization methods, identical results are not guaranteed. One straightforward scenario where both models yield the same solution is when the Lagrangian multiplier is zero, and the regularization bound is set to a very large value (e.g., 1e10).


# Sklearn Lasso 

In [31]:
from sklearn.linear_model import Lasso

# Same penalty weight as the strongest hand-written Lasso run, test_lasso_loss(15000).
alpha = 15000
# Our loss is the raw sum of squared errors, whereas sklearn's Lasso divides
# that term by (2 * n_samples); scaling alpha by the same factor keeps the two
# objectives proportional so the fits are comparable.
sklearn_alpha = alpha / (2 * df[INDEPENDENT_COLS].shape[0])
train_report_model(_model=Lasso, X_train=df[INDEPENDENT_COLS], y_train=df[TARGET_COL], params={"alpha": sklearn_alpha})

R2 Score: 0.8742777387832947
Intercept           -485.822443
Length1                1.023183
Length2               25.211748
Length3                0.000000
Height                16.043920
Width                  0.000000
Rand1                 -0.055302
Rand2                  0.000000
Species_Bream         -0.000000
Species_Parkki        -0.000000
Species_Perch          0.000000
Species_Pike          -0.000000
Species_Roach         -0.000000
Species_Smelt          0.000000
Species_Whitefish      0.000000
Name: 0, dtype: float64


### Sklearn Lasso Summary
Sklearn's Lasso implementation divides the squared-error term by **(2 * n_samples)**. Therefore, to compensate for the difference in the loss function, we need to divide our alpha value by **(2 * n_samples)** to get the same results as our previous Lasso models.