## Linear Regression with Abalone Data

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo 

### Load the Dataset

In [2]:
# The Abalone dataset is fetched from the UCI Machine Learning Repository by id.
ABALONE_DATASET_ID = 1
abalone = fetch_ucirepo(id=ABALONE_DATASET_ID)

In [3]:
# Place the feature columns and the target column side by side so the whole
# dataset lives in a single DataFrame.
df_abalone = pd.concat(
    [abalone['data'][part] for part in ('features', 'targets')],
    axis='columns',
)
df_abalone

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


### Select Features

In [4]:
# Predictors: the seven numeric measurements. The categorical `Sex` column is
# not part of this list, so it is not used by any model below.
feature_list = [
    'Length', 'Diameter', 'Height',
    'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight',
]

# Plain NumPy arrays: X holds the predictors, y the ring counts to predict.
X = df_abalone.loc[:, feature_list].to_numpy()
y = df_abalone.loc[:, 'Rings'].to_numpy()

### Train a Model

#### Standard Linear Regression Model

Split the data into training and test sets (20% of the rows are held out for testing).

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (rounded down to a whole count) for testing.
# The fixed random_state makes the split reproducible across runs.
n_test = int(len(X) * 0.2)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=n_test, random_state=1
)

Train the model.

In [25]:
from sklearn.linear_model import LinearRegression

# Plain linear regression with scikit-learn's default settings (no regularization).
model = LinearRegression()

In [26]:
# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

In [27]:
# Show the learned parameters: one weight per input feature plus the intercept.
for label, value in (
    ('Weights: ', model.coef_),
    ('Bias / y-intercept:', model.intercept_),
):
    print(label, value)

Weights:  [ -2.02362399  14.26698806  10.47550194   9.02503817 -20.3088095
  -9.04118186   8.99273373]
Bias / y-intercept: 2.990369619334876


Training performance (RMSE)

In [28]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error on the training split.
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6, so the square root of the MSE is taken explicitly; this gives the
# same value and works on every scikit-learn version.
y_pred = model.predict(X_train)
rsme = np.sqrt(mean_squared_error(y_train, y_pred))
rsme

2.216643658243015

Test Performance

In [29]:
# Root-mean-squared error on the held-out test split.
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6, so
# the square root of the MSE is taken explicitly (same value, any version).
y_pred = model.predict(X_test)
rsme = np.sqrt(mean_squared_error(y_test, y_pred))
rsme

2.2144755057577075

#### With Normalization

Split the data into training and test sets, then standardize the features with a scaler fitted on the training set only.

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Re-create the split from the raw arrays (same test size and seed as the
# unscaled experiment) so this cell can be re-run on its own.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=int(len(X) * 0.2), random_state=1
)

# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transformation to both splits. Fitting on the
# training data alone keeps test-set information out of the preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Train the model.

In [43]:
# Fit a fresh linear regression on the standardized training features
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Show the learned parameters.
for label, value in (
    ('Weights: ', model.coef_),
    ('Bias / y-intercept:', model.intercept_),
):
    print(label, value)

Weights:  [-0.24411546  1.42231214  0.44871225  4.44486571 -4.50809142 -0.99014766
  1.26262388]
Bias / y-intercept: 9.932375822860568


Training performance

In [44]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error on the (standardized) training split.
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so
# the square root of the MSE is taken explicitly (same value, any version).
y_pred = model.predict(X_train)
rsme = np.sqrt(mean_squared_error(y_train, y_pred))
rsme

2.216643658243015

Test performance

In [45]:
# Root-mean-squared error on the (standardized) test split.
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6, so
# the square root of the MSE is taken explicitly (same value, any version).
y_pred = model.predict(X_test)
rsme = np.sqrt(mean_squared_error(y_test, y_pred))
rsme

2.214475505757707

#### More Complex Linear Regression Model

Standardize the features, then expand them with degree-2 polynomial features (squares and pairwise products).

In [76]:
from sklearn.preprocessing import PolynomialFeatures

# Re-create the split from the raw arrays (same test size and seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=int(len(X) * 0.2), random_state=1
)

# Step 1: standardize, using statistics learned from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 2: expand the standardized features with every term up to degree 2
# (a constant column, the original features, squares and pairwise products).
# PolynomialFeatures.fit only records the number of input features, so
# fitting on the standardized matrix yields the same expansion as fitting on
# the raw one.
poly = PolynomialFeatures(2)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

Train the model.

In [77]:
# Fit a linear regression on the polynomial-expanded training features
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Show the learned parameters: one weight per expanded feature column.
for label, value in (
    ('Weights: ', model.coef_),
    ('Bias / y-intercept:', model.intercept_),
):
    print(label, value)

Weights:  [-1.84232556e-15 -1.27046530e+00  7.38166739e-01  5.73565865e-01
  6.03450886e+00 -5.62735495e+00 -6.77963690e-01  1.83687543e+00
 -8.43902896e-01  1.64803434e+00 -7.22736706e-01  1.55842029e+00
  6.79199133e-01 -1.93736478e+00 -7.60909974e-02 -1.45589387e+00
  9.92424990e-01  2.75221654e+00 -1.04646444e+00  2.15166968e-02
 -1.59248134e+00 -2.26983397e-02 -2.91462913e-01 -2.90848745e-02
 -4.51790325e-01  3.91072648e-01 -8.67682601e-01 -4.00553085e+00
  4.70273678e-01  7.76532017e-01  2.50974795e+00  6.32245537e-01
  1.85508112e-01  2.97620915e-01  1.13984239e-01 -2.59634127e-01]
Bias / y-intercept: 10.425146982197111


Training performance

In [78]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the polynomial model on the training split.
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so
# the square root of the MSE is taken explicitly (same value, any version).
y_pred = model.predict(X_train)
rsme = np.sqrt(mean_squared_error(y_train, y_pred))
rsme

2.099786339068065

Test performance

In [79]:
# Root-mean-squared error of the polynomial model on the test split.
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6, so
# the square root of the MSE is taken explicitly (same value, any version).
y_pred = model.predict(X_test)
rsme = np.sqrt(mean_squared_error(y_test, y_pred))
rsme

2.0726590817500776

#### More Complex Linear Regression Model with Regularization

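Standardize the features and apply `PolynomialFeatures`. Note that the degree is set to 1 here, which only prepends a constant bias column to the original features.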

In [105]:
from sklearn.preprocessing import PolynomialFeatures

# Re-create the split from the raw arrays (same test size and seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=int(len(X) * 0.2), random_state=1
)

# Step 1: standardize, using statistics learned from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 2: polynomial expansion. With degree 1 this only prepends a constant
# column to the original features, so no squares or interactions are added.
# NOTE(review): the section title says "More Complex"; degree 2 may have been
# intended here. Confirm before changing, since the recorded results below
# were produced with degree 1.
poly = PolynomialFeatures(1)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

Train the model.

In [106]:
from sklearn.linear_model import Lasso, Ridge

# L1-regularized linear regression; alpha sets the penalty strength.
# fit() returns the estimator itself, so construction and fitting chain.
model = Lasso(alpha=0.01).fit(X_train, y_train)

# Show the learned parameters; the L1 penalty can drive weights to exactly 0.
for label, value in (
    ('Weights: ', model.coef_),
    ('Bias / y-intercept:', model.intercept_),
):
    print(label, value)

Weights:  [ 0.          0.          1.13808761  0.44060755  2.95922934 -3.83655051
 -0.53999086  1.71572545]
Bias / y-intercept: 9.932375822860566


Training performance

In [107]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the regularized model on the training split.
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so
# the square root of the MSE is taken explicitly (same value, any version).
y_pred = model.predict(X_train)
rsme = np.sqrt(mean_squared_error(y_train, y_pred))
rsme

2.222085718396017

Test performance

In [108]:
# Root-mean-squared error of the regularized model on the test split.
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6, so
# the square root of the MSE is taken explicitly (same value, any version).
y_pred = model.predict(X_test)
rsme = np.sqrt(mean_squared_error(y_test, y_pred))
rsme

2.222578637393552