In [1]:
import os

# Make the parent directory the working directory, so that relative imports
# and paths are resolved from one level above where the kernel started.
# Caveat: this is not idempotent -- re-running the cell climbs one more level.
os.chdir(os.pardir)

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from common.eval import cross_val_rmse
from common.dataset import load_abalones, load_info
from common.api import describe_fields


# Seed NumPy's global random generator so the stochastic steps of the
# notebook are repeatable. `np.random.seed` is a function and has to be
# *called*: assigning to it (`np.random.seed = 70`) only overwrites the
# function with an integer and leaves the generator unseeded.
np.random.seed(70)

In [3]:
# Load the abalone dataset and print the description text that ships with it.
# NOTE(review): both helpers are project code (common.dataset) not visible
# here; `data` is presumably a pandas DataFrame, since later cells call
# `.drop(...)` on it and pass it to `pd.get_dummies` -- confirm.
data = load_abalones()
print(load_info())


 1. Title of Database: Abalone data

2. Sources:

   (a) Original owners of database:
	Marine Resources Division
	Marine Research Laboratories - Taroona
	Department of Primary Industry and Fisheries, Tasmania
	GPO Box 619F, Hobart, Tasmania 7001, Australia
	(contact: Warwick Nash +61 02 277277, wnash@dpi.tas.gov.au)

   (b) Donor of database:
	Sam Waugh (Sam.Waugh@cs.utas.edu.au)
	Department of Computer Science, University of Tasmania
	GPO Box 252C, Hobart, Tasmania 7001, Australia

   (c) Date received: December 1995


3. Past Usage:

   Sam Waugh (1995) "Extending and benchmarking Cascade-Correlation", PhD
   thesis, Computer Science Department, University of Tasmania.

   -- Test set performance (final 1044 examples, first 3133 used for training):
	24.86% Cascade-Correlation (no hidden nodes)
	26.25% Cascade-Correlation (5 hidden nodes)
	21.5%  C4.5
	 0.0%  Linear Discriminate Analysis
	 3.57% k=5 Nearest Neighbour
      (Problem encoded as a classification task)

   -- Data set sa

### Preprocess

In [4]:
# Features that the earlier analysis found to be highly correlated with the
# ones we keep; they are excluded from the model input.
cols_remove = ['length', 'height', 'shucked_weight', 'viscera_weight', 'shell_weight']
# Drop them, then one-hot encode what is left of the categorical data (Sex).
dropped = pd.get_dummies(data.drop(columns=cols_remove))
print("resulting columns:", ", ".join(dropped.columns))

resulting columns: diameter, whole_weight, rings, sex_F, sex_I, sex_M


In [5]:
# NOTE(review): `describe_fields` is a project helper (common.api) whose
# behaviour is not visible from this notebook; presumably it summarises the
# fields of the frame under the experiment label 'Ridge Rounded' -- confirm.
describe_fields(dropped, 'Ridge Rounded')

In [6]:
# Separate the regression target (ring count) from the feature columns.
y = dropped['rings']
X = dropped.drop(columns=['rings'])

# Min-max normalisation of every feature; X becomes a NumPy array here.
# NOTE(review): the scaler is fitted on *all* rows before any CV or
# train/test split happens below, so statistics of held-out rows leak into
# training. Putting scaler + model into the imported `Pipeline` would avoid it.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)


In [7]:
# Candidate hyper-parameters for the Ridge regressor.
# NOTE(review): the `normalize` option only exists in older scikit-learn
# releases (it was removed in 1.2); this grid targets such a release.
grid = dict(
    alpha=[0.01, 0.05, 0.1, 0.5, 1, 2, 3],
    normalize=[True, False],
    solver=['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
)

# Exhaustive 10-fold cross-validated search over every combination above.
search = GridSearchCV(estimator=Ridge(), param_grid=grid, cv=10)
search.fit(X, y)
best_params = search.best_params_
print("Best parameters found: ", best_params)

Best parameters found:  {'alpha': 0.5, 'normalize': True, 'solver': 'sag'}


In [8]:
# Build a fresh Ridge regressor with the tuned hyper-parameters and score it
# with 10-fold cross-validation on the full (already scaled) data.
# NOTE(review): `cross_val_rmse` is a project helper (common.eval);
# presumably it returns one RMSE value per fold -- confirm.
model = Ridge(**best_params)
scores = cross_val_rmse(model, X, y, cv=10)
print(f"Mean score {np.mean(scores)}")

Mean score 2.579431789183267


In [9]:
# Fit the tuned Ridge model on a hold-out split of the data.
# The split is shuffled, so without a fixed `random_state` every run would
# produce a different test set and therefore different numbers in the cells
# below. 70 is the seed value this notebook already uses for NumPy.
model = Ridge(**best_params)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=70)
# Keep the fit as the last expression so the fitted estimator is displayed.
model.fit(X_train, y_train)


Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='sag', tol=0.001)

In [10]:
# Ring counts are whole numbers, so round the continuous predictions to the
# nearest integer before comparing them with the true values.
predicts = np.around(model.predict(X_test), decimals=0)
# Side-by-side view of true vs. predicted rings for the held-out rows.
results = pd.DataFrame(data={'true': y_test, 'predicted': predicts})
results.head()

Unnamed: 0,predicted,true
1860,9.0,10
1363,11.0,10
4123,9.0,8
3259,12.0,18
4113,9.0,8


In [11]:
def diff_error(y, yhat):
    """Root-mean-squared error (RMSE) between true and predicted values.

    Parameters
    ----------
    y : array-like
        True target values (list, NumPy array or pandas Series).
    yhat : array-like
        Predicted values, e.g. the rounded model predictions; compared with
        ``y`` element by element, in order.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y - yhat) ** 2))``.
    """
    # Convert to plain float arrays first: this lets plain lists work and,
    # more importantly, makes the comparison positional. Subtracting two
    # pandas Series directly would align them on their index labels and
    # yield NaN wherever the labels differ.
    y_true = np.asarray(y, dtype=float)
    y_pred = np.asarray(yhat, dtype=float)
    diff = y_true - y_pred
    return np.sqrt(np.mean(np.square(diff)))

In [12]:
# Report the error of the rounded predictions on the held-out split.
# Despite the "mean difference" wording in the message, the number printed
# is what `diff_error` returns: the square root of the mean squared
# difference (i.e. an RMSE), not a mean absolute difference.
msg = f"""
The mean difference from the original target rings,
using the rounded transformation of the predicted
value is: {diff_error(y_test, predicts)}

"""

print(msg)


The mean difference from the original target rings,
using the rounded transformation of the predicted
value is: 2.5415212262620726




### Experiment Conclusion

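We modeled the experiment as a regression task with a Ridge regressor, using the features Sex (one-hot encoded), Diameter, and Whole Weight to predict the number of Rings. **The result is far from accurate enough** to preserve the infant abalones: the rounded predictions have a root-mean-squared error of about 2.5 rings, which we estimate as an error of about 3.75 years (2.5 × 1.5, i.e. assuming 1.5 years per ring; note that if age is computed as rings + 1.5, the error in years equals the error in rings, about 2.5).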