# Modelling

This notebook aims to model the age of abalone from their physical characteristics.

In [1]:
import pandas as pd
import altair as alt
from helpers import LinearRegressionExperiment

In [2]:
# Load the processed abalone data from its parquet file
df = pd.read_parquet('data/abalone_processed.parquet')

# `rings` is the label we want to predict; every other column is a feature
df_y = df['rings']
df_X = df.drop(columns=['rings'])

1. Develop a linear regression model using all features for ring-age using 60 percent of data picked randomly for training and remaining for testing. Visualise your model prediction using appropriate plots. Report the RMSE and R-squared score. (4 Marks)

In [3]:
# Baseline experiment: linear regression over every available feature.
# NOTE(review): `experiment_number` presumably controls the random train/test
# split inside the helper — confirm in `helpers.LinearRegressionExperiment`.
experiment = LinearRegressionExperiment(df_X=df_X, df_y=df_y, experiment_number=42)

# One call fits the model, produces predictions and computes the metrics
experiment.fit_predict_evaluate()

# Print the RMSE and R-squared for the training and test splits
print(experiment.evaluation)

Training Metrics: [RMSE: 4.893183983114856, R-squared: 0.5429807974750387]
Test Metrics: [RMSE: 4.767145362532978, R-squared: 0.5182048268987614]


In [4]:
# Now that we've fit a model, let's visualise the results

df_train_predictions = pd.DataFrame({
  'actuals': experiment.y_train,
  'predictions': experiment.y_train_pred
})

df_testing_predictions = pd.DataFrame({
  'actuals': experiment.y_test,
  'predictions': experiment.y_test_pred
})


def _actual_vs_prediction_charts(predictions, title, point_color):
  """Build an actual-vs-prediction scatter plot and its line of best fit.

  The training and testing plots differ only in their data, title and point
  colour, so both are produced by this single helper.

  Args:
    predictions: DataFrame with `actuals` and `predictions` columns.
    title: Chart title.
    point_color: Colour (hex string) used for the scatter points.

  Returns:
    A `(scatter_chart, best_fit_chart)` tuple of Altair charts.
  """
  scatter_chart = alt.Chart(predictions, title=title).mark_circle(opacity=0.2).encode(
    x=alt.X('actuals', title="Rings Actuals"),
    y=alt.Y('predictions', title="Rings Predictions"),
    color=alt.ColorValue(point_color),
  ).properties(
    width=480,
    height=300
  )

  # Derive the fit from the scatter chart so it shares the same data, axes and
  # size; the mark, colour and opacity are overridden so the line is fully
  # opaque and visually distinct from the translucent points.
  best_fit_chart = scatter_chart.transform_regression(
    'actuals',
    'predictions',
    method="linear"
  ).mark_line(strokeWidth=2.5).encode(
    color=alt.ColorValue('#FFDC00'),
    opacity=alt.value(1),
  )

  return scatter_chart, best_fit_chart


# Actual vs prediction scatter plot and line of best fit for the training data
train_actual_prediction, train_line_of_best_fit = _actual_vs_prediction_charts(
  df_train_predictions,
  "Training Rings Actual vs Rings Prediction",
  '#7BB2D9',
)

# Actual vs prediction scatter plot and line of best fit for the testing data
test_actual_prediction, testing_line_of_best_fit = _actual_vs_prediction_charts(
  df_testing_predictions,
  "Testing Rings Actual vs Rings Prediction",
  '#D62828',
)

# Layer each scatter with its fit, place train and test side by side, then
# save a PNG copy and display the chart as the cell output
actual_prediction_seperate = (train_actual_prediction + train_line_of_best_fit) | (test_actual_prediction + testing_line_of_best_fit)
actual_prediction_seperate.save('assets/actual_prediction_seperate.png', ppi=100)
actual_prediction_seperate

2. Develop a linear regression model with all input features
   - ~~without normalising input data~~ (done in the section above)
   - with normalising input data. (2 Marks)

In [5]:
# Same setup as the all-features experiment, but this time the helper is asked
# to normalise the input features (`normalise_X=True`)
experiment = LinearRegressionExperiment(df_X=df_X, df_y=df_y, experiment_number=42)

# Fit, predict and evaluate on the normalised features
experiment.fit_predict_evaluate(normalise_X=True)

# Print the RMSE and R-squared for the training and test splits.
# NOTE(review): the printed metrics match the un-normalised run to within
# floating-point noise — presumably because a plain least-squares fit is
# unaffected by rescaling its inputs; confirm against the helper.
print(experiment.evaluation)

Training Metrics: [RMSE: 4.893183983114856, R-squared: 0.5429807974750387]
Test Metrics: [RMSE: 4.767145362532981, R-squared: 0.5182048268987611]


3. Develop a linear regression model with two selected input features from the data processing step. (2 Marks)

In [6]:
# Build a linear regression model from our 2 most correlated features.
# Recalling from the data processing step, these were `shell_weight` and `diameter`.

# Keep only these 2 feature columns
df_X_2_features = df_X.loc[:, ['shell_weight', 'diameter']]

experiment = LinearRegressionExperiment(df_X=df_X_2_features, df_y=df_y, experiment_number=42)

# Fit, predict and evaluate using the reduced feature set
experiment.fit_predict_evaluate()

# Print the RMSE and R-squared for the training and test splits
print(experiment.evaluation)

Training Metrics: [RMSE: 6.511299050343968, R-squared: 0.39185023296519517]
Test Metrics: [RMSE: 5.987007789295018, R-squared: 0.394918502617424]


4. In each of the above investigations, run 30 experiments each and report the mean and std of the RMSE and R-squared score of the train and test datasets. Write a paragraph to compare your results of the different approaches taken. Note that if your code can't work for 30 experiments, only 1 experiment run is fine. You won't be penalised if you just do 1 experiment run. (2 Marks)

In [7]:
# Number of times each investigation is repeated
N_EXPERIMENTS = 30

# Collect the per-run evaluation frames in a list and concatenate them once
# after the loop. Concatenating onto a growing DataFrame on every iteration
# re-copies all earlier rows each time and starts from an empty frame, which
# recent pandas versions warn about.
evaluation_frames = []

# Repeat the three investigations above, once per experiment number
for experiment_number in range(1, N_EXPERIMENTS + 1):
  # Experiment with all features
  all_features = LinearRegressionExperiment(
    df_X=df_X,
    df_y=df_y,
    experiment_number=experiment_number
  )

  # Experiment with all features, normalised at fit time (see below)
  all_features_normalised = LinearRegressionExperiment(
    df_X=df_X,
    df_y=df_y,
    experiment_number=experiment_number
  )

  # Experiment with only the 2 selected features
  two_features = LinearRegressionExperiment(
    df_X=df_X_2_features,
    df_y=df_y,
    experiment_number=experiment_number
  )

  # Fit, predict and evaluate all 3 experiments
  all_features.fit_predict_evaluate()
  all_features_normalised.fit_predict_evaluate(normalise_X=True)
  two_features.fit_predict_evaluate()

  # Store this run's results, labelled with the experiment they belong to
  evaluation_frames.extend([
    all_features.get_evaluation_as_pd('all_features'),
    all_features_normalised.get_evaluation_as_pd('all_features_normalised'),
    two_features.get_evaluation_as_pd('two_features'),
  ])

# One concatenation, preserving the same row order as appending run by run
results_df = pd.concat(evaluation_frames)

In [8]:
# Summarise every (experiment, split) pair with the mean and standard
# deviation of RMSE and R-squared across all runs
aggregated_df = (
  results_df
  .groupby(['experiment', 'split'])
  .agg(
    rmse_mean=pd.NamedAgg(column='rmse', aggfunc='mean'),
    rmse_std=pd.NamedAgg(column='rmse', aggfunc='std'),
    rsquared_mean=pd.NamedAgg(column='rsquared', aggfunc='mean'),
    rsquared_std=pd.NamedAgg(column='rsquared', aggfunc='std'),
  )
  .reset_index()
)

# Show experiments alphabetically, with the train row above the test row
aggregated_df.sort_values(by=['experiment', 'split'], ascending=[True, False])

Unnamed: 0,experiment,split,rmse_mean,rmse_std,rsquared_mean,rsquared_std
1,all_features,train,4.777105,0.150145,0.538882,0.012411
0,all_features,test,4.984308,0.241033,0.521976,0.023865
3,all_features_normalised,train,4.777105,0.150145,0.538882,0.012411
2,all_features_normalised,test,4.984308,0.241033,0.521976,0.023865
5,two_features,train,6.266308,0.201212,0.395219,0.013043
4,two_features,test,6.353889,0.304369,0.39105,0.01954


In [9]:
# Get the results for each experiment.
# Rows are selected with a boolean mask rather than `where(...).dropna()`:
# `where` blanks every non-matching row to NaN first, which upcasts integer
# columns to float, and the following `dropna` would also discard any matching
# row that legitimately contains a NaN.
all_features_results_df = results_df[results_df['experiment'] == 'all_features']
all_features_normalised_results_df = results_df[results_df['experiment'] == 'all_features_normalised']
two_features_results_df = results_df[results_df['experiment'] == 'two_features']

In [10]:
# Create our RMSE charts: shared encodings so all three panels use the same
# axes, scale and train/test colours
rmse_x = alt.X('experiment_number', title="Experiment Number")
rmse_y = alt.Y('rmse', title="RMSE", scale=alt.Scale(domain=[0.0, 10.0]))
rmse_color = alt.Color(
  'split',
  scale=alt.Scale(domain=['train', 'test'], range=['#7BB2D9', '#D62828']),
  legend=alt.Legend(title="Split")
)


def _rmse_chart_and_trend(results, title):
  """Build the per-run RMSE line chart and its linear trend line.

  Args:
    results: DataFrame with `experiment_number`, `rmse` and `split` columns.
    title: Chart title.

  Returns:
    A `(rmse_chart, trend_chart)` tuple of Altair charts.
  """
  rmse_chart = alt.Chart(results, title=title).mark_line().encode(
    x=rmse_x,
    y=rmse_y,
    color=rmse_color,
  ).properties(
    width=480,
    height=300
  )

  # No `groupby` is passed to the regression and the colour is a constant, so
  # a single trend line is fitted over the train and test rows together.
  trend_chart = rmse_chart.transform_regression(
    'experiment_number',
    'rmse',
    method='linear'
  ).mark_line(strokeWidth=2.5).encode(
    color=alt.ColorValue('#FFDC00'),
    opacity=alt.value(1),
  )

  return rmse_chart, trend_chart


# Plot RMSE for all features
all_features_rmse_chart, all_features_rmse_trend_chart = _rmse_chart_and_trend(
  all_features_results_df,
  'All Features Experiment',
)

# Plot RMSE for all features normalised
all_features_normalised_rmse_chart, all_features_normalised_rmse_trend_chart = _rmse_chart_and_trend(
  all_features_normalised_results_df,
  'All Features Normalised Experiment',
)

# Plot RMSE for two features
two_features_rmse_chart, two_features_rmse_trend_chart = _rmse_chart_and_trend(
  two_features_results_df,
  'Two Features Experiment',
)

# Layer each chart with its trend line
all_features_rmse_chart_plus_trend = (all_features_rmse_chart + all_features_rmse_trend_chart)
all_features_normalised_rmse_chart_plus_trend = (all_features_normalised_rmse_chart + all_features_normalised_rmse_trend_chart)
two_features_rmse_chart_plus_trend = (two_features_rmse_chart + two_features_rmse_trend_chart)

# Place the three panels side by side, save a PNG copy and display the result
all_rmse_charts = all_features_rmse_chart_plus_trend | all_features_normalised_rmse_chart_plus_trend | two_features_rmse_chart_plus_trend
all_rmse_charts.save('assets/all_rmse_charts.png', ppi=100)
all_rmse_charts

In [11]:
# Create our R-squared charts: shared encodings so all three panels use the
# same axes, scale and train/test colours
rsquared_x = alt.X('experiment_number', title="Experiment Number")
rsquared_y = alt.Y('rsquared', title="R-squared", scale=alt.Scale(domain=[0.0, 1.0]))
rsquared_color = alt.Color(
  'split',
  scale=alt.Scale(domain=['train', 'test'], range=['#7BB2D9', '#D62828']),
  legend=alt.Legend(title="Split")
)


def _rsquared_chart_and_trend(results, title):
  """Build the per-run R-squared line chart and its linear trend line.

  Args:
    results: DataFrame with `experiment_number`, `rsquared` and `split` columns.
    title: Chart title.

  Returns:
    A `(rsquared_chart, trend_chart)` tuple of Altair charts.
  """
  rsquared_chart = alt.Chart(results, title=title).mark_line().encode(
    x=rsquared_x,
    y=rsquared_y,
    color=rsquared_color,
  ).properties(
    width=480,
    height=300
  )

  # No `groupby` is passed to the regression and the colour is a constant, so
  # a single trend line is fitted over the train and test rows together.
  trend_chart = rsquared_chart.transform_regression(
    'experiment_number',
    'rsquared',
    method='linear'
  ).mark_line(strokeWidth=2.5).encode(
    color=alt.ColorValue('#FFDC00'),
    opacity=alt.value(1),
  )

  return rsquared_chart, trend_chart


# Plot R-squared for all features
all_features_rsquared_chart, all_features_rsquared_trend_chart = _rsquared_chart_and_trend(
  all_features_results_df,
  'All Features Experiment',
)

# Plot R-squared for all features normalised
all_features_normalised_rsquared_chart, all_features_normalised_rsquared_trend_chart = _rsquared_chart_and_trend(
  all_features_normalised_results_df,
  'All Features Normalised Experiment',
)

# Plot R-squared for two features
two_features_rsquared_chart, two_features_rsquared_trend_chart = _rsquared_chart_and_trend(
  two_features_results_df,
  'Two Features Experiment',
)

# Layer each chart with its trend line
all_features_rsquared_chart_plus_trend = (all_features_rsquared_chart + all_features_rsquared_trend_chart)
all_features_normalised_rsquared_chart_plus_trend = (all_features_normalised_rsquared_chart + all_features_normalised_rsquared_trend_chart)
two_features_rsquared_chart_plus_trend = (two_features_rsquared_chart + two_features_rsquared_trend_chart)

# Place the three panels side by side, save a PNG copy and display the result
all_rsquared_charts = all_features_rsquared_chart_plus_trend | all_features_normalised_rsquared_chart_plus_trend | two_features_rsquared_chart_plus_trend
all_rsquared_charts.save('assets/all_rsquared_charts.png', ppi=100)
all_rsquared_charts