### 1. Load Data


In [13]:
import pandas as pd
import altair as alt
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split

# Download the Abalone dataset (UCI ML Repository, dataset id 1).
abalone = fetch_ucirepo(id=1)

# Pull the predictor table and the target table out of the fetched bundle.
X, y = abalone.data.features, abalone.data.targets

# Join predictors and target column-wise so the raw data can be inspected as one table.
df = pd.concat(objs=[X, y], axis="columns")
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### 2. Data Wrangling and Cleaning


In [14]:
# Count nulls in every column, then report only the columns that actually contain some.
missing_values = df.isna().sum()
columns_with_missing = missing_values.loc[missing_values.gt(0)]
print("Missing values per column:", columns_with_missing)

# Column dtypes and non-null counts for the combined table.
df.info()

Missing values per column: Series([], dtype: int64)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole_weight    4177 non-null   float64
 5   Shucked_weight  4177 non-null   float64
 6   Viscera_weight  4177 non-null   float64
 7   Shell_weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=522)
X_train, X_test, y_train, y_test = split_parts

# EDA is done on the training rows only, so nothing is learned from the test set.
train_df = pd.concat(objs=[X_train, y_train], axis="columns")

print(f"Training set size: {train_df.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Training set size: 3341
Test set size: 836


### 3. Exploratory Data Analysis (EDA)


In [16]:
# Descriptive statistics of the training data, rounded to two decimals for display.
train_summary = train_df.describe()
train_summary.round(decimals=2)

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
count,3341.0,3341.0,3341.0,3341.0,3341.0,3341.0,3341.0,3341.0
mean,0.52,0.41,0.14,0.82,0.36,0.18,0.24,9.93
std,0.12,0.1,0.04,0.49,0.22,0.11,0.14,3.25
min,0.08,0.06,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.45,0.35,0.12,0.44,0.18,0.09,0.13,8.0
50%,0.55,0.42,0.14,0.8,0.33,0.17,0.23,9.0
75%,0.62,0.48,0.16,1.14,0.5,0.25,0.32,11.0
max,0.82,0.65,0.52,2.83,1.49,0.76,1.0,29.0


**Table 2**: Summary statistics (count, mean, std, min, max, quartiles) for numerical features in the training set.

#### Visualization

In [17]:
# Pairwise correlations between the float64/int64 columns of the training data.
# reset_index() turns the row labels into an 'index' column so the matrix can be melted.
numeric_train = train_df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_train.corr().reset_index()

# Long format: one row per (index, variable2) pair together with its correlation value.
corr_df = corr_matrix.melt(
    id_vars='index',
    var_name='variable2',
    value_name='correlation',
)

# Colour scale pinned to the full [-1, 1] range a correlation can take.
correlation_scale = alt.Scale(scheme='blueorange', domain=[-1, 1])

heatmap = (
    alt.Chart(corr_df)
    .mark_rect()
    .encode(
        x=alt.X('index', title=None),
        y=alt.Y('variable2', title=None),
        color=alt.Color('correlation', scale=correlation_scale),
        tooltip=['index', 'variable2', 'correlation'],
    )
    .properties(title='Correlation Matrix', width=400, height=400)
)

heatmap

**Figure 1**: Heatmap showing the correlation between different numerical features and the target variable `Rings`. Darker orange indicates strong positive correlation; darker blue indicates strong negative correlation.

In [18]:
# Shell weight against ring count: one semi-transparent point per training row, coloured by Sex.
shell_weight_axis = alt.X('Shell_weight', title='Shell Weight (g)')
rings_axis = alt.Y('Rings', title='Rings (Age)')

scatter = (
    alt.Chart(train_df)
    .mark_circle(opacity=0.5)
    .encode(x=shell_weight_axis, y=rings_axis, color='Sex')
    .properties(title='Rings vs Shell Weight by Sex', width=500)
)
scatter

**Figure 2**: Scatter plot of Shell Weight (x-axis) versus Rings (y-axis), colored by Sex. There is a positive relationship between shell weight and the number of rings.

*Observation*: There is a clear positive correlation between Shell Weight and Rings. The relationship seems somewhat linear but with increasing variance as weight increases.

### 4. Regression Analysis
We will use a Linear Regression model to predict the number of rings. We construct a pipeline that:
1. One-hot encodes the categorical `Sex` feature.
2. Scales the numerical features.
3. Applies Linear Regression.

In [19]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define features
# Continuous measurements are standardised; the single categorical column is one-hot encoded.
numeric_features = ['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight']
categorical_features = ['Sex']

# Create preprocessor
# Sex has three levels (M, F, I), so drop='if_binary' would drop nothing and keep
# all three dummy columns. Those columns sum to 1 in every row, which makes them
# perfectly collinear with the intercept of the unregularised LinearRegression
# below and leaves its coefficients non-identifiable. drop='first' removes one
# level so it acts as the reference category.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(drop='first'), categorical_features)
)

# Create and fit pipeline
# Preprocessing lives inside the pipeline, so the scaler and encoder are fitted
# on the training rows only.
model = make_pipeline(preprocessor, LinearRegression())
model.fit(X_train, y_train)

# Make predictions
# The same fitted preprocessing is applied to the held-out rows before predicting.
y_pred = model.predict(X_test)

### 5. Visualization of Results
We evaluate the model by plotting the Predicted vs. Actual values.

In [20]:
# Hold-out performance of the fitted model.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

# One row per test observation; both inputs are flattened to 1-D before being paired up.
actual_values = y_test.to_numpy().ravel()
predicted_values = y_pred.ravel()
results_df = pd.DataFrame({'Actual': actual_values, 'Predicted': predicted_values})

# Scatter of predicted against actual ring counts, with R2 shown in the title.
pred_chart = (
    alt.Chart(results_df)
    .mark_circle(opacity=0.5)
    .encode(
        x=alt.X('Actual', title='Actual Rings'),
        y=alt.Y('Predicted', title='Predicted Rings'),
    )
    .properties(
        title=f'Actual vs Predicted Rings (R2 = {r2:.2f})',
        width=500,
        height=500,
    )
)

# Dashed red y = x reference line drawn from (0, 0) to (30, 30).
diagonal_df = pd.DataFrame({'x': [0, 30], 'y': [0, 30]})
line = (
    alt.Chart(diagonal_df)
    .mark_line(color='red', strokeDash=[5,5])
    .encode(x='x', y='y')
)

pred_chart + line

Mean Squared Error (MSE): 5.4844
R-squared (R2): 0.4427


**Figure 3**: Actual Rings (x-axis) vs Predicted Rings (y-axis). The red dashed line represents perfect prediction. Points above the line indicate over-prediction (predicted > actual), while points below indicate under-prediction (predicted < actual).