In [118]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the possum measurements from disk, report the table size, and preview
# the first five records as the cell's rich output.
df = pd.read_csv('datasets/possum.csv')
print(f"Dataset Shape: {df.shape}")
print("\nFirst few rows of the dataset:")
df.head(n=5)

Dataset Shape: (104, 14)

First few rows of the dataset:


Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


### Show columns with missing values

In [119]:
# Count NaNs per column, then report only the columns that contain any.
missing_values = df.isna().sum()
missing_info = missing_values.to_frame(name='Missing Values')
has_missing = missing_info['Missing Values'].gt(0)
print("Columns with missing values:")
print(missing_info.loc[has_missing])

Columns with missing values:
          Missing Values
age                    2
footlgth               1


### Show duplicated rows

In [120]:
# keep=False marks every member of a duplicate group, not only the repeats,
# so the report shows all copies side by side.
duplicate_mask = df.duplicated(keep=False)
duplicated_rows = df.loc[duplicate_mask]
print(f"Number of fully duplicated rows: {len(duplicated_rows)}")

if len(duplicated_rows) == 0:
    print("\nNo fully duplicated rows found in the dataset.")
else:
    print("\nDuplicated rows:")
    # Sorting on every column places identical rows next to each other.
    print(duplicated_rows.sort_values(by=list(df.columns)))

Number of fully duplicated rows: 0

No fully duplicated rows found in the dataset.


### Drop all rows containing missing values

In [121]:
# Capture the shape before cleaning so the effect of dropna can be reported.
original_shape = df.shape

# Remove, in place, every row that has at least one missing value
# (axis/how are spelled out but are the pandas defaults).
df.dropna(axis=0, how='any', inplace=True)
rows_removed = original_shape[0] - df.shape[0]

print(f"Original dataset shape: {original_shape}")
print(f"Cleaned dataset shape: {df.shape}")
print(f"Number of rows removed: {rows_removed}")

Original dataset shape: (104, 14)
Cleaned dataset shape: (101, 14)
Number of rows removed: 3


### Show unique population values

In [122]:
pop_column = df['Pop']

# Distinct population labels and how many of them there are.
unique_populations = pop_column.unique()
print(f"Unique populations: {len(unique_populations)} - {unique_populations}")

# Number of rows per population label.
population_counts = pop_column.value_counts()
print("\nPopulation counts:")
print(population_counts)

Unique populations: 2 - ['Vic' 'other']

Population counts:
Pop
other    58
Vic      43
Name: count, dtype: int64


### Drop columns

In [123]:
# Drop the listed columns and report the column set before and after.
columns_to_drop = ['case', 'site', 'Pop', 'sex']
original_columns = list(df.columns)
df = df.drop(columns_to_drop, axis=1)

print("Original columns:", original_columns)
print("\nRemaining columns:", list(df.columns))
print("\nNew dataset shape:", df.shape)

Original columns: ['case', 'site', 'Pop', 'sex', 'age', 'hdlngth', 'skullw', 'totlngth', 'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly']

Remaining columns: ['age', 'hdlngth', 'skullw', 'totlngth', 'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly']

New dataset shape: (101, 10)


### Show strongest correlations with belly size

In [124]:
# Correlation of every column with 'belly', highest first.
belly_correlations = df.corr()['belly'].sort_values(ascending=False)

# Remove belly's correlation with itself by label instead of assuming it is
# the first entry after sorting (ties at 1.0 would break that assumption).
other_correlations = belly_correlations.drop(labels='belly')

# nlargest/nsmallest prefer non-NaN values, whereas a positional slice of the
# sorted Series would return a NaN entry (sort_values places NaN last) if any
# column had an undefined correlation.
strongest_positive = other_correlations.nlargest(1)
# NOTE: this is the *minimum* correlation; it is only a genuinely negative
# correlation when its value is below zero.
strongest_negative = other_correlations.nsmallest(1)

print("\nStrongest positive correlation with belly size:")
print(strongest_positive)

print("\nStrongest negative correlation with belly size:")
print(strongest_negative)


Strongest positive correlation with belly size:
chest    0.609757
Name: belly, dtype: float64

Strongest negative correlation with belly size:
earconch    0.071309
Name: belly, dtype: float64


### Decide whether to use scaling or not

In [125]:
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Transposed describe(): one row per feature, one column per statistic.
scale_stats = df.loc[:, numerical_cols].describe().transpose()
print("Scale statistics for numerical features:")
print(scale_stats.loc[:, ['min', 'max', 'mean', 'std']])

# Absolute spread of each feature (max - min), widest first.
ranges = scale_stats['max'].sub(scale_stats['min'])
print("\nRanges of numerical features:")
print(ranges.sort_values(ascending=False))

# Relative spread: standard deviation divided by the mean, largest first.
cv = scale_stats['std'].div(scale_stats['mean'])
print("\nCoefficient of Variation for each feature:")
print(cv.sort_values(ascending=False))

Scale statistics for numerical features:
           min    max       mean       std
age        1.0    9.0   3.821782  1.915182
hdlngth   82.5  103.1  92.730693  3.518714
skullw    50.0   68.6  56.960396  3.102679
totlngth  75.0   96.5  87.269307  4.196802
taill     32.0   43.0  37.049505  1.971681
footlgth  60.3   77.9  68.398020  4.413502
earconch  41.3   56.2  48.133663  4.060352
eye       12.8   17.8  15.050495  1.058643
chest     22.0   32.0  27.064356  2.020722
belly     25.0   40.0  32.638614  2.727745

Ranges of numerical features:
totlngth    21.5
hdlngth     20.6
skullw      18.6
footlgth    17.6
belly       15.0
earconch    14.9
taill       11.0
chest       10.0
age          8.0
eye          5.0
dtype: float64

Coefficient of Variation for each feature:
age         0.501123
earconch    0.084356
belly       0.083574
chest       0.074664
eye         0.070339
footlgth    0.064527
skullw      0.054471
taill       0.053217
totlngth    0.048090
hdlngth     0.037946
dtype: float64


**Range Analysis:**
- The ranges vary significantly:
  - `totlngth` has the largest range (21.5)
  - `eye` has the smallest range (5.0)
  - This is a 4.3x difference between the largest and smallest ranges

**Coefficient of Variation (CV) Analysis:**
- `age` has the highest CV (0.501), indicating high relative variability
- `hdlngth` has the lowest CV (0.038), indicating low relative variability
- This is a roughly 13.2x difference in relative variability (0.501 / 0.038)

**Scaling would be beneficial here: the Ridge penalty is sensitive to feature scale, whereas plain linear regression predictions are unaffected by it.**

### Split the dataset and show the number of records in each subset

In [126]:
# Target is 'totlngth'; every other remaining column is used as a predictor.
y = df['totlngth']
X = df.drop(columns=['totlngth'])

# Hold out 20% of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=16
)

n_total = len(df)
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))
print("\nTotal dataset size:", n_total)
print("Train percentage: {:.1f}%".format(len(X_train) / n_total * 100))
print("Test percentage: {:.1f}%".format(len(X_test) / n_total * 100))

Training set size: 80
Test set size: 21

Total dataset size: 101
Train percentage: 79.2%
Test percentage: 20.8%


### Fit MinMaxScaler on train data. Transform train and test data.

In [127]:
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature min/max from the training split only, then apply the
# same transformation to both splits so no test-set statistics are used.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# transform() returns plain arrays; wrap them back into DataFrames with the
# original column names AND the original row index. Without index=, the frames
# would get a fresh 0..n-1 index that no longer matches y_train / y_test,
# so any later index-aligned operation would misalign.
X_train_scaled = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)

### Fit LinearRegression and Ridge on train data.

In [128]:
from sklearn.linear_model import LinearRegression, Ridge

# Ordinary least squares baseline; fit() returns the estimator itself, so the
# fitted model can be bound in a single statement.
linear_reg = LinearRegression().fit(X_train_scaled, y_train)

# Ridge with its default regularisation strength, trained on the same data.
ridge_reg = Ridge(random_state=16)
ridge_reg.fit(X_train_scaled, y_train)

### Get predictions on test data for both models.

In [129]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict the held-out targets with each fitted model.
y_pred_linear = linear_reg.predict(X_test_scaled)
y_pred_ridge = ridge_reg.predict(X_test_scaled)

# Score both sets of predictions with the same pair of metrics (MSE, R2).
linear_mse, linear_r2 = (
    metric(y_test, y_pred_linear) for metric in (mean_squared_error, r2_score)
)
ridge_mse, ridge_r2 = (
    metric(y_test, y_pred_ridge) for metric in (mean_squared_error, r2_score)
)

print("\nModel Performance on Test Data:")
print(f"Linear Regression - MSE: {linear_mse:.4f}, R2: {linear_r2:.4f}")
print(f"Ridge Regression - MSE: {ridge_mse:.4f}, R2: {ridge_r2:.4f}")


Model Performance on Test Data:
Linear Regression - MSE: 6.6671, R2: 0.6070
Ridge Regression - MSE: 6.2512, R2: 0.6315


### Calculate MAPE for Ridge model.

In [130]:
from sklearn.metrics import mean_absolute_percentage_error

# The metric is multiplied by 100 so it can be printed as a percentage.
mape_linear = 100 * mean_absolute_percentage_error(y_test, y_pred_linear)
mape_ridge = 100 * mean_absolute_percentage_error(y_test, y_pred_ridge)

# Report MAPE for both models side by side.
print("Mean Absolute Percentage Error (MAPE):")
print(f"Linear Regression: {mape_linear:.2f}%")
print(f"Ridge Regression: {mape_ridge:.2f}%")

Mean Absolute Percentage Error (MAPE):
Linear Regression: 2.38%
Ridge Regression: 2.26%


### Calculate MAE for LinearRegression model.

In [131]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of each model's test predictions, in the target's units.
mae_linear, mae_ridge = (
    mean_absolute_error(y_test, predictions)
    for predictions in (y_pred_linear, y_pred_ridge)
)

print("Mean Absolute Error (MAE):")
print(f"Linear Regression: {mae_linear:.4f} cm")
print(f"Ridge Regression: {mae_ridge:.4f} cm")

Mean Absolute Error (MAE):
Linear Regression: 2.0090 cm
Ridge Regression: 1.9130 cm
