In [232]:
# imports

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt

In [233]:
# train_df = pd.read_csv("../data/transformed/train_outliers_removed.csv", index_col=0)
train_df = pd.read_csv("../data/transformed/train_outliers_removed.csv", index_col=0)
original_df = pd.read_csv("../data/transformed/original_outliers_removed.csv")
test_df = pd.read_csv("../data/input/test.csv", index_col=0)

train_df['source'] = 'train'
original_df['source'] = 'original'

# train_df = train_df[train_df.age > 2]

In [234]:
# Handling duplicates
cols = ["length", "diameter", "height", "weight", "shucked_weight", "viscera_weight", "shell_weight", "age"]

df = pd.concat([train_df, original_df])

grp = df.groupby(cols)['source']

df['source_cnt'] = grp.transform(lambda x: len(set(x)))

In [235]:
# Records exists in both train and original
unique_records = df[df['source_cnt'] == 1]
records_to_keep_from_dups = df[(df['source_cnt'] > 1) & (df['source'] == 'original')]


unique_df = pd.concat([unique_records, records_to_keep_from_dups])

df.shape[0] - unique_df.shape[0]

162

In [236]:
grp1 = unique_df.groupby(cols)['sex']

unique_df['sex_cnt'] = grp1.transform(lambda x: len(set(x)))

In [237]:
unique_df1 = unique_df[(unique_df['sex_cnt'] == 1)]

unique_df1 = unique_df1.drop('source', axis = 1)

In [238]:
df.shape[0] - unique_df1.shape[0]

267

## Train Test Split

In [239]:
df = unique_df1.copy()

target = 'age'
features = df.columns.drop(target).tolist()

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## Feature Engineering

#### Encode sex values

In [240]:
from sklearn.preprocessing import OneHotEncoder

oh_encoder = OneHotEncoder(sparse_output=True, dtype='int')
oh_encoder.fit(X_train[['sex']])

sex_columns = ["sex" + "_" + col for col in oh_encoder.categories_[0]]

X_train[sex_columns] = oh_encoder.transform(X_train[['sex']]).toarray()
X_train = X_train.drop('sex', axis = 1)

X_test[sex_columns] = oh_encoder.transform(X_test[['sex']]).toarray()
X_test = X_test.drop('sex', axis = 1)

#### Feature Scaling

In [241]:
from sklearn.preprocessing import RobustScaler

columns_to_scale = ['length', 'diameter', 'height', 'weight', 'shucked_weight', 'viscera_weight', 'shell_weight']

scaler = RobustScaler()
scaler.fit(X_train[columns_to_scale])


X_train[columns_to_scale] = scaler.transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

## Modelling

In [242]:
def evaluate(model, train, test):    
    X_train, y_train = train
    X_test, y_test = test
    
    y_train_predicted = model.predict(X_train)
    y_test_predicted = model.predict(X_test)

    train_mae = mean_absolute_error(y_train, y_train_predicted)
    test_mae = mean_absolute_error(y_test, y_test_predicted)

    print(f"Train MAE: {train_mae}")
    print(f"Test MAE: {test_mae}")

### Baseline Model (Without additional Feature Engineering)

In [243]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(max_leaf_nodes=90)

rfr.fit(X_train, y_train)

evaluate(rfr, (X_train, y_train), (X_test, y_test))

Train MAE: 1.413025905088535
Test MAE: 1.4204740968676537


## Predict on Test Unseen

In [244]:
X_test_unseen = test_df.copy()
X_test_unseen.columns = features

X_test_unseen[sex_columns] = oh_encoder.transform(X_test_unseen[['sex']]).toarray()
X_test_unseen = X_test_unseen.drop('sex', axis = 1)

X_test_unseen[columns_to_scale] = scaler.transform(X_test_unseen[columns_to_scale])

ValueError: Length mismatch: Expected axis has 8 elements, new values have 10 elements

In [None]:
from datetime import datetime

model = rfr
model_name = "random_forest_regressor"

now = datetime.now().strftime("%Y%m%d_%H%M%S")

output = pd.read_csv("../data/input/sample_submission.csv", index_col=0)
output['Age'] = model.predict(X_test_unseen)

# output.to_csv(f"../data/output/crab_age_regressor_{now}.csv", index=True)