In [1]:
# !pip install pycaret
# !pip install mplcyberpunk

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import mplcyberpunk
# plt.style.use("cyberpunk")

from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [3]:
# Read the four input tables in one pass: competition train/test, the original
# abalone dataset, and the sample submission used as the output template.
train, original, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'abalone.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Preview the raw competition training frame as loaded from train.csv.
train

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...,...
90610,90610,M,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6
90611,90611,M,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9
90612,90612,I,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6
90613,90613,I,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,6


## Combining Datasets

In [5]:
# Give the competition training rows the same column names as the original
# dataset (matched by position once 'id' is removed), then stack the two.
train = train.drop(columns='id')
train.columns = original.columns
train = (
    pd.concat([train, original], ignore_index=True)
    .rename_axis('id')   # the fresh 0..n-1 row index becomes the new 'id' column
    .reset_index()
)

# The test frame has no target, so it takes every train column except the last.
test.columns = train.columns[:-1]

# Stack train and test so encoding can be applied to both at once; the rows are
# split apart again by position further down.
combined = pd.concat([train, test])

In [6]:
# One-hot encode `Sex` into Sex_* indicator columns. The raw `Sex` column is
# kept in `combined`; it is only removed later, right before modelling.
combined_dummies = pd.get_dummies(combined['Sex'], prefix='Sex', drop_first=False)

# Drop indicator columns left behind by an earlier run of this cell, so that
# re-executing it replaces them instead of appending duplicate Sex_* columns.
stale_dummy_cols = [col for col in combined_dummies.columns if col in combined.columns]
combined = pd.concat([combined.drop(columns=stale_dummy_cols), combined_dummies], axis=1)

In [7]:
# Undo the temporary stacking: the first len(train) rows of `combined` are the
# training rows, everything after them belongs to the test set.
n_train_rows = len(train)
train, test = combined.iloc[:n_train_rows], combined.iloc[n_train_rows:]

combined

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11.0,1,0,0
1,1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11.0,1,0,0
2,2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6.0,0,1,0
3,3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10.0,0,0,1
4,4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60406,151021,I,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500,,0,1,0
60407,151022,F,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050,,1,0,0
60408,151023,I,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650,,0,1,0
60409,151024,F,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350,,1,0,0


## EDA

In [8]:
def summary(df):
    """Print the shape of `df` and return a per-column overview table.

    The returned frame is indexed by the columns of `df` and contains, in order:
    'data type', '#missing', '%missing', '#unique', the describe() statistics
    'count', 'mean', 'std', 'min', 'max', and the values of the first three rows
    ('first value', 'second value', 'third value').

    Statistics that describe() does not produce for the given frame (for example
    'mean' when no column is numeric) and row values that do not exist (fewer
    than three rows) are reported as missing instead of raising.
    """
    print(f'data shape: {df.shape}')

    n_rows = len(df)
    n_missing = df.isnull().sum()  # computed once, reused for count and percentage

    summ = df.dtypes.to_frame('data type')
    summ['#missing'] = n_missing.values
    summ['%missing'] = n_missing.values / n_rows * 100 if n_rows else float('nan')
    summ['#unique'] = df.nunique().values

    stat_names = ['count', 'mean', 'std', 'min', 'max']
    # reindex() guarantees every statistic column exists; the ones describe()
    # omitted for this frame come back filled with NaN.
    desc = df.describe(include='all').transpose().reindex(columns=stat_names)
    for stat in stat_names:
        summ[stat] = desc[stat].values

    row_labels = ('first value', 'second value', 'third value')
    for position, label in enumerate(row_labels):
        # Short frames simply have no value to show for the later rows.
        summ[label] = df.iloc[position].values if position < n_rows else None

    return summ

In [9]:
# Column-level overview of the training rows (competition data plus original dataset).
summary(train)

data shape: (94792, 13)


Unnamed: 0,data type,#missing,%missing,#unique,count,mean,std,min,max,first value,second value,third value
id,int64,0,0.0,94792,94792.0,47395.5,27364.237696,0.0,94791.0,0,1,2
Sex,object,0,0.0,3,94792.0,,,,,F,F,I
Length,float64,0,0.0,157,94792.0,0.517402,0.118308,0.075,0.815,0.55,0.63,0.16
Diameter,float64,0,0.0,126,94792.0,0.401952,0.098088,0.055,0.65,0.43,0.49,0.11
Height,float64,0,0.0,90,94792.0,0.135643,0.038193,0.0,1.13,0.15,0.145,0.025
Whole weight,float64,0,0.0,3205,94792.0,0.790785,0.459231,0.002,2.8255,0.7715,1.13,0.021
Shucked weight,float64,0,0.0,1806,94792.0,0.341597,0.205267,0.001,1.488,0.3285,0.458,0.0055
Viscera weight,float64,0,0.0,983,94792.0,0.169914,0.101334,0.0005,0.76,0.1465,0.2765,0.003
Shell weight,float64,0,0.0,1132,94792.0,0.226468,0.130639,0.0015,1.005,0.24,0.32,0.005
Rings,float64,0,0.0,28,94792.0,9.707233,3.178704,1.0,29.0,11.0,11.0,6.0


In [10]:
# Same overview for the test rows; `Rings` is entirely missing here because test has no target.
summary(test)

data shape: (60411, 13)


Unnamed: 0,data type,#missing,%missing,#unique,count,mean,std,min,max,first value,second value,third value
id,int64,0,0.0,60411,60411.0,120820.0,17439.297893,90615.0,151025.0,90615,90616,90617
Sex,object,0,0.0,3,60411.0,,,,,M,M,M
Length,float64,0,0.0,148,60411.0,0.517428,0.117609,0.075,0.8,0.645,0.58,0.56
Diameter,float64,0,0.0,130,60411.0,0.401961,0.09747,0.055,0.65,0.475,0.46,0.42
Height,float64,0,0.0,85,60411.0,0.135751,0.038175,0.0,1.095,0.155,0.16,0.14
Whole weight,float64,0,0.0,3037,60411.0,0.790062,0.457591,0.002,2.8255,1.238,0.983,0.8395
Shucked weight,float64,0,0.0,1747,60411.0,0.341227,0.204221,0.001,1.488,0.6185,0.4785,0.3525
Viscera weight,float64,0,0.0,960,60411.0,0.169419,0.10072,0.0005,0.6415,0.3125,0.2195,0.1845
Shell weight,float64,0,0.0,1089,60411.0,0.226125,0.129826,0.0015,1.004,0.3005,0.275,0.2405
Rings,float64,60411,100.0,0,0.0,,,,,,,


## Feature Engineering

In [11]:
def add_features(df):
    """Return a new frame with the engineered columns; `df` itself is not modified.

    - 'Whole weight Ratio': 'Shucked weight' divided by 'Whole weight'
      (appended as the last column).
    - 'Sex': converted to the pandas `category` dtype, keeping its position.
    """
    return df.assign(**{
        'Whole weight Ratio': df['Shucked weight'] / df['Whole weight'],
        'Sex': df['Sex'].astype('category'),
    })

In [12]:
# Apply the same feature engineering to the training and the test rows.
train2, test2 = (add_features(frame) for frame in (train, test))

In [13]:
# Columns that must not reach the models: the row id, the target itself, and
# the raw `Sex` column (its one-hot Sex_* indicators stay in the frame).
non_feature_cols = ['id', 'Rings', 'Sex']

# The target is read from `train` (add_features copies 'Rings' into train2
# unchanged), and the drops rebind instead of mutating in place, so this cell
# can be re-run after 'Rings' has already been removed from train2.
target = train['Rings']
train2 = train2.drop(columns=non_feature_cols, errors='ignore')
test2 = test2.drop(columns=non_feature_cols, errors='ignore')

In [14]:
# Inspect the final model input: numeric features, Sex_* indicators and the engineered ratio.
train2

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex_F,Sex_I,Sex_M,Whole weight Ratio
0,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,1,0,0,0.425794
1,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,1,0,0,0.405310
2,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,0,1,0,0.261905
3,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,0,0,1,0.410607
4,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,0,1,0,0.472506
...,...,...,...,...,...,...,...,...,...,...,...
94787,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,1,0,0,0.417136
94788,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,0,0,1,0.454451
94789,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,0,0,1,0.446854
94790,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,1,0,0,0.485153


In [16]:
# Random seed; passed as `random_state` in every model's parameter dict.
SEED = 27
# NOTE(review): N_SPLITS and N_REPEATS are not referenced in the visible cells —
# presumably intended for repeated K-fold cross-validation; confirm or remove.
N_SPLITS = 10
N_REPEATS = 5

## Hyperparameter Tuning

In [19]:
# import optuna
# import numpy as np
# import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error
# from sklearn.model_selection import train_test_split

# # Define objective function for optimization
# def objective(trial):
#     # Define hyperparameters to be optimized
#     params = {
#         'iterations': trial.suggest_int('iterations', 100, 1000),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
#         'depth': trial.suggest_int('depth', 4, 10),
#         'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10),
#         'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
#         'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10),
#         'border_count': trial.suggest_int('border_count', 32, 255),
#         'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide'])
#     }

#     # Split the data
#     X_train, X_val, y_train, y_val = train_test_split(train2, target, test_size=0.2, random_state=42)

#     # Fit the CatBoost model
#     model = CatBoostRegressor(**params, verbose=False)
#     model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)

#     # Predict and calculate RMSLE
#     y_pred = model.predict(X_val)
#     rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))

#     return rmsle

# # Optimize hyperparameters using Optuna
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Get the best parameters
# best_params = study.best_params
# print("Best parameters:", best_params)

# # Train the model with the best parameters
# best_model = CatBoostRegressor(**best_params)
# best_model.fit(train2, target)

#### Catboost Regressor
- No log transformation
- Optuna HyperTuned
- Best RMSLE : 0.1497832158640954

Best parameters: {'iterations': 934, 'learning_rate': 0.05444261258765246, 'depth': 7, 'l2_leaf_reg': 2.635123463408289, 'random_strength': 1.5429959257899784, 'bagging_temperature': 1.20407627575537, 'border_count': 231, 'grow_policy': 'Depthwise'}


In [20]:
import lightgbm as lgb
import optuna
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_log_error

# # Load train dataset
# train_data = train2

# # Load target dataset
# target_data = target

# # Split data into train and validation sets
# X_train, X_val, y_train, y_val = train_test_split(train_data, target_data, test_size=0.2, random_state=42)

# # Define objective function for Optuna
# def objective(trial):
#     params = {
#         'objective': 'regression',
#         'metric': 'rmsle',
#         'verbosity': -1,
#         'boosting_type': 'gbdt',
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
#         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
#         'num_iterations': trial.suggest_int('num_iterations', 50, 1000)
#     }

#     model = lgb.LGBMRegressor(**params)
#     model.fit(X_train, y_train)

#     preds = model.predict(X_val)
#     rmsle = np.sqrt(mean_squared_log_error(y_val, preds))
    
#     return rmsle

# # Create study object and optimize hyperparameters
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Get best parameters
# best_params = study.best_params
# print("Best Params:", best_params)

# # Train model with best parameters
# best_model = lgb.LGBMRegressor(**best_params)
# best_model.fit(X_train, y_train)

# # Predict
# predictions = best_model.predict(X_val)
# rmsle = np.sqrt(mean_squared_log_error(y_val, predictions))
# print("RMSLE on validation set:", rmsle)


#### LGBM
- No log transformation on target
- Optuna Hyperparameter Tuned
- Best RMSLE : 0.14965236297868925

{'lambda_l1': 0.0001165563455669194, 'lambda_l2': 4.681924963834088e-07, 'num_leaves': 113, 'feature_fraction': 0.43460388002303074, 'bagging_fraction': 0.9580391806195645, 'bagging_freq': 4, 'min_child_samples': 37, 'learning_rate': 0.021506924754963613, 'num_iterations': 777}.


In [22]:
# import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error
# import numpy as np

# # Define objective function
# def objective(trial):
#     param = {
#         'objective': 'reg:squaredlogerror',
#         'eval_metric': 'rmsle',
#         'booster': 'gbtree',
#         'verbosity': 0,
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
#         'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
#         'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
#         'tree_method': 'hist',  # to make training faster
#         'random_state': 42  # for reproducibility
#     }
    
#     model = xgb.XGBRegressor(**param)
    
#     # Training
#     model.fit(train2, target)
    
#     # Prediction
#     y_pred = model.predict(train2)
    
#     # Calculate RMSLE
#     rmsle = np.sqrt(mean_squared_error(np.log1p(target), np.log1p(y_pred)))
    
#     return rmsle

# # Create study object and optimize objective function
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Print best parameters and value
# print('Best trial:')
# trial = study.best_trial
# print('  Value: {:.3f}'.format(trial.value))
# print('  Params: ')
# for key, value in trial.params.items():
#     print('    {}: {}'.format(key, value))


#### XGB
- No log transformation on target
- Optuna Hyperparameter Tuned
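- Best RMSLE : 0.1007144936940495 (measured on the training data itself: the tuning objective fits and predicts on the full `train2` with no hold-out split, so this value is not comparable to the validation RMSLEs reported for the other models)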

{'n_estimators': 905, 'learning_rate': 0.2992405403071614, 'max_depth': 10, 'subsample': 0.9034383329050202, 'colsample_bytree': 0.8590433169584558, 'lambda': 3.1346614100174675e-08, 'alpha': 0.1318778920589813}


In [27]:
# import optuna
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.metrics import mean_squared_log_error

# # Define objective function for optimization
# def objective(trial):
#     # Split data into train and validation sets
#     X_train, X_val, y_train, y_val = train_test_split(train2, target, test_size=0.2, random_state=42)
    
#     # Define hyperparameters to tune
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
#         'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
#         'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
#         'max_features': trial.suggest_uniform('max_features', 0.1, 1.0)
#     }
    
#     # Initialize Gradient Boosting Regressor with parameters from Optuna
#     model = GradientBoostingRegressor(**params, random_state=42)
    
#     # Fit the model
#     model.fit(X_train, y_train)
    
#     # Predict on validation set
#     y_pred = model.predict(X_val)
    
#     # Calculate RMSLE
#     rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))
    
#     return rmsle

# # Create Optuna study
# study = optuna.create_study(direction='minimize')

# # Optimize the objective function
# study.optimize(objective, n_trials=100)

# # Print the results
# print('Best trial:')
# best_trial = study.best_trial
# print('  Value: {:.4f}'.format(best_trial.value))
# print('  Params: ')
# for key, value in best_trial.params.items():
#     print('    {}: {}'.format(key, value))


#### GB
- No log transformation on target
- Optuna Hyperparameter Tuned
- Best RMSLE : 0.1494448549316116

{'n_estimators': 982, 'learning_rate': 0.05049075520804626, 'max_depth': 7, 'min_samples_split': 16, 'min_samples_leaf': 4, 'subsample': 0.8431225297813086, 'max_features': 0.5193469242711821}


In [18]:
# Final hyperparameters for the four models. The tuned values are the ones
# reported by the Optuna studies above; each dict additionally pins
# `random_state` to SEED, and CatBoost / LightGBM are silenced via `verbose`.

cb_params = {
    'iterations': 934,
    'learning_rate': 0.05444261258765246,
    'depth': 7,
    'l2_leaf_reg': 2.635123463408289,
    'random_strength': 1.5429959257899784,
    'bagging_temperature': 1.20407627575537,
    'border_count': 231,
    'grow_policy': 'Depthwise',
    'verbose': 0,
    'random_state': SEED,
}

lgbm_params = {
    'lambda_l1': 0.0001165563455669194,
    'lambda_l2': 4.681924963834088e-07,
    'num_leaves': 113,
    'feature_fraction': 0.43460388002303074,
    'bagging_fraction': 0.9580391806195645,
    'bagging_freq': 4,
    'min_child_samples': 37,
    'learning_rate': 0.021506924754963613,
    'num_iterations': 777,
    'verbose': -1,
    'random_state': SEED,
}

# NOTE(review): the XGB study was run with 'objective': 'reg:squaredlogerror'
# and 'tree_method': 'hist', neither of which is carried over here, and it
# scored each trial on the same rows it was fitted on — re-check these values
# against a held-out split.
xgb_params = {
    'n_estimators': 905,
    'learning_rate': 0.2992405403071614,
    'max_depth': 10,
    'subsample': 0.9034383329050202,
    'colsample_bytree': 0.8590433169584558,
    'lambda': 3.1346614100174675e-08,
    'alpha': 0.1318778920589813,
    'random_state': SEED,
}

gb_params = {
    'n_estimators': 982,
    'learning_rate': 0.05049075520804626,
    'max_depth': 7,
    'min_samples_split': 16,
    'min_samples_leaf': 4,
    'subsample': 0.8431225297813086,
    'max_features': 0.5193469242711821,
    'random_state': SEED,
}

In [19]:
# Build each regressor from its parameter dict, then fit it on the complete
# training set (no rows are held out at this stage).
cb_model = CatBoostRegressor(**cb_params)
cb_model.fit(train2, target)

xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(train2, target)

lgbm_model = LGBMRegressor(**lgbm_params)
lgbm_model.fit(train2, target)

gb_model = GradientBoostingRegressor(**gb_params)
gb_model.fit(train2, target)



In [20]:
# (name, fitted model) pairs in blending order.
# NOTE(review): not consumed by any visible cell — presumably prepared for a
# VotingRegressor; confirm or remove.
estimators = list(zip(
    ('xgb', 'cb', 'lgbm', 'gb'),
    (xgb_model, cb_model, lgbm_model, gb_model),
))

In [21]:
# Blend weights, one per model in the order xgb, cb, lgbm, gb (the same
# coefficients are applied to predictions1-4 in the blending cell).
weights = [0.25072415, 0.24954255, 0.24983537, 0.24989793]


In [26]:
# Test-set predictions, numbered in blending order: xgb, cb, lgbm, gb.
predictions1, predictions2, predictions3, predictions4 = (
    fitted.predict(test2)
    for fitted in (xgb_model, cb_model, lgbm_model, gb_model)
)

In [27]:
# Weighted blend of the four models' test predictions. The coefficients are
# taken from the `weights` list (ordered xgb, cb, lgbm, gb, matching
# predictions1-4) rather than typed out a second time, so the two cannot drift apart.
model_predictions = (predictions1, predictions2, predictions3, predictions4)
assert len(weights) == len(model_predictions), "one weight per model is required"
final_predictions = sum(w * preds for w, preds in zip(weights, model_predictions))

In [31]:
# Put the blended predictions into the sample-submission template and save it.
submission = submission.assign(Rings=final_predictions)
submission.to_csv("Submission5.csv", header=True, index=False)