### Imports and Loading data

In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
tf.experimental.numpy.experimental_enable_numpy_behavior()

In [2]:
modelslist = [] # Array to hold successful models for ensemble/bag methods
traindf = pd.read_csv('data/train.csv')
testdf = pd.read_csv('data/test.csv')

In [None]:
traindf.head() # Viewing sample rows of data

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


### Preprocessing

In [None]:
traindf.drop(['id'], axis=1, inplace=True) # Eliminating redundant id column
testdf = testdf.set_index('id')

# One-hot encoding the sex column in train and test data
traindf['Sex'] = traindf['Sex'].map({'F': 'Female', 'I': 'Idk', 'M': 'Male'}) # 
traindf = pd.get_dummies(traindf, columns=['Sex'], prefix='', prefix_sep='', dtype=np.float32)

testdf['Sex'] = testdf['Sex'].map({'F': 'Female', 'I': 'Idk', 'M': 'Male'})
testdf = pd.get_dummies(testdf, columns=['Sex'], prefix='', prefix_sep='', dtype=np.float32)

# Defining the target for the model
target = traindf['Rings']
traindf.drop(['Rings'], axis=1, inplace=True)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(traindf, target, test_size=0.2, random_state=69) # Creating validation subset

## Utility Functions

In [None]:
def rmsle(y_true, y_pred): 
    """
    Computes the Root Mean Squared Logarithmic Error (RMSLE) between the true and predicted values.

    Parameters:
    y_true (array-like): Array of true values.
    y_pred (array-like): Array of predicted values.

    Returns:
    tf.Tensor: The RMSLE value as a TensorFlow tensor.
    """
    return tf.sqrt(tf.reduce_mean((tf.math.log(1+y_pred.astype('float32'))-tf.math.log(1+y_true.astype('float32')))**2))

In [None]:
def get_submission(model, testdf):
    """
    Generates a submission file for a Kaggle competition.
    This function takes a trained model and a test DataFrame, makes predictions,
    and outputs a CSV file named 'submission.csv' with the predictions.
    Args:
        model: The trained model used for making predictions.
        testdf (pd.DataFrame): The test DataFrame containing the features for prediction.
    """
    predictions = model.predict(testdf)
    rounded_predictions = predictions
    outputdf = pd.DataFrame(rounded_predictions, columns=['Rings'])
    outputdf['id'] = testdf.index

    cols = list(outputdf.columns)
    cols.reverse()
    outputdf = outputdf[cols]
    outputdf.to_csv('submission.csv', index=False)
    

## XGBoost Regression

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV

In [None]:
def evaluate_xgb_model(xgb_model, X_val, y_val):
    """
    Evaluates one XGBoost model using Root Mean Squared Logarithmic Error (RMSLE).
    Parameters:
    xgb_model (xgboost.Booster): Trained XGBoost model to be evaluated.
    X_val (pd.DataFrame or np.ndarray): Validation features.
    y_val (pd.Series or np.ndarray): True values for the validation set.
    Returns:
    None
    Prints:
    - Sum of NaN values in the logarithm of the true values (y_test_log).
    - Sum of NaN values in the logarithm of the predicted values (y_pred_log).
    - Root Mean Squared Logarithmic Error (RMSLE) of the predictions.
    """
    y_pred = xgb_model.predict(X_val)
    y_pred[y_pred < 0] = 0
    y_test_log = np.log1p(y_val) 
    y_pred_log = np.log1p(y_pred)

    print("Sum of NaN values in y_test_log:", np.isnan(y_test_log).sum())
    print("Sum of NaN values in y_pred_log:", np.isnan(y_pred_log).sum())

    rmsle = root_mean_squared_error(y_test_log, y_pred_log) 
    print("RMSLE:", rmsle)

In [None]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [340],
    'max_depth': [6],
    'subsample': [0.95],
    'eta': [0.2],
    'alpha': [0.1],
    'booster': ['gbtree']
}

# Initialize the XGBoost regressor with the specified objective
xgbr = xgb.XGBRegressor(objective='reg:squaredlogerror')

# Initialize GridSearchCV with the XGBoost regressor, parameter grid, and other settings
grid_search = GridSearchCV(xgbr, param_grid, scoring='neg_root_mean_squared_log_error', cv=5, verbose=2, n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(traindf, target)

# Print the best parameters and score from the grid search
print("Best Params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[CV] END alpha=0.1, booster=gbtree, eta=0.2, max_depth=6, n_estimators=340, subsample=0.95; total time=   2.1s
[CV] END alpha=0.1, booster=gbtree, eta=0.2, max_depth=6, n_estimators=340, subsample=0.95; total time=   2.1s
[CV] END alpha=0.1, booster=gbtree, eta=0.2, max_depth=6, n_estimators=340, subsample=0.95; total time=   2.1s
[CV] END alpha=0.1, booster=gbtree, eta=0.2, max_depth=6, n_estimators=340, subsample=0.95; total time=   2.1s
[CV] END alpha=0.1, booster=gbtree, eta=0.2, max_depth=6, n_estimators=340, subsample=0.95; total time=   2.1s
Best Params: {'alpha': 0.1, 'booster': 'gbtree', 'eta': 0.2, 'max_depth': 6, 'n_estimators': 340, 'subsample': 0.95}
Best Score: -0.14905989556212912


In [None]:
# Fit the XGBoost model to the training data
xgbr.fit(X_train, y_train)

# Evaluate the XGBoost model on the validation set
evaluate_xgb_model(xgbr, X_val, y_val)

# Evaluate the best model from GridSearchCV on the validation set
evaluate_xgb_model(best_model, X_val, y_val)

# Append the best model to the models list for ensemble methods
modelslist.append(best_model)

# Generate submission file using the best model
get_submission(best_model, testdf)

Sum of NaN values in y_test_log: 0
Sum of NaN values in y_pred_log: 0
RMSLE: 0.15699401727509213




## LGBM Model

Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500, 'num_leaves': 31}
RMSLE: 0.152062916732084

In [None]:
import lightgbm as lgb

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [500, 600, 700],
    'learning_rate': [0.08],
    'num_leaves': [43],
    'max_depth': [5],
    'min_data_in_leaf': [19],
    'lambda_l1': [0.1, 0.2],
    'lambda_l2': [0.1, 0.2],
    'min_gain_to_split': [0.1, 0.2],
    'bagging_fraction': [0.9, 0.85],
    'bagging_freq': [3, 5],
    'feature_fraction': [1],
    "colsample_bytree": [0.7],
}

# Initialize the LightGBM regressor with the specified objective
model = lgb.LGBMRegressor(n_jobs=-1, objective='regression')

# Initialize GridSearchCV with the LightGBM regressor, parameter grid, and other settings
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_root_mean_squared_log_error')

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and score from the grid search
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

In [None]:
get_submission(best_model, testdf) # Getting submission

param_grid = {
    'n_estimators': [500],
    'learning_rate': [0.1],
    'num_leaves': [31],
    'max_depth': [5],
    'min_data_in_leaf': [19],
    'lambda_l1': [0.1],
    'lambda_l2': [0.1],
    'min_gain_to_split': [0.1],
    'bagging_fraction': [0.9],
    'bagging_freq': [3],
    'feature_fraction': [0.9],
    'max_bin': [255],
    "colsample_bytree": [0.7],
}
0.15186426150275126

Best Parameters: {'bagging_fraction': 0.9, 'bagging_freq': 3, 'colsample_bytree': 0.7, 'feature_fraction': 0.9, 'lambda_l1': 0.1, 'lambda_l2': 0.1, 'learning_rate': 0.1, 'max_bin': 255, 'max_depth': 5, 'min_data_in_leaf': 19, 'min_gain_to_split': 0.1, 'n_estimators': 500, 'num_leaves': 31}


## Finalising Stuff

In [None]:
def vote_score(modelslist, y_val, X_val):
    """
    Aggregates predictions from multiple models using a voting mechanism and calculates the Root Mean Squared Logarithmic Error (RMSLE).
    Parameters:
    modelslist (list): A list of trained models that support the predict method.
    y_val (array-like): The true values for the validation set.
    X_val (array-like): The input features for the validation set.
    Returns:
    None: Prints the RMSLE and the sum of NaN values in the log-transformed true and predicted values.
    """
    predictionslist = []
    for model in modelslist:
        predictions = model.predict(X_val)
        rounded_predictions = np.round(predictions).astype(int)
        rounded_predictions = rounded_predictions.flatten()
        predictionslist.append(rounded_predictions)
    
    finaloutput = []
    for index, prediction in enumerate(predictionslist[0]):
        if predictionslist[1][index] == predictionslist[2][index] == predictionslist[0][index]:
            finaloutput.append(prediction)
        elif predictionslist[0][index] == predictionslist[1][index]:
            finaloutput.append(predictionslist[0][index])
        elif predictionslist[0][index] == predictionslist[2][index]:
            finaloutput.append(predictionslist[0][index])
        elif predictionslist[1][index] == predictionslist[2][index]:
            finaloutput.append(predictionslist[1][index])
        else:
            roundedoutput = np.round(np.mean([predictionslist[0][index], predictionslist[1][index], predictionslist[2][index]])).astype(int)
            finaloutput.append(roundedoutput)
    
    y_test_log = np.log1p(y_val)
    y_pred_log = np.log1p(finaloutput)

    print("Sum of NaN values in y_test_log:", np.isnan(y_test_log).sum())
    print("Sum of NaN values in y_pred_log:", np.isnan(y_pred_log).sum())

    rmsle = root_mean_squared_error(y_test_log, y_pred_log) 
    print("RMSLE:", rmsle)