In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

In [ ]:
# Load the competition's train and test CSVs from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/ml-olympiad-predicting-earthquake-damage/train.csv')
df_test = pd.read_csv('/kaggle/input/ml-olympiad-predicting-earthquake-damage/test.csv')

# Report the dimensions (rows, columns) of each frame as a quick sanity check.
print(f'The Train dataset has {df_train.shape[0]} rows and {df_train.shape[1]} columns')
print(f'The Test dataset has {df_test.shape[0]} rows and {df_test.shape[1]} columns')

In [ ]:
# Preview a few sample rows: the tail of the training frame and the head of
# the test frame (pandas' default row count for tail()/head()).
display('Train:',df_train.tail())
display('Test:',df_test.head())

In [ ]:
def summary(df):
    """Build a one-row-per-column overview table for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with the columns
        ``data type``, ``#missing``, ``Duplicate``, ``#unique``, ``min``,
        ``max``, ``avg``, ``std dev``, ``top value`` and ``Freq``.
        ``Duplicate`` is the number of fully duplicated rows in ``df`` and is
        therefore the same value on every row. Statistics that do not apply
        to a column (e.g. ``min`` for text, ``top value`` for numbers) are NaN.
    """
    # Output label -> name of the statistic produced by DataFrame.describe().
    stat_names = {
        'min': 'min',
        'max': 'max',
        'avg': 'mean',
        'std dev': 'std',
        'top value': 'top',
        'Freq': 'freq',
    }

    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values
    summ['Duplicate'] = df.duplicated().sum()
    summ['#unique'] = df.nunique().values

    # describe(include='all') omits 'top'/'freq' when there are no object
    # columns and 'min'/'max'/'mean'/'std' when there are no numeric ones.
    # Reindexing guarantees every expected statistic exists (NaN-filled when
    # absent) instead of raising KeyError on single-kind frames.
    desc = df.describe(include='all').transpose()
    desc = desc.reindex(columns=list(stat_names.values()))
    for label, stat in stat_names.items():
        summ[label] = desc[stat].values

    return summ

In [ ]:
# Profile every training column except the building_id identifier and render
# the resulting table with a background colour gradient.
summary(df_train.drop(columns=["building_id"])).style.background_gradient()

In [ ]:
# Reload the raw CSVs so the following cells start from unmodified data, and
# load the sample submission file alongside them.
# NOTE(review): the original comment said this reverts earlier changes "on BMI
# etc.", but nothing visible in this notebook modifies the frames or mentions
# BMI -- it looks copied from another notebook; confirm the reload is needed.
df_train = pd.read_csv('/kaggle/input/ml-olympiad-predicting-earthquake-damage/train.csv')
sample_sub = pd.read_csv('/kaggle/input/ml-olympiad-predicting-earthquake-damage/sample_submission.csv')
df_test = pd.read_csv('/kaggle/input/ml-olympiad-predicting-earthquake-damage/test.csv')

In [ ]:
def get_variable_types(dataframe):
    """Split the column names of ``dataframe`` by dtype.

    Columns whose dtype equals ``'object'`` are reported as categorical;
    every other column is reported as continuous. Column order is preserved
    within each list.

    Returns
    -------
    tuple[list, list]
        ``(continuous_vars, categorical_vars)``.
    """
    continuous_vars, categorical_vars = [], []

    for name in dataframe.columns:
        # Pick the destination list first, then append once.
        bucket = categorical_vars if dataframe[name].dtype == 'object' else continuous_vars
        bucket.append(name)

    return continuous_vars, categorical_vars

# Classify the training columns by dtype (object -> categorical, rest -> continuous).
continuous_vars, categorical_vars = get_variable_types(df_train)
# damage_grade is the prediction target, so it is not treated as a feature.
# Raises ValueError if damage_grade was not classified as continuous.
continuous_vars.remove('damage_grade')

In [ ]:
# Build the modelling frames: drop the building_id identifier, remove exact
# duplicate rows from the training data only, then one-hot encode the
# categorical columns (dropping the first level of each).
train = pd.get_dummies(
    df_train.drop(columns=['building_id']).drop_duplicates(),
    columns=categorical_vars,
    drop_first=True,
)
test = pd.get_dummies(
    df_test.drop(columns=['building_id']),
    columns=categorical_vars,
    drop_first=True,
)

In [ ]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# X and y were never defined before this cell, so the split raised NameError.
# The target is damage_grade (the column written to the submission file);
# every other column of the encoded training frame is a feature.
X = train.drop(columns=['damage_grade'])
y = train['damage_grade']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [ ]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [ ]:
# Hyperparameters passed as keyword arguments to LGBMClassifier.
# NOTE(review): the high-precision values look like the output of an automated
# tuning run that is not part of this notebook -- confirm their provenance.
best_params = {
    "objective": "multiclass",          # Objective function for the model
    "metric": "multi_logloss",          # Evaluation metric
    "verbosity": -1,                    # Verbosity level (-1 for silent)
    "boosting_type": "gbdt",            # Gradient boosting type
    "random_state": 63,       # Random state for reproducibility
    "num_class": 3,                     # Number of classes in the dataset
    'learning_rate': 0.01767620281315413,  # Learning rate for gradient boosting
    'n_estimators': 283,                # Number of boosting iterations
    'lambda_l1': 0.009166846589741212,  # L1 regularization term
    'lambda_l2': 0.03161505886302539,   # L2 regularization term
    'max_depth': 8,                    # Maximum depth of the trees
    'colsample_bytree': 0.3325836967251265,  # Fraction of features to consider for each tree
    # NOTE(review): LightGBM documents that row subsampling (bagging) is only
    # enabled when subsample_freq / bagging_freq is non-zero; none is set here,
    # so this value presumably has no effect -- confirm.
    'subsample': 0.8207060003776292,    # Fraction of samples to consider for each boosting iteration
    'min_child_samples': 45             # Minimum number of data needed in a leaf
}

In [ ]:
# Train the LightGBM classifier on the scaled training split, then predict
# labels for the scaled hold-out split.
lgbm_classifier = LGBMClassifier(**best_params).fit(X_train, y_train)
y_pred = lgbm_classifier.predict(X_test)

In [ ]:
# Share of hold-out rows whose predicted label matches the true label
# (displayed as the cell's output).
accuracy_score(y_test, y_pred)

In [ ]:
# Per-class precision, recall and F1 on the hold-out split.
# confusion_matrix is imported here but not used in the visible cells.
from sklearn.metrics import confusion_matrix, classification_report
print(classification_report(y_test, y_pred))

In [ ]:
# The classifier was fitted on features standardized by `sc`, so the
# competition test set must receive the same column layout and scaling;
# predicting on the raw frame feeds the model values on a different scale.
# One-hot encoding was done separately for train and test, so first align the
# test columns to the training feature columns (all encoded training columns
# except the damage_grade target): dummy columns missing from test are filled
# with 0 and columns unseen during training are dropped.
feature_columns = train.columns.drop('damage_grade')
test_features = test.reindex(columns=feature_columns, fill_value=0)
predictions = lgbm_classifier.predict(sc.transform(test_features))

In [ ]:
# Use the sample submission as a template, overwrite its damage_grade column
# with the model's predictions, and write the result to submission.csv.
# Assumes the rows of test.csv are in the same order as the sample submission -- TODO confirm.
submission = pd.read_csv("/kaggle/input/ml-olympiad-predicting-earthquake-damage/sample_submission.csv")
submission["damage_grade"] = predictions
submission.to_csv("submission.csv", index=False)
# Show the first rows as a final sanity check.
submission.head()