**IMPORTING NECESSARY LIBRARIES**

In [None]:
import numpy as np
import pandas as pd
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

# **EXPLORING DATA** (columns, stats, correlations)

In [None]:
# Read the train and test CSVs from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv')

# Preview the first rows of the training frame.
train.head()


In [None]:
def check(data):
    """Summarise every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it may be empty.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns ``column_name``,
        ``type_of_data`` (the dtype), ``unique_vals`` (number of distinct
        non-null values) and ``null`` (number of missing values).
    """
    summary_columns = ['column_name', 'type_of_data', 'unique_vals', 'null']
    rows = [
        [name, data[name].dtypes, data[name].nunique(), data[name].isnull().sum()]
        for name in data.columns
    ]
    # Passing the header to the constructor keeps the four summary columns even
    # when `data` has no columns at all (setting `.columns` on an empty frame
    # raises a length-mismatch ValueError).
    return pd.DataFrame(rows, columns=summary_columns)

# Show the per-column summary (dtype, distinct count, null count) for the training frame.
check(train)

In [None]:
# Descriptive statistics, transposed so that each column of `train` becomes a row.
train.describe().T

In [None]:
def plot(data, col):
    """Draw a box plot stacked above a histogram (with KDE) for one column.

    Parameters
    ----------
    data : frame handed to seaborn as ``data``.
    col : name of the column plotted on the shared x-axis.
    """
    # The thin top strip holds the box plot, the tall bottom panel the histogram.
    layout = {'height_ratios': (0.15, 0.85)}
    fig, axes = plt.subplots(2, sharex=True, gridspec_kw=layout, figsize=(12, 6))
    box_axis, hist_axis = axes

    sns.boxplot(data=data, x=col, ax=box_axis, showmeans=True)
    sns.histplot(data=data, x=col, kde=True, ax=hist_axis)

    plt.show()

In [None]:
# Print each column name, then draw its box plot / histogram pair.
for column_name in train.columns:
    print(column_name)
    plot(train, column_name)

In [None]:
# Checking for correlations between features.
# numeric_only=True keeps the cell working if a non-numeric column is present
# (DataFrame.corr raises on such columns in pandas >= 2.0).
plt.figure(figsize=(20,10))
sns.heatmap(train.corr(numeric_only=True), annot=True, fmt='.1f', cmap='viridis')
plt.title("Feature Correlation Matrix")
plt.show()  # also keeps the Axes repr out of the cell output

# **LINEAR REGRESSION MODEL** 

In [None]:
from sklearn import linear_model
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

Setting Target


In [None]:
# Separate the predictors from the target.
# `id` is dropped together with the target so that neither model treats it as a
# predictor; errors='ignore' makes that a no-op when the frame has no such column.
# NOTE(review): assumes `id` is a plain row identifier, as is usual for Kaggle
# playground files — confirm against the data.
train_features = train.drop(columns=['FloodProbability', 'id'], errors='ignore')
train_target = train['FloodProbability']

In [None]:
# Training Model

In [None]:
# Fit an ordinary-least-squares model on all training rows.
# The intercept column goes into a separate frame so that `train_features`
# keeps only the real predictors for the cells that reuse it further down.
ols_features = sm.add_constant(train_features)

ols_model = sm.OLS(train_target, ols_features)

ols_result = ols_model.fit()

print(ols_result.summary())

# **LGBM MODEL**

Importing Libraries

In [None]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from lightgbm import plot_importance
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import plot_tree

Splitting the data

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X, y = train_features, train_target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Setting Parameters

In [None]:
# Hyper-parameters handed to the LightGBM regressor.
# NOTE(review): per the LightGBM docs `subsample` only takes effect when
# `subsample_freq` is > 0 — confirm whether row subsampling is intended here.
lgbm_params = dict(
    boosting_type='gbdt',
    n_estimators=1500,
    learning_rate=0.012,
    num_leaves=250,
    subsample_for_bin=165700,
    min_child_samples=114,
    reg_alpha=2.075e-06,
    reg_lambda=3.839e-07,
    colsample_bytree=0.9634,
    subsample=0.9592,
    max_depth=10,
    random_state=0,
    verbosity=-1,
)

# Build the regressor from the settings above and train it on the training split.
lgbm_model = LGBMRegressor(**lgbm_params)
lgbm_model.fit(X_train, y_train)

Model Error (RMSE and R² on the hold-out split)

In [None]:
# Hold-out predictions for both models. The plotting cells further down read
# `y_pred_lgbm` and `y_pred_lr`, so both names are defined here.
y_pred_lgbm = lgbm_model.predict(X_test)
y_pred = y_pred_lgbm  # original name kept as an alias

# Rebuild the OLS design matrix by column name: `const` is the intercept column
# that sm.add_constant creates, and exog_names gives the column order used when
# the model was fitted.
# NOTE: the OLS model was fitted on every training row, so these rows are
# in-sample for it; the comparison with LGBM flatters the linear model.
X_test_ols = X_test.assign(const=1.0)[ols_result.model.exog_names]
y_pred_lr = ols_result.predict(X_test_ols)

# Metrics for the LGBM model on the hold-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgbm))
r2 = r2_score(y_test, y_pred_lgbm)

print(f"RMSE: {rmse: .2f}")
print(f"R2 Score: {r2: .4f}")


Feature Importance Bar Chart

In [None]:
# Pair every feature name with the importance reported by the fitted LGBM model,
# ordered from most to least important.
feature_importances = pd.DataFrame(
    {"Feature": X_test.columns, "Importance": lgbm_model.feature_importances_}
).sort_values(by="Importance", ascending=False)

# Horizontal bar chart of the sorted importances.
plt.figure(figsize=(20, 10))
sns.barplot(data=feature_importances, x="Importance", y="Feature", palette='colorblind')
plt.title("LGBM Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

# **Error Visualization**

In [None]:
# Actual-vs-predicted scatter for both models, side by side.
# The whole figure is built in this one cell: Jupyter's inline backend renders
# and closes a figure when its cell ends, so panels added from later cells
# would land on a new figure instead of this one.

# LGBM predictions are `y_pred` from the evaluation cell.
y_pred_lgbm = y_pred
# OLS predictions: `const` is the intercept column added by sm.add_constant and
# exog_names restores the column order used when the model was fitted.
# NOTE: the OLS model was fitted on every training row, so these are in-sample.
y_pred_lr = ols_result.predict(X_test.assign(const=1.0)[ols_result.model.exog_names])

error_panels = [
    # LGBM Error Visualization
    ("LGBM Model - Actual vs Predicted", y_pred_lgbm, "blue", "LGBM Predictions"),
    # Linear Regression Visualization
    ("Linear Regression - Actual vs Predicted", y_pred_lr, "green", "Linear Regression Predictions"),
]

# End points of the diagonal on which prediction equals actual value.
fit_range = [y_test.min(), y_test.max()]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (panel_title, predictions, colour, series_label) in zip(axes, error_panels):
    ax.scatter(y_test, predictions, alpha=0.7, color=colour, label=series_label)
    ax.plot(fit_range, fit_range, color="red", linestyle="--", label="Perfect Fit")
    ax.set_xlabel("Actual Flood Probability")
    ax.set_ylabel("Predicted Flood Probability")
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True)

fig.tight_layout()
plt.show()

# **Residual Visualization**

In [None]:
# Residual (actual minus predicted) plots for both models, side by side.
# The whole figure is built in this one cell: Jupyter's inline backend renders
# and closes a figure when its cell ends, so panels added from later cells
# would land on a new figure instead of this one.

# LGBM predictions are `y_pred` from the evaluation cell.
y_pred_lgbm = y_pred
# OLS predictions: `const` is the intercept column added by sm.add_constant and
# exog_names restores the column order used when the model was fitted.
# NOTE: the OLS model was fitted on every training row, so these are in-sample.
y_pred_lr = ols_result.predict(X_test.assign(const=1.0)[ols_result.model.exog_names])

residuals_lgbm = y_test - y_pred_lgbm
residuals_lr = y_test - y_pred_lr

residual_panels = [
    # LGBM Residual Visualization
    ("LGBM Model - Residual Plot", y_pred_lgbm, residuals_lgbm, "purple"),
    # Linear Regression Residual Visualization
    ("Linear Regression - Residual Plot", y_pred_lr, residuals_lr, "orange"),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (panel_title, predictions, residuals, colour) in zip(axes, residual_panels):
    # Plain arrays keep the x/y pairing positional, whatever index the inputs carry.
    sns.scatterplot(x=np.asarray(predictions), y=np.asarray(residuals),
                    alpha=0.7, color=colour, ax=ax)
    ax.axhline(0, linestyle="--", color="red")  # zero-error reference line
    ax.set_xlabel("Predicted Flood Probability")
    ax.set_ylabel("Residuals")
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()