# 1. Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
)
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

import warnings

### Set up

In [None]:
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

pd.set_option('display.max_columns',None)
# pd.set_option('display.max_rows',None)

sns.set(style="whitegrid", palette="muted", font_scale=1.1)
pd.plotting.register_matplotlib_converters()
%matplotlib inline

# 2. Load Data

In [None]:
# Locations of the training and test CSV files (placeholders: fill in before running).
filepath, filepath_test = "", ""

# Load the training set first, then the test set.
df, df_test = (pd.read_csv(csv_path) for csv_path in (filepath, filepath_test))

# Preview the first rows of the training data.
df.head()

# 3. Quick Data Check

In [None]:
# Dimensions of the training data as (rows, columns).
print(df.shape)

# Column dtypes and non-null counts. DataFrame.info() prints its report
# itself and returns None, so it is called bare; wrapping it in print()
# would append a stray "None" line to the output.
df.info()

# Summary statistics, transposed so that each column becomes a row.
df.describe().T

# 4. Exploratory Data Analysis (EDA)

In [None]:
# Number of missing values in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Number of rows that duplicate an earlier row.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

In [None]:
# Correlation heatmap of the numeric columns.
# numeric_only=True is required here: the raw frame may still contain
# categorical/object columns (they are only encoded later), and on
# pandas >= 2.0 DataFrame.corr() raises on such columns instead of
# silently dropping them.
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm")
plt.title("Correlation Matrix (numeric columns)")
plt.show()

# Distribution of the target with a kernel density estimate overlaid.
sns.histplot(df["target"], kde=True)
plt.title("Target Distribution")
plt.show()

# 5. Data Preprocessing

In [None]:
# Impute missing values in numeric columns with that column's median.
# numeric_only=True is required: encoding happens after this step, so the
# frame may still hold object/categorical columns, and on pandas >= 2.0
# DataFrame.median() raises on those instead of skipping them.
# NOTE(review): the medians are computed on the full frame before the
# train/test split, so held-out rows influence the imputation values.
# Consider computing them on the training rows only.
df = df.fillna(df.median(numeric_only=True))

# One-hot encode categorical columns (if any exist); drop_first=True drops
# the first level of each encoded column.
df = pd.get_dummies(df, drop_first=True)

### Split

In [None]:
# Separate the target column from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Baseline Models

In [None]:
# Seed for every stochastic estimator so that Restart & Run All reproduces
# the same scores (same value as the seed used for the train/test split).
SEED = 42

# Baseline regressors, all with default hyperparameters apart from the seed
# and CatBoost's verbosity.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "RandomForest": RandomForestRegressor(random_state=SEED),
    "XGB": XGBRegressor(random_state=SEED),
    "CatBoost": CatBoostRegressor(verbose=0, random_seed=SEED),
    "LGBM": LGBMRegressor(random_state=SEED),
}

# Fit each model on the training split and score it on the held-out split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        # RMSE is taken as the square root of the MSE.
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "R2": r2_score(y_test, y_pred),
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)

# 7. Visualization of Results

In [None]:
# Draw one panel per metric instead of a single shared axis: MAE and RMSE
# are errors in the target's units (lower is better) while R2 is a score
# (higher is better), so a common y-axis labelled "Error Metrics" is
# misleading and can squash one of the series.
metric_names = list(results_df.columns)
fig, axes = plt.subplots(
    1,
    len(metric_names),
    figsize=(6 * len(metric_names), 5),
    squeeze=False,  # always get a 2-D array of axes, even for one metric
)
for ax, metric in zip(axes.ravel(), metric_names):
    results_df[metric].plot(kind="bar", ax=ax)
    ax.set_title(metric)
    ax.set_xlabel("Model")
    ax.set_ylabel(metric)

fig.suptitle("Model Comparison")
fig.tight_layout()
plt.show()