train,val,test split

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your data
df = pd.read_csv("final_train.csv")

#Split into train + temp (which will later be split into val/test)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)

#Split temp into validation and test
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Check sizes
print(f"Train: {len(train_df)} rows")
print(f"Validation: {len(val_df)} rows")
print(f"Test: {len(test_df)} rows")

Train: 97542 rows
Validation: 32514 rows
Test: 32514 rows


In [5]:
train_df.to_csv("dataset/data_split/train.csv", index=False)
val_df.to_csv("dataset/data_split/validate.csv", index=False)
test_df.to_csv("dataset/data_split/test.csv", index=False)

In [6]:
# Define target column name
target_col = "LOG_RESALE_PRICE"

# Split into X (features) and y (target)
X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]

X_val = val_df.drop(columns=[target_col])
y_val = val_df[target_col]

X_test = test_df.drop(columns=[target_col])
y_test = test_df[target_col]

print(f"Train shapes: X={X_train.shape}, y={y_train.shape}")
print(f"Val shapes:   X={X_val.shape}, y={y_val.shape}")
print(f"Test shapes:  X={X_test.shape}, y={y_test.shape}")

Train shapes: X=(97542, 36), y=(97542,)
Val shapes:   X=(32514, 36), y=(32514,)
Test shapes:  X=(32514, 36), y=(32514,)


model training

In [15]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

def evaluate_model(model, X_val, y_val, X_test, y_test, name="Model"):
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)
    print(f"\n{name} Evaluation:")
    print(f"Validation RMSE: {np.sqrt(mean_squared_error(y_val, val_pred)):.4f}")
    print(f"Test RMSE:       {np.sqrt(mean_squared_error(y_test, test_pred)):.4f}")
    print(f"Test MAE:        {mean_absolute_error(y_test, test_pred):.4f}")
    print(f"Test R²:         {r2_score(y_test, test_pred):.3f}")

Linear models

In [16]:
# --- Linear Regression ---
lr = LinearRegression()
lr.fit(X_train, y_train)
evaluate_model(lr, X_val, y_val, X_test, y_test, "Linear Regression")


Linear Regression Evaluation:
Validation RMSE: 0.1104
Test RMSE:       0.1112
Test MAE:        0.0862
Test R²:         0.896


In [17]:
# --- Ridge Regression (with tuning) ---
ridge_params = {'alpha': [0.1, 1, 10, 50]}
ridge = GridSearchCV(Ridge(), ridge_params, cv=3, scoring='neg_root_mean_squared_error')
ridge.fit(X_train, y_train)
evaluate_model(ridge.best_estimator_, X_val, y_val, X_test, y_test, "Ridge Regression")


Ridge Regression Evaluation:
Validation RMSE: 0.1104
Test RMSE:       0.1112
Test MAE:        0.0862
Test R²:         0.896


In [18]:
# --- Lasso Regression (with tuning) ---
lasso_params = {'alpha': [0.001, 0.01, 0.1, 1]}
lasso = GridSearchCV(Lasso(max_iter=10000), lasso_params, cv=3, scoring='neg_root_mean_squared_error')
lasso.fit(X_train, y_train)
evaluate_model(lasso.best_estimator_, X_val, y_val, X_test, y_test, "Lasso Regression")


Lasso Regression Evaluation:
Validation RMSE: 0.1125
Test RMSE:       0.1139
Test MAE:        0.0879
Test R²:         0.891


RandomForest regressor

In [29]:
# --- Random Forest (Bagging) ---
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}
rf = GridSearchCV(RandomForestRegressor(random_state=42, n_jobs=-1),
                  rf_params, cv=3, scoring='neg_root_mean_squared_error')
rf.fit(X_train, y_train)
evaluate_model(rf.best_estimator_, X_val, y_val, X_test, y_test, "Random Forest")


Random Forest Evaluation:
Validation RMSE: 0.0570
Test RMSE:       0.0576
Test MAE:        0.0423
Test R²:         0.972


XGBoost regressor

In [33]:
# --- XGBoost ---
xgb_params = {
    'n_estimators': [300, 500],
    'max_depth': [6, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0]
}
xgb = GridSearchCV(XGBRegressor(random_state=42, n_jobs=-1),
                   xgb_params, cv=3, scoring='neg_root_mean_squared_error', verbose=1)
xgb.fit(X_train, y_train)
evaluate_model(xgb.best_estimator_, X_val, y_val, X_test, y_test, "XGBoost")

Fitting 3 folds for each of 16 candidates, totalling 48 fits

XGBoost Evaluation:
Validation RMSE: 0.0501
Test RMSE:       0.0504
Test MAE:        0.0374
Test R²:         0.979


In [35]:
# --- LightGBM ---
lgbm_params = {
    'n_estimators': [300, 500],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 63],
    'max_depth': [-1, 10]
}
lgbm = GridSearchCV(LGBMRegressor(random_state=42),
                    lgbm_params, cv=3, scoring='neg_root_mean_squared_error', verbose=1)
lgbm.fit(X_train, y_train)
evaluate_model(lgbm.best_estimator_, X_val, y_val, X_test, y_test, "LightGBM")

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005763 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1861
[LightGBM] [Info] Number of data points in the train set: 65028, number of used features: 36
[LightGBM] [Info] Start training from score 13.100498
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1869
[LightGBM] [Info] Number of data points in the train set: 65028, number of used features: 36
[LightGBM] [Info] Start training from score 13.099826
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough

In [37]:
# --- CatBoost ---
cat = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=8,
    verbose=100,
    random_state=42
)
cat.fit(X_train, y_train, eval_set=(X_val, y_val))
evaluate_model(cat, X_val, y_val, X_test, y_test, "CatBoost")

0:	learn: 0.3310822	test: 0.3321869	best: 0.3321869 (0)	total: 77.7ms	remaining: 38.8s
100:	learn: 0.0809688	test: 0.0820034	best: 0.0820034 (100)	total: 836ms	remaining: 3.3s
200:	learn: 0.0670162	test: 0.0682304	best: 0.0682304 (200)	total: 1.62s	remaining: 2.41s
300:	learn: 0.0602926	test: 0.0619005	best: 0.0619005 (300)	total: 2.27s	remaining: 1.5s
400:	learn: 0.0565324	test: 0.0584757	best: 0.0584757 (400)	total: 2.89s	remaining: 713ms
499:	learn: 0.0540971	test: 0.0563712	best: 0.0563712 (499)	total: 3.49s	remaining: 0us

bestTest = 0.05637116981
bestIteration = 499


CatBoost Evaluation:
Validation RMSE: 0.0564
Test RMSE:       0.0567
Test MAE:        0.0426
Test R²:         0.973


KNN

In [39]:
from sklearn.neighbors import KNeighborsRegressor

# Simple Grid Search
knn_params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1=Manhattan, 2=Euclidean
}

knn = GridSearchCV(KNeighborsRegressor(), knn_params, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
knn.fit(X_train, y_train)

print("Best KNN params:", knn.best_params_)

evaluate_model(knn.best_estimator_, X_val, y_val, X_test, y_test, "KNN Regressor")

Best KNN params: {'n_neighbors': 7, 'p': 1, 'weights': 'distance'}

KNN Regressor Evaluation:
Validation RMSE: 0.0865
Test RMSE:       0.0866
Test MAE:        0.0645
Test R²:         0.937


DNN

In [26]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Build DNN
dnn = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # output layer for regression
])

dnn.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mean_squared_error'])

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train
history = dnn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=256,
    callbacks=[early_stop],
    verbose=2
)

# Evaluate
evaluate_model(dnn, X_val, y_val, X_test, y_test, "DNN")

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-10-25 16:22:50.201287: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


382/382 - 2s - 4ms/step - loss: 5.5348 - mean_squared_error: 230.3778 - val_loss: 0.2986 - val_mean_squared_error: 0.1378
Epoch 2/100
382/382 - 1s - 2ms/step - loss: 0.7690 - mean_squared_error: 1.0447 - val_loss: 1.1978 - val_mean_squared_error: 1.5434
Epoch 3/100
382/382 - 1s - 2ms/step - loss: 0.7144 - mean_squared_error: 0.8630 - val_loss: 0.6610 - val_mean_squared_error: 0.5319
Epoch 4/100
382/382 - 1s - 2ms/step - loss: 0.6294 - mean_squared_error: 0.6701 - val_loss: 1.5600 - val_mean_squared_error: 2.5228
Epoch 5/100
382/382 - 1s - 3ms/step - loss: 0.5491 - mean_squared_error: 0.5043 - val_loss: 0.9025 - val_mean_squared_error: 0.8882
Epoch 6/100
382/382 - 1s - 2ms/step - loss: 0.5162 - mean_squared_error: 0.4468 - val_loss: 1.1793 - val_mean_squared_error: 1.4490
Epoch 7/100
382/382 - 1s - 2ms/step - loss: 0.4865 - mean_squared_error: 0.3958 - val_loss: 1.9954 - val_mean_squared_error: 4.0308
Epoch 8/100
382/382 - 1s - 2ms/step - loss: 0.4312 - mean_squared_error: 0.3131 - val_

Save models

In [None]:
import joblib

# Save a model
# linear
joblib.dump(lr, "models/lr_model.pkl")
joblib.dump(ridge, "models/ridge_model.pkl")
joblib.dump(lasso, "models/lasso_model.pkl")
# random forest
joblib.dump(rf, "models/rf_model.pkl")
# boosting
joblib.dump(xgb, "models/xgb_model.pkl")
joblib.dump(lgbm, "models/lgbm_model.pkl")
joblib.dump(cat, "models/cat_model.pkl")
#knn
joblib.dump(knn, "models/knn_model.pkl")
#nn
joblib.dump(dnn, "models/dnn_model.pkl")

In [None]:
# Load it later
loaded_model = joblib.load("rf_model.pkl")
y_pred = loaded_model.predict(X_test)