In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the preprocessed training table from disk.
df = pd.read_csv("./dataset/processed-data/final_train.csv")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Report how many rows ended up in each partition.
for split_label, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_label}: {len(split_frame)} rows")

Train: 130056 rows
Test: 32514 rows


In [2]:
# Column the models are trained to predict.
target_col = "LOG_RESALE_PRICE"

# Separate each partition into its feature matrix (X) and target vector (y).
# `drop` returns a new frame, so train_df / test_df themselves are left intact.
X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

# Sanity-check the resulting dimensions.
print(f"Train shapes: X={X_train.shape}, y={y_train.shape}")
print(f"Test shapes:  X={X_test.shape}, y={y_test.shape}")

Train shapes: X=(130056, 36), y=(130056,)
Test shapes:  X=(32514, 36), y=(32514,)


In [3]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def evaluate_log_model(model, X_test, y_test_log, name):
    """Print RMSE, MAE and R² for ``model`` after undoing the log transform.

    The model's predictions on ``X_test`` and the targets ``y_test_log`` are
    both passed through ``np.exp`` before the metrics are computed, so the
    reported errors are on the exponentiated scale rather than the log scale.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        Fitted estimator whose predictions are in log-space.
    X_test : features handed to ``model.predict``.
    y_test_log : log-space ground-truth targets aligned with ``X_test``.
    name : str
        Label shown in the report header.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # Map both predictions and ground truth back out of log-space.
    price_pred = np.exp(model.predict(X_test))
    price_true = np.exp(y_test_log)

    # Compute every metric up front so nothing is printed if one of them fails.
    mse_value = mean_squared_error(price_true, price_pred)
    mae_value = mean_absolute_error(price_true, price_pred)
    r2_value = r2_score(price_true, price_pred)
    rmse_value = np.sqrt(mse_value)

    report_lines = [
        f"\n==== {name} — Final Test Performance ====",
        f"RMSE: {rmse_value:,.0f}",
        f"MAE:  {mae_value:,.0f}",
        f"R²:   {r2_value:.4f}",
    ]
    print("\n".join(report_lines))

In [4]:
from sklearn.model_selection import GridSearchCV, KFold

def tune_and_train(model, param_grid, X_train, y_train_log, model_name, k=5):
    """Grid-search ``model`` with shuffled k-fold CV and return the best fitted estimator.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style regressor to tune.
    param_grid : dict or list of dict
        Hyperparameter grid forwarded unchanged to ``GridSearchCV``.
    X_train : training features passed to ``GridSearchCV.fit``.
    y_train_log : training targets; they are scored as given, so the reported
        CV RMSE is in the target's own (log) units.
    model_name : str
        Label used in the printed summary.
    k : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    estimator
        ``search.best_estimator_`` — the winning configuration, already
        refitted on all of ``X_train`` / ``y_train_log`` by ``GridSearchCV``.
    """
    # Fixed seed so the fold assignment is identical from run to run.
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # CV scoring in *log space*; conversion back to prices happens at evaluation time.
    search = GridSearchCV(
        model,
        param_grid,
        cv=kf,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
        # refit=True (the default, spelled out here) retrains the best
        # configuration on the full training set, so no manual second
        # .fit() call is needed afterwards.
        refit=True,
    )

    search.fit(X_train, y_train_log)

    print(f"---- {model_name} ----")
    print("Best params:", search.best_params_)
    # The scorer is negated RMSE, so flip the sign for display.
    print(f"CV RMSE (log-space): {-search.best_score_:.4f}")

    return search.best_estimator_

KNN

In [5]:
from sklearn.neighbors import KNeighborsRegressor

# Neighbourhood sizes to try during the grid search.
knn_params = dict(n_neighbors=[3, 5, 7, 10, 25, 50, 100])

# Tune with cross-validation on the training split.
best_knn = tune_and_train(
    model=KNeighborsRegressor(),
    param_grid=knn_params,
    X_train=X_train,
    y_train_log=y_train,
    model_name="KNN",
)

# Report hold-out performance for the tuned model.
evaluate_log_model(model=best_knn, X_test=X_test, y_test_log=y_test, name="KNN")

---- KNN ----
Best params: {'n_neighbors': 3}
CV RMSE (log-space): 0.0986

==== KNN — Final Test Performance ====
RMSE: 51,806
MAE:  35,803
R²:   0.9194


MLP

In [12]:
from sklearn.neural_network import MLPRegressor

# Grid: one fixed 3-layer architecture crossed with two initial learning
# rates and two iteration caps (4 combinations in total).
# NOTE(review): MLPRegressor is sensitive to feature scale — confirm that the
# columns in the processed CSV are already standardized.
mlp_params = dict(
    hidden_layer_sizes=[(256, 128, 64)],
    learning_rate_init=[0.001, 0.01],
    max_iter=[300, 500],
)

# Tune with cross-validation on the training split; the seed fixes weight init.
best_mlp = tune_and_train(
    model=MLPRegressor(random_state=42),
    param_grid=mlp_params,
    X_train=X_train,
    y_train_log=y_train,
    model_name="MLP",
)

# Report hold-out performance for the tuned model.
evaluate_log_model(model=best_mlp, X_test=X_test, y_test_log=y_test, name="MLP")

---- MLP ----
Best params: {'hidden_layer_sizes': (256, 128, 64), 'learning_rate_init': 0.01, 'max_iter': 300}
CV RMSE (log-space): 0.2621

==== MLP — Final Test Performance ====
RMSE: 120,575
MAE:  90,841
R²:   0.5636


Save models

In [None]:
from pathlib import Path

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
Path("models").mkdir(parents=True, exist_ok=True)

# knn
joblib.dump(best_knn, "models/knn_model.pkl")
# nn
joblib.dump(best_mlp, "models/mlp_model.pkl")