In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load your data
df = pd.read_csv("final_train.csv")
final_test_df=pd.read_csv("final_test.csv")

#Split into train + temp (which will later be split into val/test)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)

#Split temp into validation and test
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Check sizes
print(f"Train: {len(train_df)} rows")
print(f"Validation: {len(val_df)} rows")
print(f"Test: {len(test_df)} rows")

# train_df.to_csv("dataset/data_split/train.csv", index=False)
# val_df.to_csv("dataset/data_split/validate.csv", index=False)
# test_df.to_csv("dataset/data_split/test.csv", index=False)

# Define target column name
target_col = "LOG_RESALE_PRICE"

# Split into X (features) and y (target)
X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]

X_val = val_df.drop(columns=[target_col])
y_val = val_df[target_col]

X_test = test_df.drop(columns=[target_col])
y_test = test_df[target_col]

X_full_train=df.drop(columns=[target_col])
y_full_train=df[target_col]

print(f"Train shapes: X={X_train.shape}, y={y_train.shape}")
print(f"Val shapes:   X={X_val.shape}, y={y_val.shape}")
print(f"Test shapes:  X={X_test.shape}, y={y_test.shape}")

Train: 97542 rows
Validation: 32514 rows
Test: 32514 rows
Train shapes: X=(97542, 36), y=(97542,)
Val shapes:   X=(32514, 36), y=(32514,)
Test shapes:  X=(32514, 36), y=(32514,)


In [22]:
#"DIST_PRIM","DIST_HAWKER","MONTH","COUNT_SEC"
#low_corr=["DIST_PRIM","DIST_HAWKER","MONTH","COUNT_SEC"]
#low_corr=["DIST_PRIM","COUNT_SEC"]
low_corr=["DIST_PRIM"]
X_train_compare=X_train.drop(columns=low_corr)
X_val_compare=X_val.drop(columns=low_corr)
X_col_compare=X_test.drop(columns=low_corr)

Linear model x  
Lasso model x  
ENet model  
SVR model  
Random Forest model  
XGB model  
LightGBM model x  
Stacking model

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
results = []
def evaluate_model(model, X_val, y_val, X_test, y_test, name="Model"):
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)
    rmse_val = np.sqrt(mean_squared_error(y_val, val_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, test_pred))
    mae_test = mean_absolute_error(y_test, test_pred)
    r2_test = r2_score(y_test, test_pred)
    print(f"\n{name} Evaluation:")
    print(f"Validation RMSE: {rmse_val:.4f}")
    print(f"Test RMSE:       {rmse_test:.4f}")
    print(f"Test MAE:        {mae_test:.4f}")
    print(f"Test R²:         {r2_test:.3f}")
    #return {"model": name, "val_rmse": rmse_val, "test_rmse": rmse_test, "test_mae": mae_test, "test_r2": r2_test}

In [23]:
# Linear model
m_linear = LinearRegression()
m_linear.fit(X_train, y_train)
print(evaluate_model(m_linear, X_val, y_val, X_test, y_test, "Linear model"))

m_linear.fit(X_train_compare, y_train)
print(evaluate_model(m_linear, X_val_compare, y_val, X_col_compare, y_test, "Linear model_drop"))


Linear model Evaluation:
Validation RMSE: 0.1104
Test RMSE:       0.1112
Test MAE:        0.0862
Test R²:         0.896
None

Linear model_drop Evaluation:
Validation RMSE: 0.1104
Test RMSE:       0.1112
Test MAE:        0.0862
Test R²:         0.896
None


In [8]:
# adaptive lasso
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

alphas = np.logspace(-4, 2, 50)  # 1e-4 到 1e2 的对数网格
m = Pipeline([
    ("scaler", StandardScaler()),
    ("lasso", LassoCV(alphas=alphas, cv=5, max_iter=10000, tol=1e-4, random_state=0))
])
m.fit(X_train, y_train)

best_alpha = m.named_steps["lasso"].alpha_
coef = m.named_steps["lasso"].coef_
print(f"Best alpha: {best_alpha}")
evaluate_model(m, X_val, y_val, X_test, y_test, "Lasso model")

Best alpha: 0.0001

Lasso model Evaluation:
Validation RMSE: 0.1104
Test RMSE:       0.1112
Test MAE:        0.0862
Test R²:         0.896


In [None]:
# adaptive enet
from sklearn.linear_model import ElasticNetCV

alphas = np.logspace(-4, 2, 60)             # 1e-4 ~ 1e2
l1s    = np.concatenate([np.linspace(0.05, 0.95, 10), [1.0]])  # 含纯 Lasso

enet_cv = Pipeline([
    ("scaler", StandardScaler()),
    ("enet", ElasticNetCV(
        alphas=alphas,
        l1_ratio=l1s,
        cv=5,
        max_iter=10000,
        tol=1e-4,
        selection="cyclic",   # 可换 'random' 试试
        random_state=0
    ))
])

enet_cv.fit(X_train, y_train)

best_alpha   = enet_cv.named_steps["enet"].alpha_
best_l1ratio = enet_cv.named_steps["enet"].l1_ratio_
best_coef    = enet_cv.named_steps["enet"].coef_

# 接到你的评测框架
results.append(evaluate_model(enet_cv, X_val, y_val, X_test, y_test, "ENetCV(best)"))



ENetCV(best) Evaluation:
Validation RMSE: 0.1104
Test RMSE:       0.1112
Test MAE:        0.0862
Test R²:         0.896


In [10]:
# 3.5 Random Forest model
m_rf = RandomForestRegressor(random_state=42)
m_rf.fit(X_train, y_train)
print(evaluate_model(m_rf, X_val, y_val, X_test, y_test, "Random Forest model"))

m_rf.fit(X_train_compare, y_train)
print(evaluate_model(m_rf, X_val_compare, y_val, X_col_compare, y_test, "Random Forest model_drop"))


Random Forest model Evaluation:
Validation RMSE: 0.0572
Test RMSE:       0.0578
Test MAE:        0.0424
Test R²:         0.972
None

Random Forest model_drop Evaluation:
Validation RMSE: 0.0571
Test RMSE:       0.0576
Test MAE:        0.0423
Test R²:         0.972
None


In [5]:
# 3.6 XGB model
from xgboost import XGBRegressor
m_xgb = None
m_xgb = XGBRegressor(
    random_state=42,
    n_estimators=100, max_depth=6, learning_rate=0.3,
    subsample=1.0, colsample_bytree=1.0, reg_lambda=1.0,
    tree_method="hist"
)
# m_xgb.fit(X_train, y_train)
# print(evaluate_model(m_xgb, X_val, y_val, X_test, y_test, "XGB model"))

# m_xgb.fit(X_train_compare, y_train)
# print(evaluate_model(m_xgb, X_val_compare, y_val, X_col_compare, y_test, "XGB model_drop"))

m_xgb.fit(X_full_train, y_full_train)
y_pred_full = m_xgb.predict(final_test_df)
y_pred_full_real = np.expm1(y_pred_full)
df_pred = pd.DataFrame({
    "Id": np.arange(len(y_pred_full_real)),
    "Predicted": y_pred_full_real
})
df_pred.to_csv("xgb_full_predictions.csv", index=False)

In [16]:
# 3.7 LightGBM model
from lightgbm import LGBMRegressor
m_lgbm = None
m_lgbm = LGBMRegressor(random_state=42)
m_lgbm.fit(X_train, y_train)
print(evaluate_model(m_lgbm, X_val, y_val, X_test, y_test, "LightGBM model"))

m_lgbm.fit(X_train_compare, y_train)
print(evaluate_model(m_lgbm, X_val_compare, y_val, X_col_compare, y_test, "LightGBM model_drop"))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000868 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1895
[LightGBM] [Info] Number of data points in the train set: 97542, number of used features: 36
[LightGBM] [Info] Start training from score 13.100217

LightGBM model Evaluation:
Validation RMSE: 0.0653
Test RMSE:       0.0657
Test MAE:        0.0498
Test R²:         0.964
None
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001062 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1363
[LightGBM] [Info] Number of data points in the train set: 97542, number of used features: 32
[LightGBM] [Info] Start training from score 13.100217

LightGBM model_drop Evaluation:
Validation RMSE: 0.0679
T