In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# 1. Load data
# Read the three competition CSVs from the working directory, in the same
# order as before: training set, test set, submission template.
train, test, sample_submission = map(
    pd.read_csv,
    ("train.csv", "test.csv", "sample_submission.csv"),
)

# 2. Features and target
# "target" is mandatory: a missing column still raises KeyError here.
# "id" is optional and is dropped only when present (errors="ignore"),
# which replaces the duplicated if/else branches.
y = train["target"]
X = train.drop(columns=["target"]).drop(columns=["id"], errors="ignore")

# Build the test matrix by *name*, in the training column order, instead of
# taking "whatever is left in test". A reordered or extended test.csv would
# otherwise be fed to the model positionally (silently wrong predictions on
# scikit-learn versions that do not validate feature names).
missing_features = [col for col in X.columns if col not in test.columns]
if missing_features:
    # Fail early with an explicit message rather than deep inside predict().
    raise ValueError(
        "test.csv is missing feature columns: {}".format(missing_features)
    )
X_test = test[X.columns]

# 3. Train-validation split
# Hold out 20% of the rows for validation; random_state pins the shuffle so
# the split is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts

# 4. RandomForest model
# Hyperparameters are collected in one mapping so they are easy to spot and
# tweak; the values are unchanged.
rf_params = {
    "n_estimators": 500,
    "max_depth": 15,
    "random_state": 42,
    "n_jobs": -1,
}

# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# 5. Validation performance
# RMSE is computed by hand as sqrt(MSE); per the original note this is done
# for compatibility with old sklearn releases.
valid_pred = rf_model.predict(X_valid)
mse = mean_squared_error(y_true=y_valid, y_pred=valid_pred)
rmse = np.sqrt(mse)
print("✅ Validation RMSE:", rmse)

# 6. Test predictions
final_pred = rf_model.predict(X_test)

# 7. Submission file
# assign() returns a new frame, so the sample_submission template itself is
# left unmodified. Predictions are attached row-by-row in file order.
submission = sample_submission.assign(target=final_pred)
submission.to_csv("submission.csv", index=False)

print("🎉 Submission file created: submission.csv")
print(submission.head())


✅ Validation RMSE: 20.605999743143325
🎉 Submission file created: submission.csv
   id      target
0   1  181.998838
1   2  188.606316
2   3  171.977723
3   4  174.825734
4   5  176.515307
