In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [2]:
train_df = pd.read_csv('train_simp_fixedOrdinal_FE_noscale(15k).csv')

In [3]:
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

In [4]:
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

In [5]:
# Initialize the XGBRegressor
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Set up RandomizedSearchCV using negative RMSE as the scoring metric
random_search = RandomizedSearchCV(estimator=xgb_reg,
                                   param_distributions=param_grid,
                                   n_iter=50,
                                   cv=3,
                                   scoring='neg_root_mean_squared_error',
                                   random_state=42,
                                   n_jobs=-1,
                                   verbose=1)

In [6]:
y_train = train_df['SalePrice']
train_df.drop(['SalePrice' , 'Id'], axis=1, inplace=True)
X_train = train_df

In [7]:
y_val = val_data['SalePrice']
val_data.drop(['SalePrice' , 'Id'], axis=1, inplace=True)
X_val = val_data

In [8]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [9]:
# Get the best parameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

In [10]:
# Evaluate on the validation set
val_predictions = best_model.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, val_predictions))
print("Best Parameters:", best_params)
print("Validation RMSE:", rmse_val)

Best Parameters: {'subsample': 0.6, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
Validation RMSE: 14683.775536284937


In [11]:
# Combine training and validation sets for final training
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

# Train the final XGBRegressor using the best parameters
final_model = xgb.XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
final_model.fit(X_full, y_full)

# Load and preprocess the test data
test_df = pd.read_csv('test_simp_fixedOrdinal_FE_noscale(15k).csv')
# Save the Ids for the submission file
test_ids = test_df['Id'].copy()

In [12]:
X_test = test_df
# Align test set features with training features
X_test = X_test.reindex(columns=X_full.columns, fill_value=0)

In [13]:
# Generate predictions for the test set
test_predictions = final_model.predict(X_test)

In [None]:
# Create a submission dataframe (assumes sample_submission.csv expects 'Id' and 'SalePrice')
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})
submission.to_csv('submission_XGBoost(2).csv', index=False)

print("Submission file 'submission_XGBoost.csv(2)' created successfully!")

Submission file 'submission_XGBoost.csv(2)' created successfully!


: 