In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the training data
train_df = pd.read_csv('train.csv')

In [3]:
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

In [4]:
def preprocess_data(df, is_train=True):
    """
    Preprocess the dataframe:
      - Drop the 'Id' column if present.
      - For training data, separate out the target variable ('SalePrice').
      - Create missing indicators and fill missing values with meaningful values.
      - For specific categorical features where missing implies absence, fill with 'None'.
      - For specific numeric features where missing implies absence, fill with 0.
      - For other numeric features, fill missing with median.
      - For remaining categorical features, fill missing with mode.
      - One-hot encode categorical features.
    """
    df = df.copy()

    # Drop the 'Id' column if it exists
    if 'Id' in df.columns:
        df.drop('Id', axis=1, inplace=True)

    # If training data, separate out the target variable
    if is_train:
        y = df['SalePrice']
        df.drop('SalePrice', axis=1, inplace=True)

    # Dictionaries for known columns where missing implies absence
    missing_cat_dict = {
        'Alley': 'None',
        'PoolQC': 'None',
        'Fence': 'None',
        'MiscFeature': 'None',
        'GarageType': 'None',
        'GarageFinish': 'None',
        'GarageQual': 'None',
        'GarageCond': 'None',
        'BsmtQual': 'None',
        'BsmtCond': 'None',
        'BsmtExposure': 'None',
        'BsmtFinType1': 'None',
        'BsmtFinType2': 'None'
    }

    missing_num_dict = {
        'GarageYrBlt': 0,
        'MasVnrArea': 0,
        'BsmtFinSF1': 0,
        'BsmtFinSF2': 0,
        'BsmtUnfSF': 0,
        'TotalBsmtSF': 0,
        'BsmtFullBath': 0,
        'BsmtHalfBath': 0
    }

    # Process categorical columns with known missing value meaning
    for col, fill_value in missing_cat_dict.items():
        if col in df.columns:
            # Create a missing indicator
            df[col + '_missing'] = df[col].isnull().astype(int)
            df[col].fillna(fill_value, inplace=True)

    # Process numeric columns with known missing value meaning
    for col, fill_value in missing_num_dict.items():
        if col in df.columns:
            df[col + '_missing'] = df[col].isnull().astype(int)
            df[col].fillna(fill_value, inplace=True)

    # For remaining columns, fill missing values:
    # - For numeric columns, use the median.
    # - For categorical columns, use the mode.
    for col in df.columns:
        # Skip if already processed in our dictionaries
        if col in missing_cat_dict or col in missing_num_dict or col.endswith('_missing'):
            continue

        if df[col].dtype in ['float64', 'int64']:
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna(df[col].mode()[0], inplace=True)

    # One-hot encode categorical variables (including our new indicator columns will remain numeric)
    df = pd.get_dummies(df, drop_first=True)

    if is_train:
        return df, y
    else:
        return df

In [5]:
# Preprocess training and validation data separately
X_train, y_train = preprocess_data(train_data, is_train=True)
X_val, y_val = preprocess_data(val_data, is_train=True)

# Ensure training and validation sets have the same features (columns)
X_train, X_val = X_train.align(X_val, join='left', axis=1, fill_value=0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(fill_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(fill_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beha

In [6]:
param_grid = {
    'n_estimators': [300, 400, 500, 600],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [7]:
# Initialize the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

In [8]:
random_search = RandomizedSearchCV(estimator=rf,
                                   param_distributions=param_grid,
                                   n_iter=50,
                                   cv=3,
                                   scoring='neg_root_mean_squared_error',
                                   random_state=42,
                                   n_jobs=-1,
                                   verbose=1)

In [None]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [None]:
# Get the best parameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

In [None]:
# Evaluate on the validation set
val_predictions = best_model.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, val_predictions))
print("Best Parameters:", best_params)
print("Validation RMSE:", rmse_val)

In [None]:
# Combine training and validation sets for final training
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

In [None]:
# Train a final RandomForestRegressor using the best parameters
final_model = RandomForestRegressor(**best_params, random_state=42)
final_model.fit(X_full, y_full)

In [None]:
# Load and preprocess the test data
test_df = pd.read_csv('test.csv')

In [None]:
# Save the Ids for the submission file
test_ids = test_df['Id'].copy()

In [None]:
X_test = preprocess_data(test_df, is_train=False)
# Align test set features with training features
X_test = X_test.reindex(columns=X_full.columns, fill_value=0)

In [None]:
# Generate predictions for the test set
test_predictions = final_model.predict(X_test)

In [None]:
# Create a submission dataframe (assuming the sample_submission.csv expects 'Id' and 'SalePrice')
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!")

In [None]:
import xgboost as xgb

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

In [None]:
# Initialize the XGBRegressor
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Set up RandomizedSearchCV using negative RMSE as the scoring metric
random_search = RandomizedSearchCV(estimator=xgb_reg,
                                   param_distributions=param_grid,
                                   n_iter=50,
                                   cv=3,
                                   scoring='neg_root_mean_squared_error',
                                   random_state=42,
                                   n_jobs=-1,
                                   verbose=1)

In [None]:
# Fit the random search model on the training data
random_search.fit(X_train, y_train)

In [None]:
# Get the best parameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

In [None]:
# Evaluate on the validation set
val_predictions = best_model.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, val_predictions))
print("Best Parameters:", best_params)
print("Validation RMSE:", rmse_val)

In [None]:
# Combine training and validation sets for final training
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

# Train the final XGBRegressor using the best parameters
final_model = xgb.XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
final_model.fit(X_full, y_full)

# Load and preprocess the test data
test_df = pd.read_csv('test.csv')
# Save the Ids for the submission file
test_ids = test_df['Id'].copy()

In [None]:
X_test = preprocess_data(test_df, is_train=False)
# Align test set features with training features
X_test = X_test.reindex(columns=X_full.columns, fill_value=0)

In [None]:
# Generate predictions for the test set
test_predictions = final_model.predict(X_test)

In [None]:
# Create a submission dataframe (assumes sample_submission.csv expects 'Id' and 'SalePrice')
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})
submission.to_csv('submission_XGBoost.csv', index=False)

print("Submission file 'submission_XGBoost.csv' created successfully!")