In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
import numpy as np
import os

# Print the full path of every file found anywhere under /kaggle/input.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Paths of the training and test CSV files.
train_path = '/kaggle/input/home-data-for-ml-course/train.csv'
test_path = '/kaggle/input/home-data-for-ml-course/test.csv'

# Read both files into DataFrames using pandas' default parsing options.
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
def feature_engineering(df):
    """Return a new DataFrame with two derived age columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``YrSold``, ``YearBuilt`` and
        ``YearRemodAdd``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``df`` plus:

        * ``HouseAge``   -- ``YrSold - YearBuilt``
        * ``RemodelAge`` -- ``YrSold - YearRemodAdd``

        If either derived column already exists in ``df`` its values are
        replaced in the returned frame.

    Raises
    ------
    KeyError
        If any of the three required columns is missing from ``df``.

    Notes
    -----
    The input frame is not modified. Callers must use the return value.
    """
    # ``assign`` builds a new DataFrame rather than writing into the caller's
    # object, so the raw frame stays intact and the cell can be re-run safely.
    return df.assign(
        HouseAge=df['YrSold'] - df['YearBuilt'],
        RemodelAge=df['YrSold'] - df['YearRemodAdd'],
    )

# Run the feature-engineering step on the training frame first, then on the
# test frame, and rebind both names to the results.
train_data, test_data = (
    feature_engineering(frame) for frame in (train_data, test_data)
)

# Target: the 'SalePrice' column of the training frame.
y_train = train_data['SalePrice']

# Model inputs: drop 'Id' from both frames and also drop the target column
# from the training frame.
X_train = train_data.drop(['Id', 'SalePrice'], axis=1)
X_test = test_data.drop('Id', axis=1)

In [None]:
# Columns routed through the numeric branch of the preprocessor. Only names
# that actually exist in the training frame are kept, in the order listed.
numeric_candidates = [
    'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'HouseAge', 'RemodelAge',
]
numeric_features = [name for name in numeric_candidates if name in train_data.columns]

# Columns routed through the categorical (one-hot) branch, filtered the same
# way against the training frame's columns.
categorical_candidates = [
    'MSSubClass', 'MSZoning', 'Neighborhood', 'BldgType', 'HouseStyle',
    'OverallQual', 'OverallCond', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'CentralAir',
    'Electrical', 'KitchenQual', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond', 'PavedDrive',
    'Fence', 'SaleType', 'SaleCondition',
]
categorical_features = [name for name in categorical_candidates if name in train_data.columns]

# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. Categories not seen during fit are ignored at transform time
# instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column list. Columns named in neither list are
# dropped, which is ColumnTransformer's default behaviour.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a gradient-boosting regressor with a
# fixed seed.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

# Hyper-parameter search space. The 'regressor__' prefix addresses parameters
# of the pipeline step named 'regressor'.
param_dist = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 7],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
}

In [None]:
# Randomized hyper-parameter search over `param_dist`: 50 sampled settings,
# each evaluated with 3-fold cross-validation and scored by negative mean
# absolute error. n_jobs=-1 asks for all available processors; the fixed
# random_state makes the sampled settings repeatable.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Keep the winning parameter combination and show it in the cell output.
best_params = random_search.best_params_
print("Best parameters found: ", best_params)

In [None]:
# `random_search` was built with RandomizedSearchCV's default refit=True, so
# after its fit() call `best_estimator_` is already a copy of the pipeline,
# configured with `best_params` and trained on the full training set.
# Rebinding `model` to it avoids configuring and fitting the same pipeline a
# second time, while leaving `model` as a fitted pipeline with the best
# parameters for any later cell.
model = random_search.best_estimator_

# Predict on the test features with the tuned pipeline.
y_pred = model.predict(X_test)

In [None]:
# Pair each test-set 'Id' with its predicted 'SalePrice'.
submission_columns = {'Id': test_data['Id'], 'SalePrice': y_pred}
output = pd.DataFrame(submission_columns)

# Write the two columns to 'submission.csv' without the DataFrame index.
output.to_csv(path_or_buf='submission.csv', index=False)