# In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the data
train_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Separate features and target. 'Id' is dropped from the features; it is only
# needed later to label the rows of the submission file.
X = train_data.drop(columns=['Id', 'SalePrice'])
y = train_data['SalePrice']
test_features = test_data.drop(columns='Id')

# Identify numeric and categorical columns.
# 'number' matches every numeric dtype, not just int64/float64, so a numeric
# column stored at another width is not silently left out of the model (the
# ColumnTransformer drops any column that is not listed in either group).
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object']).columns

# Create preprocessing pipelines
# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: replace missing values with the literal 'missing' so they
# form their own category, then one-hot encode. handle_unknown='ignore' keeps
# transform() from failing on categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps (kept as an unfitted template for the pipelines below)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a preprocessing and modeling pipeline for each candidate regressor.
# Pipeline fits its steps in place, so every pipeline gets its own clone of the
# preprocessor. With a single shared instance, refitting one pipeline would
# overwrite the transformer state the other pipelines' regressors were trained on.
rf_model = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                           ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])

xgb_model = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                            ('regressor', xgb.XGBRegressor(n_estimators=100, random_state=42))])

lgb_model = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                            ('regressor', lgb.LGBMRegressor(n_estimators=100, random_state=42))])

# Split the data: hold out 20% of the training rows for model comparison
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# The models are trained on log1p(SalePrice), so the validation metric below is
# an RMSE in log space. Transform the targets once instead of on every iteration.
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

# Train and evaluate models
models = [rf_model, xgb_model, lgb_model]
model_names = ['Random Forest', 'XGBoost', 'LightGBM']

val_rmse_by_name = {}
for model, name in zip(models, model_names):
    model.fit(X_train, y_train_log)
    val_predictions = model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val_log, val_predictions))
    val_rmse_by_name[name] = val_rmse
    print(f"{name} Validation RMSE: {val_rmse}")

# Choose the best model: the one with the lowest validation RMSE, rather than
# assuming in advance which candidate wins
best_name = min(val_rmse_by_name, key=val_rmse_by_name.get)
best_model = dict(zip(model_names, models))[best_name]
print(f"Best model on validation data: {best_name}")

# Retrain on full training data
best_model.fit(X, np.log1p(y))

# Predict on the test set. The model outputs log1p(price), so expm1 maps the
# predictions back to the original price scale.
log_scale_predictions = best_model.predict(test_features)
test_predictions = np.expm1(log_scale_predictions)

# Build the two-column submission table (Id, SalePrice) and write it to disk
submission = test_data[['Id']].assign(SalePrice=test_predictions)
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

# Feature importance of the selected model's regressor, paired with the output
# column names generated by its fitted preprocessor
fitted_steps = best_model.named_steps
feature_importance = fitted_steps['regressor'].feature_importances_
feature_names = fitted_steps['preprocessor'].get_feature_names_out()

# Rank the features by importance and keep the 20 highest
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values('importance', ascending=False)
    .head(20)
)

print("\nTop 20 Most Important Features:")
print(importance_df)