In [3]:
!pip install xgboost


Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/a2/48/d5da8591891327b0faf08179d420fba3893c6134bdd449497c5329e4cb01/xgboost-2.1.0-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.1.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.0-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.1/124.9 MB 1.1 MB/s eta 0:01:59
   ---------------------------------------- 0.6/124.9 MB 5.1 MB/s eta 0:00:25
   ---------------------------------------- 1.1/124.9 MB 7.1 MB/s eta 0:00:18
    --------------------------------------- 1.8/124.9 MB 8.1 MB/s eta 0:00:16
    --------------------------------------- 2.5/124.9 MB 9.3 MB/s eta 0:00:14
   - -------------------------------------- 3.3/124.9 MB 11.1 MB/s eta 0:00:11
   - -------------------------------

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Load the data
# NOTE(review): DATA_DIR is an absolute, machine-specific location; point it at
# your local copy of the dataset before running.
DATA_DIR = r"C:\Users\kunta\Downloads\house-prices-advanced-regression-techniques"
train_data = pd.read_csv(rf"{DATA_DIR}\train.csv")
test_data = pd.read_csv(rf"{DATA_DIR}\test.csv")

# Separate target from predictors
y = train_data.SalePrice
X = train_data.drop(['SalePrice'], axis=1)
# NOTE(review): every remaining column, including 'Id', is passed to the model
# as a feature. Consider dropping 'Id' from the features if it is only a row
# identifier (doing so will change the validation RMSE).

# Column lists are derived from the TRAINING frame and reused for the test
# frame, so both end up with the same features in the same order.
num_cols = X.select_dtypes(exclude=['object']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Handle missing values: medians are learned from the training file and then
# applied unchanged to the test file.
# NOTE(review): the imputer is fit before the train/validation split below, so
# validation rows contribute to the medians.
imputer = SimpleImputer(strategy='median')
X_num = pd.DataFrame(imputer.fit_transform(X[num_cols]), columns=num_cols, index=X.index)
test_data_num = pd.DataFrame(imputer.transform(test_data[num_cols]), columns=num_cols, index=test_data.index)

# Encode categorical variables.
# The label -> integer mapping is learned per column from the training data
# only and then applied to the test data. Re-fitting the encoder on the test
# set would assign different integers to the same category whenever the two
# files do not contain exactly the same set of labels.
label_encoder = LabelEncoder()
X_cat_str = X[cat_cols].astype(str)
test_cat_str = test_data[cat_cols].astype(str)

encoded_train = {}
encoded_test = {}
for col in cat_cols:
    encoded_train[col] = label_encoder.fit_transform(X_cat_str[col])
    code_of = {label: code for code, label in enumerate(label_encoder.classes_)}
    # Labels never seen in training (including 'nan' when the training column
    # had no missing values) get the sentinel code -1.
    encoded_test[col] = test_cat_str[col].map(code_of).fillna(-1).astype(int)

X_cat = pd.DataFrame(encoded_train, index=X.index)
test_data_cat = pd.DataFrame(encoded_test, index=test_data.index)

# Concatenate numerical and categorical data
X = pd.concat([X_num, X_cat], axis=1)
test_data_processed = pd.concat([test_data_num, test_data_cat], axis=1)

# Convert all column names to strings to avoid TypeError
X.columns = X.columns.astype(str)
test_data_processed.columns = test_data_processed.columns.astype(str)

# Split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

# Scale the data: statistics come from the training split only and are reused
# for the validation split and the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
test_data_scaled = scaler.transform(test_data_processed)

# Define the model
model = RandomForestRegressor(n_estimators=1000, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict and evaluate.
# The root is taken explicitly instead of passing `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
predictions = model.predict(X_valid)
rmse = mean_squared_error(y_valid, predictions) ** 0.5
print(f'RMSE: {rmse}')

# Prepare the test data predictions for submission
test_predictions = model.predict(test_data_scaled)
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_predictions})

# Ensure SalePrice values have appropriate precision
output['SalePrice'] = output['SalePrice'].round(4)

# Save to CSV
output.to_csv('submission.csv', index=False)


RMSE: 28828.41015741964
