In [37]:
import pandas as pd
import numpy as np
import ydf
from sklearn.metrics import mean_squared_error

# ---------------------------------------------------------------------------
# 1. Training data
# ---------------------------------------------------------------------------
train_data = pd.read_csv('cleaned_train_data.csv')

# The regression target is the 'LogSalePrice' column.
y_train = train_data['LogSalePrice']

# Features are everything except the target, 'SalePrice' and 'Id'.
# NOTE(review): 'SalePrice' is presumably the untransformed target, so leaving
# it in would leak the label into the features -- confirm against the cleaning
# step that produced this CSV.
X_train = train_data.drop(columns=['LogSalePrice', 'SalePrice', 'Id'])

# The learner is given a single DataFrame that holds both the features and the
# label column, so the target is appended back as the last column.
train_df = pd.concat([X_train, y_train], axis='columns')

# ---------------------------------------------------------------------------
# 2. Fit a Random Forest regressor
# ---------------------------------------------------------------------------
learner = ydf.RandomForestLearner(
    label="LogSalePrice",
    task=ydf.Task.REGRESSION,
)
model = learner.train(train_df)

# ---------------------------------------------------------------------------
# 3. In-sample error
# ---------------------------------------------------------------------------
# This RMSE is computed on the same rows the model was trained on, so it is an
# optimistic figure and not an estimate of generalisation error.
train_predictions = model.predict(train_df)
train_mse = mean_squared_error(y_train, train_predictions)
rmse = np.sqrt(train_mse)
print(f"Training RMSE: {rmse}")

# ---------------------------------------------------------------------------
# 4. Test-set predictions
# ---------------------------------------------------------------------------
test_data = pd.read_csv('cleaned_test.csv')

# 'Id' is kept aside for the submission file and is not used as a feature.
test_ids = test_data['Id']
X_test = test_data.drop(columns=['Id'])

# The model predicts in log space; exponentiate to get back to prices.
# NOTE(review): assumes 'LogSalePrice' was built with the natural log
# (np.log) -- if it was np.log1p, np.expm1 would be the matching inverse.
log_sale_price_pred = model.predict(X_test)
sale_price_pred = np.exp(log_sale_price_pred)

# ---------------------------------------------------------------------------
# 5. Submission file ('Id', 'SalePrice')
# ---------------------------------------------------------------------------
submission_columns = {
    'Id': test_ids,
    'SalePrice': sale_price_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")


Train model on 1451 examples
Model trained in 0:00:01.173298
Training RMSE: 0.07382396023952678
Submission file created: submission.csv
