In [50]:
# Programmer: Sameer Nathawat
# Date Modified: July 28, 2023

# In this version, I edited the parameters of the RandomForestRegressor to reduce the mean absolute error (MAE).

# n_estimators is a parameter that changes the number of trees in the forest. In theory, increasing the number
# of trees should increase the accuracy to a certain threshold. In my case, 107 trees per forest was the optimal
# number as it reduced the MAE from 23,483.29 to 23,408.89.

# max_depth is a parameter which controls the maximum depth of a tree within the forest. This affects the number of times
# the data is passed through a decision node (used to separate the data into categories). Controlling the maximum depth is useful
# in preventing the model from overfitting to our data and giving the model enough samples to make generalized predictions. In my
# case, limiting the maximum depth to 9 led to a decrease of the MAE from 23,408.89 to 22,717.37.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import os

        
# Load the Kaggle house-prices training data from CSV.
house_prices_train = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
)
# house_prices_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Columns used as model inputs.
features = [
    'LotArea',
    'YearBuilt',
    'OverallQual',
    'TotRmsAbvGrd',
]

# Feature matrix: only the selected columns of the training data.
X = house_prices_train[features]
# Target the model learns to predict: the sale price column.
y = house_prices_train['SalePrice']

# Split the rows into a training set and a held-out validation set.
# random_state is fixed so the same split is produced on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# # testing dataframe with selected features
# test_X = house_prices_test[features]
# # testing label/target
# print(house_prices_test.columns)
# test_y = house_prices_test.SalePrice


# Build the random forest regressor. Tuning history (validation MAE):
#   n_estimators=100                 -> 23483.29
#   n_estimators=107                 -> 23408.89
#   n_estimators=107 with max_depth=9 -> 22717.37
# random_state is fixed so the forest is built the same way on every run.
house_prices_model = RandomForestRegressor(
    n_estimators=107,
    max_depth=9,
    random_state=0,
)

# Fit on the training split only; the validation split stays unseen.
house_prices_model.fit(train_X, train_y)

# Predict sale prices for the held-out validation rows (not the Kaggle test set).
predictions = house_prices_model.predict(val_X)

# Mean absolute error between the true validation prices and the predictions.
# Arguments follow scikit-learn's documented (y_true, y_pred) order; MAE itself
# is symmetric, but keeping the order correct matters if the metric is swapped
# for an asymmetric one later.
error = mean_absolute_error(val_y, predictions)

# Report the validation MAE rounded to two decimal places.
print("MAE:", '%.2f' % error)


MAE: 23483.29
