In [1]:
# Solve this problem with two methods: a single regression tree (rpart) and a random forest (randomForest)
#loading all the required packages
library(dplyr)
library(tidyr)
library(rpart)
library(randomForest)
library(ggplot2)

In [1]:
# Load the training data. Strings are read as plain character vectors here and
# converted to factors explicitly further down.
data_frame <- read.csv("../input/train.csv", stringsAsFactors = FALSE)
# Use the Id column as row names, then drop it; the positional removal relies
# on Id being the first column of the file.
row.names(data_frame) <- data_frame$Id
data_frame <- data_frame[, -1]
# Replace every NA with 0. In character columns the 0 is coerced to the string
# "0", so missing categorical values become a level of their own once the
# columns are turned into factors below.
data_frame[is.na(data_frame)] <- 0

# Convert every character column to a factor. The column names are taken from
# a logical mask over names(data_frame) rather than from a `[, mask]` subset:
# that subset collapses to a bare vector when exactly one column matches, which
# would leave the column unconverted.
char_cols <- names(data_frame)[vapply(data_frame, is.character, logical(1))]
for (col_name in char_cols) {
    data_frame[[col_name]] <- as.factor(data_frame[[col_name]])
}

In [1]:
# Hold out one third of the rows (rounded down) as a test set, drawn at random
# without replacement. No seed is set here, so the split differs between runs
# unless set.seed() is called beforehand.
n_rows <- nrow(data_frame)
test_idx <- sample.int(n_rows, size = n_rows %/% 3L, replace = FALSE)

# Test dataset: the sampled rows
test <- data_frame[test_idx, ]

# Train dataset: every remaining row, in original order. setdiff() is used
# instead of negative indexing because `-integer(0)` selects nothing, which
# would leave `train` empty whenever no test rows were drawn.
train <- data_frame[setdiff(seq_len(n_rows), test_idx), ]
rm(test_idx, n_rows, data_frame)

In [1]:
# Evaluation metric: root mean squared error computed on the logarithms of the
# two inputs.
#   a: numeric vector of predicted values
#   b: numeric vector of observed values; its length is used as the divisor
# Returns a single number: sqrt(sum((log(a) - log(b))^2) / length(b)).
RootMeanSquareError <- function(a, b) {
    log_diff <- log(a) - log(b)
    sqrt(sum(log_diff^2) / length(b))
}

In [1]:
# Model 1: a single regression tree from the rpart package, fitted on all
# predictors in `train`, then used to predict SalePrice for the held-out rows.
model <- rpart(SalePrice ~ ., data = train, method = "anova")
predict <- predict(model, newdata = test)

# Prediction residuals (predicted minus observed), kept in plot1 -- presumably
# for plotting in a later cell.
plot1 <- predict - test$SalePrice

# Score the tree with the evaluation metric, rounded to four decimals, and
# display it.
error1 <- round(RootMeanSquareError(predict, test$SalePrice), digits = 4)
error1

In [1]:
# Model 2: a random forest from the randomForest package, fitted on all
# predictors in `train`. randomForest() has no `method` argument (a numeric
# response already selects regression), so the former method = "anova" was
# silently ignored and has been removed.
model <- randomForest(SalePrice ~ ., data = train,
                      ntree = 300,        # number of trees grown
                      mtry = 26,          # predictors tried at each split
                      replace = FALSE,    # sample rows without replacement
                      nodesize = 1,       # minimum size of terminal nodes
                      importance = TRUE)  # also compute variable importance
predict <- predict(model, test)

# Score the forest with the evaluation metric, rounded to four decimals.
error2 <- RootMeanSquareError(predict, test$SalePrice)
error2 <- round(error2, digits = 4)
# Prediction residuals (predicted minus observed), kept in plot2 -- presumably
# for plotting in a later cell.
plot2 <- predict - test$SalePrice
error2

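The random forest achieves a lower root mean squared error (computed on log prices) than the single regression tree. The figures below come from one random train/test split and will vary between runs: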

Regression: 0.2328

RandomForest: 0.1644