In [1]:
train_data <- read.csv("C:/Users/swift/Desktop/train.csv")

In [2]:
treatment1 <- function(df_in){
    df_in$price <- as.numeric(df_in$price)
    df_in$age <- 2024 - df_in$year
    df_in <- na.omit(df_in)
    return(df_in)
}

In [3]:
treatment2 <- function(df_in){
    outlier_price <- quantile(df_in$price, 0.95)
    df_in <- df_in[df_in$price < outlier_price, ]
    return(df_in)
}

In [4]:
treatment3 <- function(df_in){
    upper_threshold <- quantile(df_in$engine_size, 0.99)
    df_in <- df_in[df_in$engine_size <= upper_threshold, ]
    return(df_in)
}

In [5]:
train_data <- treatment1(train_data)

"NAs introduced by coercion"


In [6]:
train_data <- treatment2(train_data)

In [7]:
train_data <- treatment3(train_data)

In [8]:
formula_step <- formula("price ~ age * mileage + engine_size + max_mpg + automatic_transmission + damaged + first_owner + 
                        personal_using + turbo + alloy_wheels + adaptive_cruise_control + navigation_system + 
                        power_liftgate + backup_camera + keyless_start + remote_start + sunroof.moonroof + 
                        automatic_emergency_braking + stability_control + leather_seats + memory_seat + 
                        third_row_seating + apple_car_play.android_auto + bluetooth + usb_port + heated_seats + 
                        brand + fuel_type")

step_model <- step(lm(formula_step, data = train_data), direction = "both")

summary(step_model)

Start:  AIC=250660.4
price ~ age * mileage + engine_size + max_mpg + automatic_transmission + 
    damaged + first_owner + personal_using + turbo + alloy_wheels + 
    adaptive_cruise_control + navigation_system + power_liftgate + 
    backup_camera + keyless_start + remote_start + sunroof.moonroof + 
    automatic_emergency_braking + stability_control + leather_seats + 
    memory_seat + third_row_seating + apple_car_play.android_auto + 
    bluetooth + usb_port + heated_seats + brand + fuel_type

                              Df  Sum of Sq        RSS    AIC
- sunroof.moonroof             1 2.2030e+07 6.6081e+11 250659
- automatic_emergency_braking  1 7.5803e+07 6.6087e+11 250660
<none>                                      6.6079e+11 250660
- usb_port                     1 1.4752e+08 6.6094e+11 250662
- first_owner                  1 1.8826e+08 6.6098e+11 250662
- keyless_start                1 1.9509e+08 6.6098e+11 250663
- heated_seats                 1 2.5711e+08 6.6105e+11 250664



Call:
lm(formula = price ~ age + mileage + engine_size + max_mpg + 
    automatic_transmission + damaged + first_owner + personal_using + 
    turbo + alloy_wheels + adaptive_cruise_control + navigation_system + 
    power_liftgate + backup_camera + keyless_start + remote_start + 
    stability_control + leather_seats + memory_seat + third_row_seating + 
    apple_car_play.android_auto + bluetooth + usb_port + heated_seats + 
    brand + fuel_type + age:mileage, data = train_data)

Residuals:
   Min     1Q Median     3Q    Max 
-29764  -4196   -663   3233  51133 

Coefficients:
                              Estimate Std. Error t value Pr(>|t|)    
(Intercept)                  4.269e+04  1.016e+03  42.029  < 2e-16 ***
age                         -1.563e+03  2.995e+01 -52.175  < 2e-16 ***
mileage                     -1.973e-01  3.575e-03 -55.194  < 2e-16 ***
engine_size                  4.980e+03  8.354e+01  59.612  < 2e-16 ***
max_mpg                     -7.822e+01  1.116e+01  -7.008 2

In [23]:
test_data <- read.csv("C:/Users/swift/Desktop/test.csv")

In [24]:
test_data <- treatment1(test_data)

"NAs introduced by coercion"


In [25]:
test_data <- treatment2(test_data)

In [26]:
test_data <- treatment3(test_data)

In [27]:
pred_eval <- function(model, new_data){
    
    predictions <- predict(model, newdata = new_data)

    actual_values <- new_data$price

    RMSE <- sqrt(mean((actual_values - predictions)^2))
    MAE <- mean(abs(actual_values - predictions))
    R_squared <- cor(actual_values, predictions)^2

    cat("RMSE:", RMSE, '\n')
    cat("MAE:", MAE, '\n')
    cat("R-squared:", R_squared, '\n')

}

In [28]:
pred_eval(step_model, test_data)

RMSE: 6598.199 
MAE: 4885.477 
R-squared: 0.7869633 


In [29]:
# install.packages(c("caret", "yardstick"))
# install.packages("glmnet")
# install.packages("kernlab")
library(caret)
# library(yardstick)
library(glmnet)
# library(kernlab)

In [21]:
# Train Ridge Regression model
ridge_model <- train(formula_step, data = train_data, method = "glmnet", trControl = trainControl(method = "cv"))

# Make predictions on test data
ridge_predictions <- predict(ridge_model, newdata = test_data)

# Evaluate performance using metrics
# ridge_performance <- yardstick::metrics(data.frame(.pred = ridge_predictions, .actual = test_data$target_variable), metric = "rmse")
# print(ridge_performance)


In [22]:
pred_eval(ridge_model, test_data)

RMSE: 6595.179 
MAE: 4880.024 
R-squared: 0.7870363 


In [None]:
# Train SVR model
svm_model <- train(formula_step, data = train_data, method = "svmLinear", trControl = trainControl(method = "cv"))

# Make predictions on test data
svm_predictions <- predict(svm_model, newdata = test_data)
