In [None]:

library(dplyr)
library(Matrix)
library(parallel)
library(doParallel)
library(foreach)

In [None]:
final_analysis_df = readRDS("final_analysis_df.rds")
df_for_encoding = readRDS("df_for_encoding.rds")

In [None]:
library(dplyr)

final_analysis_df = final_analysis_df %>% rename(mean_income_pc = 'mean(income_pc)')

In [None]:
library(fastDummies)
library(Matrix)
library(dplyr)

# Define your categorical and continuous variables
categorical_vars <- c("customer_zip_code_prefix", "customer_city", "customer_state", 
                      "most_common_weekday", "most_common_month", "urban_rural")
continuous_vars <- c("avg_installments", "reviews_per_customer", "avg_review_score_per_customer", "mean_income_pc")

final_analysis_df[continuous_vars] <- scale(final_analysis_df[continuous_vars])


# Function to process a batch of data
process_batch <- function(batch) {
  # Convert categorical variables to factors
  batch[categorical_vars] <- lapply(batch[categorical_vars], as.factor)
  
  # Apply the fastDummies encoding for each categorical variable
  for (cat_var in categorical_vars) {
    batch <- dummy_cols(batch, select_columns = cat_var, remove_first_dummy = TRUE)
  }
  
  return(batch)
}

# Split the data into batches
batch_size <- 5000  # Adjust this based on your system's memory capacity
num_batches <- ceiling(nrow(final_analysis_df) / batch_size)
batches <- split(final_analysis_df, ceiling(seq_len(nrow(final_analysis_df)) / batch_size))

# Process each batch and combine
processed_batches <- lapply(batches, process_batch)
df_combined <- do.call(rbind, processed_batches)

# Combine with continuous data
df_combined <- cbind(df_combined, final_analysis_df[continuous_vars])

# Save the identity and outcome variables separately
identity_var <- final_analysis_df$customer_unique_id
outcome_var <- final_analysis_df$RFM_avg

# Exclude identity and outcome variables from the combined dataframe
df_combined <- df_combined[, !(names(df_combined) %in% c("customer_unique_id", "RFM_avg"))]

# Remove the original categorical columns from df_combined
df_combined <- df_combined[, !(names(df_combined) %in% categorical_vars)]


# Convert to a sparse matrix
df_sparse <- as(as.matrix(df_combined), "sparseMatrix")


In [None]:
all_numeric <- all(sapply(df_combined, is.numeric))
print(paste("All columns in df_combined are numeric:", all_numeric))

In [None]:
# Save the sparse matrix to an RDS file
saveRDS(df_sparse, file = "df_sparse.rds")
saveRDS(df_combined, file = "df_combined.rds")

In [2]:
installed_pkgs <- installed.packages()
installed_pkgs[, c("Package", "Version")]


Unnamed: 0,Package,Version
caret,caret,6.0-94
ipred,ipred,0.9-14
recipes,recipes,1.0.8
class,class,7.3-22
clock,clock,0.7.0
diagram,diagram,1.6.5
doParallel,doParallel,1.0.17
dplyr,dplyr,1.1.4
e1071,e1071,1.7-14
fastDummies,fastDummies,1.7.3


In [1]:
personal_lib_path <- "~/R/x86_64-pc-linux-gnu-library/4.2"
dir.create(personal_lib_path, recursive = TRUE, showWarnings = FALSE)
.libPaths(c(personal_lib_path, .libPaths()))
install.packages("glmnet", lib = personal_lib_path)


“installation of package ‘glmnet’ had non-zero exit status”


In [4]:
df_sparse = readRDS("df_sparse.rds")
df_combined = readRDS("df_combined.rds")
final_analysis_df = readRDS("final_analysis_df.rds")

In [6]:
str(df_sparse)
str(df_combined)
str(final_analysis_df)


In [None]:
library(glmnet)

# Save the identity and outcome variables separately
identity_var <- final_analysis_df$customer_unique_id
outcome_var <- final_analysis_df$RFM_avg

# Ensure that response_variable is correctly defined
response_variable <- outcome_var  # Make sure this is your desired outcome variable

# Fit LASSO model
lasso_model <- glmnet(df_sparse, response_variable, alpha = 1)

# Selecting the best lambda (regularization parameter)
cv_lasso <- cv.glmnet(df_sparse, response_variable, alpha = 1)
best_lambda_lasso <- cv_lasso$lambda.min

# Fit LASSO model with best lambda
lasso_best <- glmnet(df_sparse, response_variable, alpha = 1, lambda = best_lambda_lasso)


In [None]:
# Fit Ridge model
ridge_model <- glmnet(df_sparse, response_variable, alpha = 0)

# Selecting the best lambda
cv_ridge <- cv.glmnet(df_sparse, response_variable, alpha = 0)
best_lambda_ridge <- cv_ridge$lambda.min

# Fit Ridge model with best lambda
ridge_best <- glmnet(df_sparse, response_variable, alpha = 0, lambda = best_lambda_ridge)


In [None]:
# Plot LASSO coefficients
plot(lasso_best, xvar = "lambda", label = TRUE)

# Plot Ridge coefficients
plot(ridge_best, xvar = "lambda", label = TRUE)

# Plot cross-validation error for LASSO
plot(cv_lasso)

# Plot cross-validation error for Ridge
plot(cv_ridge)


In [None]:
# Save LASSO Coefficients Plot
png(file = "lasso_coefficients_plot.png")
plot(lasso_best, xvar = "lambda", label = TRUE)
dev.off()  # Close the device


In [None]:
# Save Ridge Coefficients Plot
png(file = "ridge_coefficients_plot.png")
plot(ridge_best, xvar = "lambda", label = TRUE)
dev.off()  # Close the device


In [None]:
# Save LASSO Cross-Validation Error Plot
png(file = "lasso_cv_error_plot.png")
plot(cv_lasso)
dev.off()  # Close the device


In [None]:
# Save Ridge Cross-Validation Error Plot
png(file = "ridge_cv_error_plot.png")
plot(cv_ridge)
dev.off()  # Close the device


In [None]:
# Assuming cv_lasso is the result from cv.glmnet
lasso_coefs <- coef(cv_lasso, s = "lambda.1se")

# Number of non-zero coefficients (excluding intercept)
non_zero_coefs <- sum(lasso_coefs[-1, ] != 0)

# Print the number of selected features
print(non_zero_coefs)


In [None]:
# Extract the coefficients for the value of lambda.1se 
lasso_coefs <- coef(cv_lasso, s = "lambda.1se")

# Coerce the matrix to a regular dense format
lasso_coefs_matrix <- as.matrix(lasso_coefs)

# Create a data frame from the matrix
lasso_coefs_df <- data.frame(
  Feature = rownames(lasso_coefs_matrix),
  Coefficient = lasso_coefs_matrix[, 1]
)

# Remove the intercept and select non-zero coefficients
selected_features <- lasso_coefs_df[lasso_coefs_df$Coefficient != 0 & lasso_coefs_df$Feature != "(Intercept)", , drop = FALSE]

# View the selected features and their coefficients
print(selected_features)

saveRDS(selected_features, "lasso_1se_features.rds")



In [None]:

# Extract non-zero coefficients from the lasso model at lambda.min
lasso_coefs_min <- coef(cv_lasso, s = "lambda.min")

# Convert the sparse matrix to a regular dense format and exclude the intercept
lasso_coefs_min_dense <- as.matrix(lasso_coefs_min)[-1, , drop = FALSE]  # Excluding the intercept

# Remove rows with zero coefficients
lasso_coefs_min_dense_nonzero = lasso_coefs_min_dense[lasso_coefs_min_dense[, 1] != 0, ]

print(lasso_coefs_min_dense_nonzero)

saveRDS(lasso_coefs_min_dense_nonzero, "lasso_min_nonzero_coefs.rds")

In [None]:
# Extract non-zero coefficients from the ridge model at lambda.min
ridge_coefs_min <- coef(cv_ridge, s = "lambda.min")

# Convert the sparse matrix to a regular dense format and exclude the intercept
ridge_coefs_min_dense <- as.matrix(ridge_coefs_min)[-1, , drop = FALSE]  # Excluding the intercept

# Remove rows with zero coefficients
ridge_coefs_min_dense_nonzero = ridge_coefs_min_dense[ridge_coefs_min_dense[, 1] != 0, ]

print(ridge_coefs_min_dense_nonzero)

saveRDS(ridge_coefs_min_dense_nonzero, "ridge_min_nonzero_coefs.rds")

In [None]:
# Inspect the first few selected variable names
print(head(selected_vars))

# Inspect the column names of df_sparse
print(colnames(df_sparse))

# Check if selected_vars are actually in df_sparse's column names
print(all(selected_vars %in% colnames(df_sparse)))


In [None]:
# Extract non-zero coefficients from the lasso model at lambda.min
lasso_coefs_min <- coef(cv_lasso, s = "lambda.min")

# Convert the sparse matrix to a regular dense format and exclude the intercept
lasso_coefs_min_dense <- as.matrix(lasso_coefs_min)[-1, , drop = FALSE]  # Excluding the intercept

# Remove rows with zero coefficients
lasso_coefs_min_dense_nonzero <- lasso_coefs_min_dense[lasso_coefs_min_dense[, 1] != 0, ]

# Extract the names of the variables with non-zero coefficients
selected_vars <- rownames(lasso_coefs_min_dense_nonzero)

# Check the content of selected_vars
print(selected_vars)


In [43]:
# Print a few selected variables
print(head(selected_vars))

# Manually subset a few columns
df_selected_test <- df_sparse[, colnames(df_sparse) %in% head(selected_vars)]
str(df_selected_test)

# Alternative subsetting
col_indices <- which(colnames(df_sparse) %in% selected_vars)
df_selected_alternative <- df_sparse[, col_indices, drop = FALSE]
str(df_selected_alternative)


character(0)
Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  ..@ i       : int(0) 
  ..@ p       : int 0
  ..@ Dim     : int [1:2] 94439 0
  ..@ Dimnames:List of 2
  .. ..$ : chr [1:94439] "1.1" "1.2" "1.3" "1.4" ...
  .. ..$ : NULL
  ..@ x       : num(0) 
  ..@ factors : list()
Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  ..@ i       : int(0) 
  ..@ p       : int 0
  ..@ Dim     : int [1:2] 94439 0
  ..@ Dimnames:List of 2
  .. ..$ : chr [1:94439] "1.1" "1.2" "1.3" "1.4" ...
  .. ..$ : NULL
  ..@ x       : num(0) 
  ..@ factors : list()


In [32]:
install.packages("caret")

Installing package into ‘/home/jupyter/R/x86_64-pc-linux-gnu-library/4.2’
(as ‘lib’ is unspecified)

also installing the dependencies ‘ipred’, ‘recipes’




In [38]:
str(df_selected)

 num(0) 


In [33]:
# Split df_selected into train and test datasets
library(caret)
set.seed(123)
trainIndex = createDataPartition(outcome_var, p = 0.8, list = FALSE)
train_data = df_selected[trainIndex, , drop = FALSE]
test_data = df_selected[-trainIndex, , drop = FALSE]
train_labels = outcome_var[trainIndex]
test_labels = outcome_var[-trainIndex]

Loading required package: ggplot2

Loading required package: lattice

“running command 'timedatectl' had status 1”


ERROR: Error in df_selected[trainIndex, , drop = FALSE]: incorrect number of dimensions


In [None]:
# Convert training data to xgboost