<a href="https://colab.research.google.com/github/valerievossen/valerievossen/blob/main/parallel_computing_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains the example code used to demonstrate the efficiency gains of parallel computing in R.

Link to topic: https://tilburgsciencehub.com/topics/computer-setup/develop-coding-skills/coding-practices/parallel-computing/

Author: Valerie Vossen

In [6]:
# Load packages and data
library(parallel)
library(datasets)
data(iris)
library(dplyr)



Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [9]:
# Define function
#' Compute the mean sepal length for each species in a data frame
#'
#' Each species is handled in its own loop iteration, preceded by an
#' artificial pause that stands in for an expensive computation.
#'
#' @param df A data frame with a `Species` column and a numeric
#'   `Sepal.Length` column (for example `iris` or a subset of it).
#' @param delay Seconds to sleep before each species is processed. Defaults
#'   to 2, the pause used throughout this example.
#' @return A data frame with one row per distinct species and the columns
#'   `Species` and `Mean_Sepal_Length`; zero rows if `df` has no rows.
calculate_mean_sepal_length <- function(df, delay = 2) {
  species <- unique(df$Species)
  means <- numeric(length(species))
  # seq_along() yields an empty sequence when there are no species, whereas
  # 1:length(species) would count 1, 0 and run the loop body twice.
  for (i in seq_along(species)) {
    Sys.sleep(delay) # Simulating a slow computation
    # Extract the column as a plain vector first so that mean() also works
    # when df is a tibble (where df[rows, "col"] stays a data frame).
    means[i] <- mean(df$Sepal.Length[df$Species == species[i]])
  }
  data.frame(Species = species, Mean_Sepal_Length = means)
}

In [10]:
# Baseline without parallelization: all species are processed one after
# another inside this R process.
start_time_normal <- Sys.time()
result_normal <- calculate_mean_sepal_length(iris)
end_time_normal <- Sys.time()

# Elapsed wall-clock time of the sequential run
time_normal <- difftime(end_time_normal, start_time_normal)
result_normal

Species,Mean_Sepal_Length
<fct>,<dbl>
setosa,5.006
versicolor,5.936
virginica,6.588


In [11]:
# With parallelization using mclapply: the data is split into one data frame
# per species, each piece is processed by calculate_mean_sepal_length(), and
# the per-species results are stacked back into a single data frame.
# NOTE: no mc.cores is passed, so mclapply falls back to its default worker
# count (see ?mclapply).
start_time_multi <- Sys.time()

result_multi <- split(iris, iris$Species) %>%
  mclapply(calculate_mean_sepal_length) %>%
  bind_rows()

end_time_multi <- Sys.time()

# Elapsed wall-clock time of the mclapply run
time_multi <- difftime(end_time_multi, start_time_multi)


In [12]:
# Compare times: elapsed time of the sequential run (time_normal) against
# the mclapply run (time_multi). Each is printed as a time difference.
time_normal
time_multi

Time difference of 6.011784 secs

Time difference of 4.037632 secs

In [15]:
# Socket approach

# Determine how many workers to start: a fraction of the logical cores
perc_use <- 0.8
# detectCores() can return NA, and flooring the scaled count gives 0 on a
# single-core machine; max(1, ..., na.rm = TRUE) guarantees at least one
# worker in both cases so that makePSOCKcluster() does not fail.
ncpu <- max(
  1,
  floor(detectCores(all.tests = FALSE, logical = TRUE) * perc_use),
  na.rm = TRUE
)

# Create a parallel cluster of ncpu socket-connected worker processes
cl <- makePSOCKcluster(ncpu)

# Export function to cluster so that it is defined on every worker
clusterExport(cl, "calculate_mean_sepal_length")

# Parallel computation function
#' Compute per-species mean sepal lengths on the socket cluster
#'
#' @param df A data frame with a `Species` column; it is split on that column
#'   and each piece is handed to `calculate_mean_sepal_length()`.
#' @return The list returned by `clusterApplyLB()`, one element per piece.
calculate_mean_sepal_length_parallel <- function(df) {
  pieces <- split(df, df$Species)
  # `cl` is not an argument: it is looked up outside this function
  clusterApplyLB(cl, pieces, calculate_mean_sepal_length)
}

# Run parallel computation
# Time the socket-based run: the cluster processes the per-species pieces and
# bind_rows() stacks the returned list into a single data frame.
start_time_socket <- Sys.time()
result_socket <- bind_rows(
  calculate_mean_sepal_length_parallel(iris)
)
end_time_socket <- Sys.time()
time_socket <- end_time_socket - start_time_socket

# Shut down the worker processes now that the results are collected;
# otherwise they stay alive for the rest of the R session.
stopCluster(cl)

In [16]:
# Compare times: elapsed time of the sequential run (time_normal) against
# the socket-cluster run (time_socket). Each is printed as a time difference.
time_normal
time_socket

Time difference of 6.011784 secs

Time difference of 6.062877 secs