In [1]:
#data.table
library(data.table)
library(tictoc)

#inizio timer generale
tic("esercizio6")

#TASK1
tic("task1")
labs_dt <- fread("clinical_labs.csv")
vitals_dt <- fread("vitals_time_series.csv")

# Converte le colonne temporali da stringa a formato POSIXct (essenziale per join temporali)
labs_dt[, time_iso := as.POSIXct(time_iso, format = "%Y-%m-%d %H:%M:%S")] #anno,mese,gg,ore,min,sec
vitals_dt[, time_iso := as.POSIXct(time_iso, format = "%Y-%m-%d %H:%M:%S")]

# Trasforma i dati vitali in formato "largo" (wide) per avere HR e SBP su colonne diverse
vitals_wide_dt <- dcast( # dcast, serve per avere più colonne
  vitals_dt,
  patient_id + time_iso ~ vital, 
  value.var = "value"
) 
rm(vitals_dt) #cancella la colonna dove c'era scritto HR e SBP

setkey(vitals_wide_dt, patient_id, time_iso) #chiavi:pazinete e tempo

#Nearest-Time Rolling Join:
final_result_dt <- vitals_wide_dt[
  labs_dt,
  on = c("patient_id", "time_iso"),
  roll = "nearest",
  rollends = TRUE,
  # Utilizziamo 'j' per selezionare esplicitamente le colonne e calcolare il lag.
  j = .(
    patient_id,
    lab_time = time_iso,         # Tempo Lab (dalla tabella interna I)
    lab_name = lab,
    lab_value = value,
    vital_time = x.time_iso,     # Tempo Vitale Abbinato (dalla tabella esterna X)
    HR = HR,                     # Valore HR abbinato
    SBP = SBP,                   # Valore SBP abbinato
    # Calcolo del lag: Lab Time - Vital Time
    time_lag_min = as.numeric(difftime(time_iso, x.time_iso, units = "mins"))
  ),
  nomatch = NULL
]

print("Risultato del Nearest-Time Matching")
print(head(final_result_dt, 10))

fwrite(final_result_dt, "matched_labs_vitals.csv") # Scrivi il risultato su un file CSV
toc()

#TASK2
tic("task2")
final_dt <- fread("matched_labs_vitals.csv")
crp_dt <- final_dt[lab_name == "CRP"] # Filtra solo le righe relative al CRP (Proteina C Reattiva)

# Calcola le correlazioni (R di Pearson) per ciascun paziente
# group by (by): patient_id
# j (calcolo): Correlazione tra lab_value (CRP) e HR/SBP abbinati.
correlation_summary_dt <- crp_dt[, .(
  # Correlazione tra CRP e Frequenza Cardiaca (HR)
  corr_CRP_HR = cor(lab_value, HR, use = "pairwise.complete.obs"),
  
  # Correlazione tra CRP e Pressione Sanguigna Sistolica (SBP)
  corr_CRP_SBP = cor(lab_value, SBP, use = "pairwise.complete.obs")
), by = patient_id]

print("Correlazione tra CRP e Vitals Abbinati, per Paziente:")
print(correlation_summary_dt)

fwrite(correlation_summary_dt, "correlation_summary_crp_vitals.csv") # Salvataggio del risultato
toc()

#fine timer generale
toc()


Attaching package: ‘tictoc’


The following object is masked from ‘package:data.table’:

    shift




[1] "Risultato del Nearest-Time Matching"
    patient_id            lab_time lab_name lab_value          vital_time
        <char>              <POSc>   <char>     <num>              <POSc>
 1:       P002 2025-09-10 13:00:00      CRP  0.100000 2025-09-09 21:00:00
 2:       P002 2025-09-10 13:00:00      WBC  6.332412 2025-09-09 21:00:00
 3:       P002 2025-09-14 09:00:00      CRP  9.709428 2025-09-13 08:00:00
 4:       P002 2025-09-14 09:00:00      WBC  8.108361 2025-09-13 08:00:00
 5:       P002 2025-09-16 09:00:00      CRP  6.359407 2025-09-15 10:00:00
 6:       P002 2025-09-16 09:00:00      WBC  5.464871 2025-09-15 10:00:00
 7:       P002 2025-09-21 08:00:00      CRP  0.100000 2025-09-22 00:00:00
 8:       P002 2025-09-21 08:00:00      WBC  6.078082 2025-09-22 00:00:00
 9:       P002 2025-09-24 13:00:00      CRP  5.632359 2025-09-24 03:00:00
10:       P002 2025-09-24 13:00:00      WBC  9.192073 2025-09-24 03:00:00
          HR      SBP time_lag_min
       <num>    <num>        <num>


In [2]:
#data.frame
library(dplyr)
library(tidyr)
library(tictoc)

# Inizio timer generale
tic("esercizio6 DF")

# TASK 1: Nearest-Time Matching
tic("task1 DF")

# Carica i dati (usando read.csv)
labs_df <- read.csv("clinical_labs.csv", stringsAsFactors = FALSE)
vitals_df <- read.csv("vitals_time_series.csv", stringsAsFactors = FALSE)

# 1. Conversione e Preparazione Temporale (Usando as.POSIXct di Base R)
labs_df <- labs_df %>%
  mutate(lab_time = as.POSIXct(time_iso, format = "%Y-%m-%d %H:%M:%S")) %>%
  select(-time_iso)

vitals_df <- vitals_df %>%
  mutate(vital_time = as.POSIXct(time_iso, format = "%Y-%m-%d %H:%M:%S")) %>%
  select(-time_iso)

# 2. Trasformazione Vital Signs (Da Long a Wide)
vitals_wide_df <- vitals_df %>%
  pivot_wider(
    names_from = vital,
    values_from = value
  ) %>%
  rename(vital_time_match = vital_time)

rm(vitals_df)

# 3. Simulazione del Nearest-Time Rolling Join
matched_df <- labs_df %>%
  # Join cartesiana (all.x=TRUE mantiene tutti i lab draws)
  left_join(vitals_wide_df, by = "patient_id") %>%
  
  # Calcola il time lag e il suo valore assoluto
  mutate(
    # difftime è una funzione base R e funziona con POSIXct
    time_lag_min = as.numeric(difftime(lab_time, vital_time_match, units = "mins")),
    abs_time_diff = abs(time_lag_min)
  ) %>%
  
  # Trova l'osservazione con la minima differenza di tempo per ciascun lab draw
  group_by(patient_id, lab_time, lab) %>%
  filter(abs_time_diff == min(abs_time_diff)) %>%
  ungroup() %>%
  
  # Rimuove duplicati (in caso di abbinamenti equidistanti)
  distinct(patient_id, lab_time, lab, .keep_all = TRUE) %>%
  
  # Pulizia e selezione delle colonne finali
  select(
    patient_id,
    lab_time,
    lab_name = lab,
    lab_value = value,
    vital_time = vital_time_match,
    HR,
    SBP,
    time_lag_min
  )

write.csv(matched_df, "matched_labs_vitals_df_base.csv", row.names = FALSE)
print("Risultato del Nearest-Time Matching (prime 10 righe):")
print(head(matched_df, 10))

toc()


#TASK 2
tic("task2 DF")

# 1. Filtra solo le righe relative al CRP
crp_df <- matched_df %>%
  filter(lab_name == "CRP")

# 2. Calcola le correlazioni (R di Pearson) per ciascun paziente
correlation_summary_df <- crp_df %>%
  group_by(patient_id) %>%
  summarise(
    corr_CRP_HR = cor(lab_value, HR, use = "pairwise.complete.obs"),
    corr_CRP_SBP = cor(lab_value, SBP, use = "pairwise.complete.obs"),
    .groups = 'drop'
  )

print("Correlazione tra CRP e Vitals Abbinati, per Paziente (Data Frame/Tidyverse):")
print(correlation_summary_df)

write.csv(correlation_summary_df, "correlation_summary_crp_vitals_df_base.csv", row.names = FALSE)

toc()

# Fine timer generale
toc()


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


“[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 1 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 1 of `y` matches multiple rows in `x`.
[36mℹ[39m If a many-to-many relationship is expected, set `relationship =


[1] "Risultato del Nearest-Time Matching (prime 10 righe):"
[90m# A tibble: 10 × 8[39m
   patient_id lab_time            lab_name lab_value vital_time             HR
   [3m[90m<chr>[39m[23m      [3m[90m<dttm>[39m[23m              [3m[90m<chr>[39m[23m        [3m[90m<dbl>[39m[23m [3m[90m<dttm>[39m[23m              [3m[90m<dbl>[39m[23m
[90m 1[39m P002       2025-09-10 [90m13:00:00[39m CRP           0.1  2025-09-09 [90m21:00:00[39m  84.4
[90m 2[39m P002       2025-09-10 [90m13:00:00[39m WBC           6.33 2025-09-09 [90m21:00:00[39m  84.4
[90m 3[39m P002       2025-09-14 [90m09:00:00[39m CRP           9.71 2025-09-13 [90m08:00:00[39m  91.7
[90m 4[39m P002       2025-09-14 [90m09:00:00[39m WBC           8.11 2025-09-13 [90m08:00:00[39m  91.7
[90m 5[39m P002       2025-09-16 [90m09:00:00[39m CRP           6.36 2025-09-15 [90m10:00:00[39m  86.0
[90m 6[39m P002       2025-09-16 [90m09:00:00[39m WBC           5.46 2025-09-15 [90m10:00

In [3]:
#tabella confronto tempi
T_DT1 <- 0.146
T_DT2 <- 0.037
T_DF1 <- 0.222
T_DF2 <- 0.035

# Creazione della tabella riassuntiva
risultati_performance <- data.frame(
  Task = c("Task 1",
           "Task 2"),
  
  Tempo_data.table_Sec = c(T_DT1, T_DT2),
  Tempo_data.frame_Sec = c(T_DF1, T_DF2)
)

# Aggiungiamo una colonna per il fattore di velocizzazione (Speedup)
risultati_performance$Speedup_DT_vs_DF <- 
  round(risultati_performance$Tempo_data.frame_Sec / risultati_performance$Tempo_data.table_Sec, 1)

# Stampiamo la tabella finale
print(risultati_performance)

    Task Tempo_data.table_Sec Tempo_data.frame_Sec Speedup_DT_vs_DF
1 Task 1                0.146                0.222              1.5
2 Task 2                0.037                0.035              0.9
