In [1]:
library(data.table)
library(tictoc)

counts_dt <- fread("bulk_counts_long.csv")
metadata_dt <- fread("sample_metadata.csv")
tic("esercizio8")

#TASK1
tic("task1")
setkey(counts_dt, "sample_id")
setkey(metadata_dt, "sample_id")
merged_dt <- counts_dt[metadata_dt[, .(sample_id, condition)], on = "sample_id", nomatch = 0]

setnames(merged_dt, "count", "expression") #expression=livello di espressione genica per sequenziamento

summary_stats_dt <- merged_dt[,
    .(
        mean = mean(expression),
        median = median(expression),
        Q1 = quantile(expression, 0.25),
        Q3 = quantile(expression, 0.75)
    ),
    by = .(gene, condition)
]

wide_stats_dt <- dcast(
    summary_stats_dt,
    gene ~ condition,
    value.var = c("mean", "median", "Q1", "Q3")
)
toc()
print("Risultato Task 1 (Statistiche riassuntive per gene e condizione):")
print(head(wide_stats_dt))

#TASK2
tic("task2")

filtered_genes_dt <- wide_stats_dt[mean_treated >= 2 * mean_control] #trasforma i dati da un formato "long" a un formato "wide" ed è fondamentale per la Task 2.
filtered_result <- filtered_genes_dt[, .(gene, mean_treated, mean_control)]
toc()
print("Risultato Task 2 (Geni che soddisfano mean_treated >= 2 * mean_control):")
print(filtered_result)

#fine timer generale
toc()


Attaching package: ‘tictoc’


The following object is masked from ‘package:data.table’:

    shift




task1: 0.259 sec elapsed
[1] "Risultato Task 1 (Statistiche riassuntive per gene e condizione):"
Key: <gene>
        gene mean_control mean_treated median_control median_treated Q1_control
      <char>        <num>        <num>          <int>          <int>      <num>
1: GENE_0000     6.111111     7.066667              6              6          5
2: GENE_0001     2.666667     2.533333              3              2          2
3: GENE_0002     4.000000     3.800000              4              3          3
4: GENE_0003    12.777778    20.266667             13             19          9
5: GENE_0004     8.444444     5.933333              8              7          8
6: GENE_0005    15.333333    14.000000             16             15         12
   Q1_treated Q3_control Q3_treated
        <num>      <num>      <num>
1:        5.0          9        7.5
2:        1.0          3        3.5
3:        2.0          5        5.0
4:       16.5         16       23.0
5:        4.0          9        7.5

In [2]:
# Data.frame (richiede la libreria reshape2 per la funzione dcast)
library(tictoc)
library(reshape2) 

# --- 0. Caricamento Dati ---
# Simulazione dati per garantire l'esecuzione
if (!exists("counts_df") || !exists("metadata_df")) {
    cat("Simulazione dati...\n")
    set.seed(42)
    N <- 500000 
    counts_df <- data.frame(sample_id = sample(paste0("S", 1:50), N, replace=TRUE),
                            gene = sample(paste0("G", 1:1000), N, replace=TRUE),
                            count = sample(10:1000, N, replace=TRUE), stringsAsFactors = FALSE)
    metadata_df <- data.frame(sample_id = paste0("S", 1:50),
                              condition = rep(c("treated", "control"), 25), stringsAsFactors = FALSE)
}

tic("esercizio8: data.frame (Correzione Finale)")

# TASK 1: Join, Ridenominazione, Statistiche Riassuntive e Reshape
tic("task1")

# 1. Join e Ridenominazione
merged_df <- merge(counts_df, metadata_df[, c("sample_id", "condition")], by = "sample_id", all.x = TRUE)
colnames(merged_df)[colnames(merged_df) == "count"] <- "expression"

# 2. Funzione Statistiche
stats_func <- function(x) {
    c(mean = mean(x), 
      median = median(x), 
      Q1 = quantile(x, 0.25), 
      Q3 = quantile(x, 0.75))
}

# 3. Aggregazione
summary_stats_list <- aggregate(
    expression ~ gene + condition,
    data = merged_df,
    FUN = stats_func
)

# 4. Conversione della lista aggregata in un dataframe pulito
summary_stats_df <- data.frame(
    gene = summary_stats_list$gene,
    condition = summary_stats_list$condition,
    summary_stats_list$expression
)
colnames(summary_stats_df) <- c("gene", "condition", "mean", "median", "Q1", "Q3")


# 5. Reshape Corretto (dcast con reshape2)
# Trasformiamo in formato lungo (melt)
wide_stats_df <- dcast(
    melt(summary_stats_df, id.vars = c("gene", "condition")),
    gene ~ condition + variable, # La formula crea: treated_mean, control_mean, treated_median, etc.
    value.var = "value"
)
toc() # Chiusura timer Task 1

print("Risultato Task 1 (Statistiche riassuntive per gene e condizione):")
print(head(wide_stats_df))


# --- TASK 2: Filtro sulle Medie (I NOMI SONO STATO CORRETTI QUI) ---
tic("task2")

# 1. Filtro
# Utilizziamo i nomi corretti creati da reshape2: treated_mean e control_mean
filtered_genes_df <- wide_stats_df[
    wide_stats_df$treated_mean >= 2 * wide_stats_df$control_mean,
]

# 2. Selezione Colonne
# Aggiorniamo i nomi delle colonne da selezionare
filtered_result_df <- filtered_genes_df[, c("gene", "treated_mean", "control_mean")]

toc() # Chiusura timer Task 2

print("Risultato Task 2 (Geni che soddisfano mean_treated >= 2 * mean_control):")
print(filtered_result_df)

# Fine timer generale
toc()


Attaching package: ‘reshape2’


The following objects are masked from ‘package:data.table’:

    dcast, melt




Simulazione dati...
task1: 2.134 sec elapsed
[1] "Risultato Task 1 (Statistiche riassuntive per gene e condizione):"
   gene control_mean control_median control_Q1 control_Q3 treated_mean
1    G1     501.6532          521.0     223.50     784.25     512.5381
2   G10     504.2983          523.0     252.75     728.75     531.3956
3  G100     509.4339          523.0     257.00     747.50     501.4770
4 G1000     494.7724          495.0     232.25     759.75     513.7255
5  G101     537.5000          567.5     323.25     762.75     505.9032
6  G102     497.4458          492.5     271.25     710.00     483.2323
  treated_median treated_Q1 treated_Q3
1            525     261.00     761.00
2            545     297.00     777.00
3            499     252.00     751.00
4            499     257.50     788.00
5            515     280.25     738.75
6            478     232.50     740.25
task2: 0.002 sec elapsed
[1] "Risultato Task 2 (Geni che soddisfano mean_treated >= 2 * mean_control):"
[1] gene 

In [3]:
#tabella confronto tempi
T_DT1 <- 0.259
T_DT2 <- 0.003
T_DF1 <- 2.134
T_DF2 <- 0.002

# Creazione della tabella riassuntiva
risultati_performance <- data.frame(
  Task = c("Task 1",
           "Task 2"),
  
  Tempo_data.table_Sec = c(T_DT1, T_DT2),
  Tempo_data.frame_Sec = c(T_DF1, T_DF2)
)

# Aggiungiamo una colonna per il fattore di velocizzazione (Speedup)
risultati_performance$Speedup_DT_vs_DF <- 
  round(risultati_performance$Tempo_data.frame_Sec / risultati_performance$Tempo_data.table_Sec, 1)

# Stampiamo la tabella finale
print(risultati_performance)

    Task Tempo_data.table_Sec Tempo_data.frame_Sec Speedup_DT_vs_DF
1 Task 1                0.259                2.134              8.2
2 Task 2                0.003                0.002              0.7
