In [1]:
#data.table
library(data.table)
library(tictoc)

tic("Esercizio11")

variants_dt <- fread("variants.csv")
genes_dt <- fread("gene_annotation.bed.csv")

setnames(genes_dt, c("chr", "start", "end", "gene"))

variants_dt[, chr := as.character(chr)]
genes_dt[, chr := as.character(chr)]

tic("Task1")

variants_dt[, `:=`(start = pos, end = pos)]
setkey(genes_dt, chr, start, end)

variants_genes_dt <- foverlaps(variants_dt, genes_dt,
                               by.x = c("chr", "start", "end"),
                               by.y = c("chr", "start", "end"),
                               type = "within",
                               mult = "all", # Mappa a tutti i geni sovrapposti
                               nomatch = NULL)

variants_genes_dt <- variants_genes_dt[, .(sample_id, chr, pos, ref, alt, impact, gene)]

cat("\n--- Risultato Task 1 (Estratto delle prime 5 righe) ---\n")
print(head(variants_genes_dt))

toc() 

#TASK2
tic("Task2")

high_impact_dt <- variants_genes_dt[impact == "HIGH"]

high_impact_gene_summary <- high_impact_dt[, .N, by = gene]
setnames(high_impact_gene_summary, "N", "count_high_impact")
high_impact_gene_summary <- high_impact_gene_summary[order(-count_high_impact)]

high_impact_sample_summary <- high_impact_dt[, .N, by = sample_id]
setnames(high_impact_sample_summary, "N", "count_high_impact")
high_impact_sample_summary <- high_impact_sample_summary[order(-count_high_impact)]

cat("\n Conteggi HIGH-Impact per Gene\n")
print(high_impact_gene_summary)

cat("\n Conteggi HIGH-Impact per Campione\n")
print(high_impact_sample_summary)

toc() 

#TASK3: 
tic("Task3")

total_samples <- uniqueN(variants_genes_dt$sample_id)

gene_sample_count <- high_impact_dt[, .(n_samples = uniqueN(sample_id)), by = gene]

genes_in_all_samples <- gene_sample_count[n_samples == total_samples, .(gene)]

cat("\n Geni con Varianti HIGH-Impact in TUTTI i Campioni\n")
if (nrow(genes_in_all_samples) > 0) {
    print(genes_in_all_samples)
} else {
    cat("Nessun gene con varianti HIGH-Impact è presente in tutti i", total_samples, "campioni.\n")
}

toc() 

# Fine del timer totale
toc()


Attaching package: ‘tictoc’


The following object is masked from ‘package:data.table’:

    shift





--- Risultato Task 1 (Estratto delle prime 5 righe) ---
   sample_id    chr     pos    ref    alt   impact      gene
      <char> <char>   <int> <char> <char>   <char>    <char>
1:       S12   chr1 1070757      G      C MODERATE GENE_0356
2:       S19   chr1 1677551      A      G     HIGH GENE_0049
3:       S16   chr1 1680769      G      G      LOW GENE_0049
4:       S15   chr1 1689687      C      C MODERATE GENE_0049
5:       S20   chr1 1693762      C      C      LOW GENE_0049
6:       S18   chr1 1717288      G      G MODERATE GENE_0245
Task1: 0.033 sec elapsed

 Conteggi HIGH-Impact per Gene
         gene count_high_impact
       <char>             <int>
 1: GENE_0049                 1
 2: GENE_0276                 1
 3: GENE_0417                 1
 4: GENE_0318                 1
 5: GENE_0309                 1
 6: GENE_0058                 1
 7: GENE_0164                 1
 8: GENE_0390                 1
 9: GENE_0453                 1
10: GENE_0473                 1
11: GENE_0008 

In [7]:
library(tictoc)

tic("Esercizio11_dataframe")

# Caricamento dati (Sostituzione di fread con read.csv)
# Assicurati che i nomi dei file siano corretti.
variants_df <- read.csv("variants.csv", stringsAsFactors = FALSE)
genes_df <- read.csv("gene_annotation.bed.csv", stringsAsFactors = FALSE)

# Rinominazione e pulizia colonne
# Assumiamo che la colonna genes_df X.chr, start, end e la quarta colonna senza nome siano chr, start, end, gene
names(genes_df)[1:4] <- c("chr", "start", "end", "gene")

# Conversione dei tipi (se necessario)
variants_df$chr <- as.character(variants_df$chr)
genes_df$chr <- as.character(genes_df$chr)

# ---

# TASK1: Mappatura delle varianti ai geni (Sostituzione di foverlaps)
tic("Task1_dataframe")

# Prepara le colonne start e end per le varianti (pos = start = end)
variants_df$start <- variants_df$pos
variants_df$end <- variants_df$pos

# Logica di intersezione manuale per tipo = "within" 
# (variante DEVE essere dentro il gene: start_gene <= pos_variante E end_gene >= pos_variante)
overlap_list <- list()
counter <- 1

# Esegue l'iterazione su ogni riga di variants_df
for (i in 1:nrow(variants_df)) {
  variant_row <- variants_df[i, ]
  
  # Filtra i geni sullo stesso cromosoma
  matching_genes <- genes_df[genes_df$chr == variant_row$chr, ]
  
  # Trova le sovrapposizioni (criterio 'within')
  overlaps <- matching_genes[
    matching_genes$start <= variant_row$pos & matching_genes$end >= variant_row$pos, 
  ]
  
  # Se ci sono sovrapposizioni, crea le righe di risultato
  if (nrow(overlaps) > 0) {
    for (j in 1:nrow(overlaps)) {
      gene_row <- overlaps[j, ]
      overlap_list[[counter]] <- data.frame(
        sample_id = variant_row$sample_id,
        chr = variant_row$chr,
        pos = variant_row$pos,
        ref = variant_row$ref,
        alt = variant_row$alt,
        impact = variant_row$impact,
        gene = gene_row$gene,
        stringsAsFactors = FALSE
      )
      counter <- counter + 1
    }
  }
}

# Combina tutti i risultati in un unico data.frame
# Uso tryCatch nel caso il risultato sia vuoto (nessuna sovrapposizione)
variants_genes_df <- tryCatch({
  do.call(rbind, overlap_list)
}, error = function(e) {
  message("Nessuna variante trovata all'interno di un gene. Creazione di un data.frame vuoto.")
  data.frame(sample_id=character(), chr=character(), pos=numeric(), ref=character(), alt=character(), impact=character(), gene=character())
})


# Seleziona le colonne finali
variants_genes_df <- variants_genes_df[, c("sample_id", "chr", "pos", "ref", "alt", "impact", "gene")]

cat("\n--- Risultato Task 1 (Estratto delle prime 5 righe) ---\n")
print(head(variants_genes_df, 5))

toc()  

# ---

# TASK2: Conteggi delle varianti HIGH-Impact
tic("Task2_dataframe")

# Filtra le varianti HIGH-Impact
high_impact_df <- variants_genes_df[variants_genes_df$impact == "HIGH", ]

# Conteggi HIGH-Impact per Gene
high_impact_gene_summary <- aggregate(
  high_impact_df$impact, 
  by = list(gene = high_impact_df$gene), 
  FUN = length
)
names(high_impact_gene_summary)[names(high_impact_gene_summary) == "x"] <- "count_high_impact"

# Ordina
high_impact_gene_summary <- high_impact_gene_summary[order(high_impact_gene_summary$count_high_impact, decreasing = TRUE), ]

# Conteggi HIGH-Impact per Campione
high_impact_sample_summary <- aggregate(
  high_impact_df$impact, 
  by = list(sample_id = high_impact_df$sample_id), 
  FUN = length
)
# *** Nota: Questa è la sezione dove spesso manca la parentesi di chiusura ')' ***
names(high_impact_sample_summary)[names(high_impact_sample_summary) == "x"] <- "count_high_impact"

# Ordina
high_impact_sample_summary <- high_impact_sample_summary[order(high_impact_sample_summary$count_high_impact, decreasing = TRUE), ]


cat("\n Conteggi HIGH-Impact per Gene\n")
print(head(high_impact_gene_summary))

cat("\n Conteggi HIGH-Impact per Campione\n")
print(head(high_impact_sample_summary))

toc()  

# ---

# TASK3: Geni con Varianti HIGH-Impact in TUTTI i Campioni
tic("Task3_dataframe")

# Calcola il numero totale di campioni unici
total_samples <- length(unique(variants_genes_df$sample_id))

# Conta il numero di campioni unici per ogni gene con HIGH-Impact
gene_sample_count <- aggregate(
  high_impact_df$sample_id,
  by = list(gene = high_impact_df$gene),
  FUN = function(x) length(unique(x)) # Conta i campioni UNICI
)
names(gene_sample_count)[names(gene_sample_count) == "x"] <- "n_samples"

# Filtra i geni che hanno un conteggio uguale al totale dei campioni
genes_in_all_samples <- gene_sample_count[gene_sample_count$n_samples == total_samples, ]
genes_in_all_samples <- genes_in_all_samples[, c("gene"), drop = FALSE] # Seleziona solo la colonna 'gene'

cat("\n Geni con Varianti HIGH-Impact in TUTTI i Campioni\n")
if (nrow(genes_in_all_samples) > 0) {
    print(genes_in_all_samples)
} else {
    cat("Nessun gene con varianti HIGH-Impact è presente in tutti i", total_samples, "campioni.\n")
}

toc()  

# Fine del timer totale
toc()


--- Risultato Task 1 (Estratto delle prime 5 righe) ---
  sample_id  chr     pos ref alt   impact      gene
1       S12 chr1 1070757   G   C MODERATE GENE_0356
2       S19 chr1 1677551   A   G     HIGH GENE_0049
3       S16 chr1 1680769   G   G      LOW GENE_0049
4       S15 chr1 1689687   C   C MODERATE GENE_0049
5       S20 chr1 1693762   C   C      LOW GENE_0049
Task1_dataframe: 0.514 sec elapsed

 Conteggi HIGH-Impact per Gene
       gene count_high_impact
1 GENE_0008                 1
2 GENE_0009                 1
3 GENE_0036                 1
4 GENE_0049                 1
5 GENE_0058                 1
6 GENE_0069                 1

 Conteggi HIGH-Impact per Campione
   sample_id count_high_impact
14       S18                 5
9        S13                 4
4        S05                 3
5        S07                 3
8        S12                 3
13       S17                 3
Task2_dataframe: 0.022 sec elapsed

 Geni con Varianti HIGH-Impact in TUTTI i Campioni
Nessun gene co

In [8]:
#tabella confronto tempi
T_DT1 <- 0.093
T_DT2 <- 0.031
T_DT3 <- 0.010
T_DF1 <- 0.514
T_DF2 <- 0.022
T_DF3 <- 0.010

# Creazione della tabella riassuntiva
risultati_performance <- data.frame(
  Task = c("Task 1",
           "Task 2",
           "Task 3"),
  
  Tempo_data.table_Sec = c(T_DT1, T_DT2, T_DT3),
  Tempo_data.frame_Sec = c(T_DF1, T_DF2, T_DF3)
)

# Aggiungiamo una colonna per il fattore di velocizzazione (Speedup)
risultati_performance$Speedup_DT_vs_DF <- 
  round(risultati_performance$Tempo_data.frame_Sec / risultati_performance$Tempo_data.table_Sec, 1)

# Stampiamo la tabella finale
print(risultati_performance)

    Task Tempo_data.table_Sec Tempo_data.frame_Sec Speedup_DT_vs_DF
1 Task 1                0.093                0.514              5.5
2 Task 2                0.031                0.022              0.7
3 Task 3                0.010                0.010              1.0
