In [4]:
library(data.table)
library(tictoc)
library(ggplot2) # Per la visualizzazione (Task 4)

tic("revision")

integration_dt <- fread("annotated_GSM3516673_normal_annotated_GSM3516672_tumor_SeuratIntegration.csv")
annotation_dt <- fread("nt_combined_clustering.output.csv")

integration_dt[, cell_clean := gsub("_[XY]_", "", cell)]

combined_dt <- merge(
    integration_dt,
    annotation_dt,
    by.x = "cell_clean",
    by.y = "cell",
    all.x = TRUE # Mantieni tutte le righe del file di integrazione
)

combined_dt[, `:=`(cell = cell_clean, cell_clean = NULL)] # Rinomina e rimuovi la colonna temporanea
combined_dt <- combined_dt[, c("cell", "integration_cluster", "cell_type", "sample_type")] # Riordina

#TASK1
tic("Task1")

fwrite(combined_dt, "combined_analysis_data.csv")

toc() 

#TASK2
tic("Task2")

counts_by_cluster_celltype <- combined_dt[, .(cell_count = .N), by = .(integration_cluster, cell_type)]

fwrite(counts_by_cluster_celltype, "cell_type_counts_per_cluster.csv")


print(head(counts_by_cluster_celltype,5))

toc() 

#TASK3
tic("Task3")

summary_table <- combined_dt[, .(cell_count = .N), by = .(integration_cluster, cell_type, sample_type)]
fwrite(summary_table, "summary_cluster_celltype_tissue.csv")
print(head(summary_table,5))

toc() 

#TASK4
tic("Task4")

plot_data <- summary_table[, 
    .(proportion = cell_count / sum(cell_count), 
      sample_type = sample_type,
      cell_count = cell_count), 
    by = .(integration_cluster, cell_type)
]

# Crea il grafico con ggplot2
plot_distribution <- ggplot(plot_data, aes(
    x = as.factor(integration_cluster), # Cluster sull'asse X
    y = proportion,                    # Proporzione sull'asse Y
    fill = sample_type                 # Colore in base al tipo di tessuto (N/T)
)) +
    geom_bar(stat = "identity") +
    facet_wrap(~ cell_type, scales = "free_y") + # Facet per tipo di cellula
    labs(
        title = "Distribuzione del Tipo di Tessuto (N/T) per Cell Type e Cluster",
        x = "Integration Cluster",
        y = "Proporzione di Tessuto (N vs T)",
        fill = "Tipo di Tessuto"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave("cell_type_distribution_by_cluster_and_tissue.png", plot_distribution, width = 14, height = 8)

toc()

#TASK5
tic("Task5")

total_per_group <- summary_table[, .(total_cells_in_group = sum(cell_count)), 
                                 by = .(integration_cluster, sample_type)]

normalized_dt <- merge(summary_table, total_per_group, by = c("integration_cluster", "sample_type"))
normalized_dt[, percentage_within_group := (cell_count / total_cells_in_group) * 100]


fwrite(normalized_dt, "normalized_cell_type_percentages.csv")
print(head(normalized_dt,5))

toc() 

#fine timer generale
toc()

Task1: 0.025 sec elapsed
   integration_cluster                    cell_type cell_count
                 <int>                       <char>      <int>
1:                   2  Pro-angiogenesis Macrophage        842
2:                   8               Non Blood Cell         36
3:                   1       Exhausted CD8+ T cells        437
4:                   0 Effector/Memory CD4+ T cells        532
5:                   1                   MAIT cells          2
Task2: 0.038 sec elapsed
   integration_cluster                    cell_type sample_type cell_count
                 <int>                       <char>      <char>      <int>
1:                   2  Pro-angiogenesis Macrophage           N        836
2:                   8               Non Blood Cell           N         18
3:                   1       Exhausted CD8+ T cells           T        437
4:                   0 Effector/Memory CD4+ T cells           N        463
5:                   1                   MAIT cells        

In [6]:
library(tictoc)
library(ggplot2)
library(dplyr) 

# ==============================================================================
# ‚è±Ô∏è Avvio Timer Esercizio Totale
# ==============================================================================
tic("revision_dataframe")

# --- Caricamento Dati ---
integration_df <- read.csv("annotated_GSM3516673_normal_annotated_GSM3516672_tumor_SeuratIntegration.csv", stringsAsFactors = FALSE)
annotation_df <- read.csv("nt_combined_clustering.output.csv", stringsAsFactors = FALSE)

# ==============================================================================
# PRE-ELABORAZIONE E MERGE
# ==============================================================================
tic("Pre-elaborazione e Merge")

# 1. Pulizia ID Cella (crea cell_clean)
integration_df$cell_clean <- gsub("_[XY]_", "", integration_df$cell)

# 2. Merge dei DataFrames
combined_df <- merge(
    integration_df,
    annotation_df,
    by.x = "cell_clean",
    by.y = "cell",
    all.x = TRUE
)

# 3. Pulizia e Riordino Colonne (CORREZIONE APPLICATA QUI)
# Seleziona le colonne finali, rinominando cell_clean in cell.
# Questo omette automaticamente la vecchia colonna 'cell' non pulita e risolve l'errore.
combined_df <- combined_df %>%
    select(cell = cell_clean, integration_cluster, cell_type, sample_type)

toc() 

# ==============================================================================
# üìù TASK 1: Salvataggio del file combinato
# ==============================================================================
tic("Task1")

write.csv(combined_df, "combined_analysis_data_df.csv", row.names = FALSE)
cat("\n--- Risultato Task 1 (Head data.frame): ---\n")
print(head(combined_df, 5))

toc() 

# ==============================================================================
# üìù TASK 2: Conteggio Cell Type per Cluster
# ==============================================================================
tic("Task2")

counts_by_cluster_celltype_df <- combined_df %>%
    group_by(integration_cluster, cell_type) %>%
    summarise(cell_count = n(), .groups = "drop")

write.csv(counts_by_cluster_celltype_df, "cell_type_counts_per_cluster_df.csv", row.names = FALSE)

cat("\n--- Risultato Task 2 (Head data.frame): ---\n")
print(head(counts_by_cluster_celltype_df, 5))

toc() 

# ==============================================================================
# üìù TASK 3: Tabella Riepilogativa (Cluster/Cell Type/Tissue)
# ==============================================================================
tic("Task3")

summary_table_df <- combined_df %>%
    group_by(integration_cluster, cell_type, sample_type) %>%
    summarise(cell_count = n(), .groups = "drop")

write.csv(summary_table_df, "summary_cluster_celltype_tissue_df.csv", row.names = FALSE)

cat("\n--- Risultato Task 3 (Head data.frame): ---\n")
print(head(summary_table_df, 5))

toc() 

# ==============================================================================
# üìù TASK 4: Generazione Plot (Distribuzione Tissue in Cell Type)
# ==============================================================================
# *NOTA*: Il calcolo della proporzione qui sotto (cell_count / sum(cell_count)) 
# calcola la proporzione di TESSUTO (N vs T) all'interno di ogni (Cluster x Cell_Type).
# Non √® la percentuale normalizzata per il Task 5.
# ==============================================================================
tic("Task4")

plot_data_df <- summary_table_df %>%
    group_by(integration_cluster, cell_type) %>%
    mutate(proportion = cell_count / sum(cell_count)) %>%
    ungroup()

plot_distribution <- ggplot(plot_data_df, aes(
    x = as.factor(integration_cluster),
    y = proportion,
    fill = sample_type
)) +
    geom_bar(stat = "identity") +
    facet_wrap(~ cell_type, scales = "free_y") + 
    labs(
        title = "Distribuzione del Tipo di Tessuto (N/T) per Cell Type e Cluster",
        x = "Integration Cluster",
        y = "Proporzione di Tessuto (N vs T)",
        fill = "Tipo di Tessuto"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave("cell_type_distribution_by_cluster_and_tissue_df.png", plot_distribution, width = 14, height = 8)

toc()

# ==============================================================================
# üìù TASK 5: Percentuale Normalizzata (Cell Type in Cluster/Tissue)
# ==============================================================================
tic("Task5")

# Calcola la percentuale di Cell Type all'interno di (Cluster x Tissue)
normalized_df <- summary_table_df %>%
    group_by(integration_cluster, sample_type) %>%
    mutate(
        total_cells_in_group = sum(cell_count),
        percentage_within_group = (cell_count / total_cells_in_group) * 100
    ) %>%
    ungroup()

write.csv(normalized_df, "normalized_cell_type_percentages_df.csv", row.names = FALSE)

cat("\n--- Risultato Task 5 (Head data.frame): ---\n")
print(head(normalized_df, 5))

toc() 

# ==============================================================================
# ‚è±Ô∏è Fine Timer Esercizio Totale
# ==============================================================================
toc()

Pre-elaborazione e Merge: 0.058 sec elapsed

--- Risultato Task 1 (Head data.frame): ---
                cell integration_cluster                    cell_type
1 X120703408789411.N                   2  Pro-angiogenesis Macrophage
2 X120703408793835.N                   8               Non Blood Cell
3 X120703408884123.T                   1       Exhausted CD8+ T cells
4 X120703409145716.N                   0 Effector/Memory CD4+ T cells
5 X120703409339181.N                   1                   MAIT cells
  sample_type
1           N
2           N
3           T
4           N
5           N
Task1: 0.084 sec elapsed

--- Risultato Task 2 (Head data.frame): ---
[90m# A tibble: 5 √ó 3[39m
  integration_cluster cell_type         cell_count
                [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m                  [3m[90m<int>[39m[23m
[90m1[39m                   0 B Cell                     2
[90m2[39m                   0 Blood Cell               120
[90m3[39m                

In [7]:
library(tictoc)
library(ggplot2)
library(dplyr) # La libreria richiesta per la manipolazione di data.frame

# ==============================================================================
# ‚è±Ô∏è Avvio Timer Esercizio Totale
# ==============================================================================
tic("revision_dplyr")

# --- Caricamento Dati ---
# Usiamo read.csv per caricare i dati direttamente in data.frame
integration_df <- read.csv("annotated_GSM3516673_normal_annotated_GSM3516672_tumor_SeuratIntegration.csv", stringsAsFactors = FALSE)
annotation_df <- read.csv("nt_combined_clustering.output.csv", stringsAsFactors = FALSE)

# ==============================================================================
# PRE-ELABORAZIONE E MERGE
# ==============================================================================
tic("Pre-elaborazione e Merge")

# 1. Pulizia ID Cella (crea cell_clean)
# Usiamo gsub (funzione base R) per la manipolazione delle stringhe
integration_df$cell_clean <- gsub("_[XY]_", "", integration_df$cell)

# 2. Merge dei DataFrames (funzione merge base R)
combined_df <- merge(
    integration_df,
    annotation_df,
    by.x = "cell_clean",
    by.y = "cell",
    all.x = TRUE
)

# 3. Pulizia e Riordino Colonne (Correzione Errore 'rename')
# Selezioniamo le colonne finali, rinominando 'cell_clean' in 'cell'.
# La vecchia colonna 'cell' (non pulita) viene automaticamente omessa.
combined_df <- combined_df %>%
    select(cell = cell_clean, integration_cluster, cell_type, sample_type)

toc() 

# ==============================================================================
# üìù TASK 1: Salvataggio del file combinato
# ==============================================================================
tic("Task1")

write.csv(combined_df, "combined_analysis_data_df.csv", row.names = FALSE)
cat("\n--- Risultato Task 1 (Head dplyr): ---\n")
print(head(combined_df, 5))

toc() 

# ==============================================================================
# üìù TASK 2: Conteggio Cell Type per Cluster
# ==============================================================================
tic("Task2")

counts_by_cluster_celltype_df <- combined_df %>%
    group_by(integration_cluster, cell_type) %>% # dplyr group_by
    summarise(cell_count = n(), .groups = "drop") # dplyr summarise (.N in data.table)

write.csv(counts_by_cluster_celltype_df, "cell_type_counts_per_cluster_df.csv", row.names = FALSE)

cat("\n--- Risultato Task 2 (Head dplyr): ---\n")
print(head(counts_by_cluster_celltype_df, 5))

toc() 

# ==============================================================================
# üìù TASK 3: Tabella Riepilogativa (Cluster/Cell Type/Tissue)
# ==============================================================================
tic("Task3")

summary_table_df <- combined_df %>%
    group_by(integration_cluster, cell_type, sample_type) %>%
    summarise(cell_count = n(), .groups = "drop")

write.csv(summary_table_df, "summary_cluster_celltype_tissue_df.csv", row.names = FALSE)

cat("\n--- Risultato Task 3 (Head dplyr): ---\n")
print(head(summary_table_df, 5))

toc() 

# ==============================================================================
# üìù TASK 4: Generazione Plot (Distribuzione Tissue in Cell Type)
# ==============================================================================
tic("Task4")

# Calcola la proporzione di tessuto (N vs T) all'interno di ogni gruppo (Cluster, Cell_Type)
plot_data_df <- summary_table_df %>%
    group_by(integration_cluster, cell_type) %>%
    mutate(proportion = cell_count / sum(cell_count)) %>% # dplyr mutate
    ungroup()

# Il codice ggplot rimane invariato
plot_distribution <- ggplot(plot_data_df, aes(
    x = as.factor(integration_cluster),
    y = proportion,
    fill = sample_type
)) +
    geom_bar(stat = "identity") +
    facet_wrap(~ cell_type, scales = "free_y") + 
    labs(
        title = "Distribuzione del Tipo di Tessuto (N/T) per Cell Type e Cluster",
        x = "Integration Cluster",
        y = "Proporzione di Tessuto (N vs T)",
        fill = "Tipo di Tessuto"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave("cell_type_distribution_by_cluster_and_tissue_df.png", plot_distribution, width = 14, height = 8)

toc()

# ==============================================================================
# üìù TASK 5: Percentuale Normalizzata (Cell Type in Cluster/Tissue)
# ==============================================================================
tic("Task5")

# Calcola la percentuale di Cell Type all'interno di (Cluster x Tissue)
normalized_df <- summary_table_df %>%
    group_by(integration_cluster, sample_type) %>%
    mutate(
        total_cells_in_group = sum(cell_count),
        percentage_within_group = (cell_count / total_cells_in_group) * 100
    ) %>%
    ungroup()

write.csv(normalized_df, "normalized_cell_type_percentages_df.csv", row.names = FALSE)

cat("\n--- Risultato Task 5 (Head dplyr): ---\n")
print(head(normalized_df, 5))

toc() 

# ==============================================================================
# ‚è±Ô∏è Fine Timer Esercizio Totale
# ==============================================================================
toc()

Pre-elaborazione e Merge: 0.054 sec elapsed

--- Risultato Task 1 (Head dplyr): ---
                cell integration_cluster                    cell_type
1 X120703408789411.N                   2  Pro-angiogenesis Macrophage
2 X120703408793835.N                   8               Non Blood Cell
3 X120703408884123.T                   1       Exhausted CD8+ T cells
4 X120703409145716.N                   0 Effector/Memory CD4+ T cells
5 X120703409339181.N                   1                   MAIT cells
  sample_type
1           N
2           N
3           T
4           N
5           N
Task1: 0.083 sec elapsed

--- Risultato Task 2 (Head dplyr): ---
[90m# A tibble: 5 √ó 3[39m
  integration_cluster cell_type         cell_count
                [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m                  [3m[90m<int>[39m[23m
[90m1[39m                   0 B Cell                     2
[90m2[39m                   0 Blood Cell               120
[90m3[39m                   0 CD4 o

In [9]:
#tabella confronto tempi
T_DT1 <- 0.025
T_DT2 <- 0.038
T_DT3 <- 0.030
T_DT4 <- 5.530
T_DT5 <- 0.024
T_DF1 <- 0.084
T_DF2 <- 0.245
T_DF3 <- 0.121
T_DF4 <- 5.824
T_DF5 <- 0.062
T_DP1 <- 0.083
T_DP2 <- 0.069
T_DP3 <- 0.059
T_DP4 <- 5.148
T_DP5 <- 0.055

# Creazione della tabella riassuntiva
risultati_performance <- data.frame(
  Task = c("Task 1",
           "Task 2",
          "Task 3",
          "Task 4",
          "Task 5"),
  
  Tempo_data.table_Sec = c(T_DT1, T_DT2, T_DT3, T_DT4, T_DT5),
  Tempo_data.frame_Sec = c(T_DF1, T_DF2, T_DF3, T_DF4, T_DF5),
    Tempo_data.frame_Sec = c( T_DP1, T_DP2, T_DP3, T_DP4, T_DP5)
)

# Aggiungiamo una colonna per il fattore di velocizzazione (Speedup)
risultati_performance$Speedup_DT_vs_DF <- 
  round(risultati_performance$Tempo_data.frame_Sec / risultati_performance$Tempo_data.table_Sec, 1)

# Stampiamo la tabella finale
print(risultati_performance)

    Task Tempo_data.table_Sec Tempo_data.frame_Sec Tempo_data.frame_Sec.1
1 Task 1                0.025                0.084                  0.083
2 Task 2                0.038                0.245                  0.069
3 Task 3                0.030                0.121                  0.059
4 Task 4                5.530                5.824                  5.148
5 Task 5                0.024                0.062                  0.055
  Speedup_DT_vs_DF
1              3.4
2              6.4
3              4.0
4              1.1
5              2.6
