## Setup

In [1]:
source("2022_PSVD_metabolomics/2022_PSVD_metabolomics/R/00_setup.R")

In [None]:
p_load("dplyr", "ggplot2", "mice", "janitor", "POMA", "SummarizedExperiment", "DESeq2", install = FALSE)

In [27]:
# Load the three raw assay tables. Every file is read with the same settings:
# Windows-1252 encoding, header names kept verbatim (check.names = FALSE), and
# the literal string "NA" treated as a missing value.
read_raw_table <- function(path) {
  read.csv(
    path,
    fileEncoding = "windows-1252",
    check.names = FALSE,
    na.strings = "NA"
  )
}

lipid_raw <- read_raw_table("data/lipid.csv")
facid_raw <- read_raw_table("data/facid.csv")
metab_raw <- read_raw_table("data/metab.csv")

In [28]:
# Load the name-mapping table that accompanies each assay. The reader settings
# are identical for all three files: Windows-1252 encoding, header names kept
# verbatim, and the literal string "NA" treated as missing.
map_paths <- c(
  lipid = "data/lipid_map.csv",
  facid = "data/facid_map.csv",
  metab = "data/metab_map.csv"
)
map_tables <- lapply(
  map_paths,
  read.csv,
  fileEncoding = "windows-1252",
  check.names = FALSE,
  na.strings = "NA"
)

lipid_map <- map_tables[["lipid"]]
facid_map <- map_tables[["facid"]]
metab_map <- map_tables[["metab"]]

In [29]:
# Reduce a mapping table to the three columns used for relabelling and make
# sure the HMDB column always holds a usable label: the HMDB value itself when
# it starts with "HMDB", otherwise the corrected name.
#
# The condition is made NA-safe on purpose. if_else() returns NA wherever its
# condition is NA, so a missing HMDB entry used to stay NA instead of falling
# back to name_corrected.
fix_hmdb_map <- function(map) {
  map %>%
    dplyr::select(c(name_original, name_corrected, HMDB)) %>%
    dplyr::mutate(
      # Coerce first so an all-missing (logical) column cannot clash with the
      # character fallback inside if_else().
      HMDB = as.character(HMDB),
      HMDB = dplyr::if_else(
        !is.na(HMDB) & stringr::str_starts(HMDB, "HMDB"),
        HMDB,
        as.character(name_corrected)
      )
    )
}

lipid_map_fix <- fix_hmdb_map(lipid_map)
facid_map_fix <- fix_hmdb_map(facid_map)
metab_map_fix <- fix_hmdb_map(metab_map)

In [31]:
# Sample metadata is taken from columns 3 to 7 of the fatty-acid table. The
# headers are normalised with make_clean_names() and the identifier column is
# renamed to "sample_id".
metadata <- facid_raw[, 3:7] %>%
  stats::setNames(make_clean_names(names(.))) %>%
  dplyr::rename(sample_id = sample_identification)

In [33]:
# Relabel the columns of an assay table using its mapping table.
#
# A column whose current name appears in map$name_original is renamed to the
# corresponding map$HMDB value. Columns without a match keep their name, and
# so do columns whose matched HMDB entry is missing; previously such a column
# would have been renamed to NA.
relabel_columns <- function(df, map) {
  current <- colnames(df)
  replacement <- as.character(map$HMDB[match(current, map$name_original)])
  colnames(df) <- ifelse(is.na(replacement), current, replacement)
  df
}

lipid_raw <- relabel_columns(lipid_raw, lipid_map_fix)
facid_raw <- relabel_columns(facid_raw, facid_map_fix)
metab_raw <- relabel_columns(metab_raw, metab_map_fix)

## Data cleaning

In [35]:
# Cut-off, in percent of missing values per column, used by the column filter
# in the data-cleaning step: columns at or above this value are discarded.
missing_threshold <- 10

In [None]:
# Drop every column whose share of missing values (in percent) is at or above
# `threshold`.
drop_sparse_columns <- function(df, threshold = missing_threshold) {
  purrr::discard(df, ~ sum(is.na(.x)) / length(.x) * 100 >= threshold)
}

# Build a one-line summary of how many columns the filter removed.
# The wording matches the filter: the comparison is ">=", and the percentage
# reported is the share of removed columns (it was mislabelled as "outliers").
missing_summary <- function(label, before, after,
                            threshold = missing_threshold) {
  n_removed <- ncol(before) - ncol(after)
  paste0(
    "Columns with >=", threshold, "% missing values in ", label, ": ",
    n_removed,
    ". New number of columns: ", ncol(after),
    ". Percentage of columns removed: ",
    round(n_removed * 100 / ncol(before), 2), "%"
  )
}

lipid_raw_na <- drop_sparse_columns(lipid_raw)
missing_summary("lipid_raw", lipid_raw, lipid_raw_na)

facid_raw_na <- drop_sparse_columns(facid_raw)
missing_summary("facid_raw", facid_raw, facid_raw_na)

metab_raw_na <- drop_sparse_columns(metab_raw)
missing_summary("metab_raw", metab_raw, metab_raw_na)

In [39]:
# Pass each filtered table through janitor's remove_constant().
lipid_raw_na <- lipid_raw_na %>% remove_constant()
facid_raw_na <- facid_raw_na %>% remove_constant()
metab_raw_na <- metab_raw_na %>% remove_constant()

In [40]:
# Normalise only the first `k` column names with make_clean_names(); all other
# column names are left exactly as they are.
clean_leading_names <- function(df, k) {
  lead <- seq_len(k)
  names(df)[lead] <- make_clean_names(names(df)[lead])
  df
}

lipid_raw_na <- clean_leading_names(lipid_raw_na, 6)
facid_raw_na <- clean_leading_names(facid_raw_na, 6)
metab_raw_na <- clean_leading_names(metab_raw_na, 4)

In [41]:
# Remove the per-sample annotation columns from each assay table. The set of
# columns removed differs per table.
lipid_raw_na <- lipid_raw_na %>%
  select(-c(sample_id, sample_description, group_123, sex, age))
facid_raw_na <- facid_raw_na %>%
  select(-c(sample_code, sample_description, group_123, sex, age))
metab_raw_na <- metab_raw_na %>%
  select(-c(group_123, sex, age))

In [42]:
# Give the identifier column the same name, "sample_id", in all three tables.
# The lipid table calls it "label"; the other two "sample_identification".
lipid_raw_na <- dplyr::rename(lipid_raw_na, sample_id = label)
facid_raw_na <- dplyr::rename(facid_raw_na, sample_id = sample_identification)
metab_raw_na <- dplyr::rename(metab_raw_na, sample_id = sample_identification)

## Imputation, normalization, scaling and outliers

In [43]:
# Wrap an assay table in a POMA SummarizedExperiment. The shared `metadata`
# table is the target, and every column from the second one onwards is passed
# as features (the first column is skipped).
# NOTE(review): rows of `metadata` (taken from the fatty-acid file) and rows of
# each assay table are paired by position only; nothing here checks that the
# sample order is the same across files. Confirm against the input data.
build_se <- function(assay_table) {
  PomaSummarizedExperiment(
    target = metadata,
    features = assay_table[2:ncol(assay_table)]
  )
}

lipid_se_raw <- build_se(lipid_raw_na)
facid_se_raw <- build_se(facid_raw_na)
metab_se_raw <- build_se(metab_raw_na)

In [None]:
# Impute (knn, ZerosAsNA = TRUE, cutoff = 20) and normalise ("log_pareto") one
# SummarizedExperiment. This is run once per dataset; both downstream variants
# are derived from the same result instead of repeating the identical
# imputation and normalisation a second time.
impute_and_normalise <- function(se) {
  se %>%
    PomaImpute(ZerosAsNA = TRUE, cutoff = 20, method = "knn") %>%
    PomaNorm(method = "log_pareto")
}

# Turn the assay matrix of a SummarizedExperiment into a data frame with one
# row per assay column, and move the row names into a "sample_id" column.
se_to_sample_frame <- function(se) {
  as.data.frame(t(assay(se))) %>% tibble::rownames_to_column("sample_id")
}

lipid_norm <- impute_and_normalise(lipid_se_raw)
facid_norm <- impute_and_normalise(facid_se_raw)
metab_norm <- impute_and_normalise(metab_se_raw)

# Variant that additionally goes through PomaOutliers(coef = 3).
lipid_se <- se_to_sample_frame(PomaOutliers(lipid_norm, coef = 3))
facid_se <- se_to_sample_frame(PomaOutliers(facid_norm, coef = 3))
metab_se <- se_to_sample_frame(PomaOutliers(metab_norm, coef = 3))

# "_o" variant: same imputation and normalisation, PomaOutliers() not applied.
lipid_se_o <- se_to_sample_frame(lipid_norm)
facid_se_o <- se_to_sample_frame(facid_norm)
metab_se_o <- se_to_sample_frame(metab_norm)

In [45]:
# Standardise a table that has a "sample_id" column plus numeric columns.
# The id column is moved into the row names, the matrix is transposed, passed
# through scale() and transposed back, and the ids are restored as a column.
# NOTE(review): because scale() works column-wise and is applied to the
# transposed matrix, centring and scaling happen within each row of the input
# (one row per sample_id), not within each feature column. Confirm this is the
# intended direction.
scale_within_rows <- function(tbl) {
  mat <- as.matrix(tibble::column_to_rownames(tbl, "sample_id"))
  scaled <- t(scale(t(mat)))
  tibble::rownames_to_column(as.data.frame(scaled), "sample_id")
}

lipid_se <- scale_within_rows(lipid_se)
facid_se <- scale_within_rows(facid_se)
metab_se <- scale_within_rows(metab_se)

lipid_se_o <- scale_within_rows(lipid_se_o)
facid_se_o <- scale_within_rows(facid_se_o)
metab_se_o <- scale_within_rows(metab_se_o)

## Integration

In [49]:
# Join the lipid, fatty-acid and metabolite tables on "sample_id", left to
# right, once for the PomaOutliers-processed tables and once for the "_o"
# tables.
merge_by_sample <- function(left, right) {
  merge(left, right, by = "sample_id")
}

metabolomics_merged <- Reduce(
  merge_by_sample,
  list(lipid_se, facid_se, metab_se)
)

metabolomics_merged_o <- Reduce(
  merge_by_sample,
  list(lipid_se_o, facid_se_o, metab_se_o)
)

In [None]:
# Inspection only (nothing is assigned): transpose the merged table so its
# column names become row names, move those row names into a column that is
# called "sample_id" here, and show the distinct values.
metabolomics_merged %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("sample_id") %>%
  distinct(sample_id)
# Unfinished follow-up step left commented out: %>% filter(sample_id %in%)

In [51]:
# Stack the three unmodified mapping tables (fatty acids, lipids, metabolites,
# in that order) into a single lookup table.
metabolites_mapped <- do.call(rbind, list(facid_map, lipid_map, metab_map))

In [None]:
# Replace column names of the merged table that are HMDB identifiers with the
# corresponding corrected metabolite name.
#
# The lookup is done per column with match(), so each column gets the name from
# the first mapping row carrying its HMDB id. The earlier approach paired
# `name_corrected[HMDB %in% matching]` with `intersect(...)`; those two vectors
# differ in length and order as soon as an HMDB id occurs on more than one row
# of the combined mapping table, which attached names to the wrong ids.
hmdb_lookup <- metabolites_mapped[
  !is.na(metabolites_mapped$HMDB) & !is.na(metabolites_mapped$name_corrected),
  c("HMDB", "name_corrected")
]
merged_names <- colnames(metabolomics_merged)
lookup_row <- match(merged_names, hmdb_lookup$HMDB)
colnames(metabolomics_merged) <- ifelse(
  is.na(lookup_row),
  merged_names,
  as.character(hmdb_lookup$name_corrected[lookup_row])
)
# NOTE(review): only metabolomics_merged is relabelled here;
# metabolomics_merged_o keeps its HMDB-based column names. Confirm that this
# difference is intended.
print(metabolomics_merged)


In [26]:
# Export the merged tables and the combined mapping table. List names are the
# destination paths; write.csv() defaults are used.
merged_outputs <- list(
  "outputs/01_metabolomics_merged.csv" = metabolomics_merged,
  "outputs/01_metabolomics_merged_outliers.csv" = metabolomics_merged_o,
  "outputs/01_metabolites_mapped.csv" = metabolites_mapped
)
for (out_path in names(merged_outputs)) {
  write.csv(merged_outputs[[out_path]], out_path)
}

In [32]:
# Export the raw assay tables as they stand at this point, written without
# quoting.
# NOTE(review): with quote = FALSE, any header or value containing a comma
# would shift columns in the CSV; confirm that no names contain commas.
hmdb_outputs <- list(
  "outputs/lipid_hmdb.csv" = lipid_raw,
  "outputs/facid_hmdb.csv" = facid_raw,
  "outputs/metab_hmdb.csv" = metab_raw
)
for (out_path in names(hmdb_outputs)) {
  write.csv(hmdb_outputs[[out_path]], out_path, quote = FALSE)
}

In [38]:
# Export the sample metadata table.
write.csv(x = metadata, file = "outputs/01_metadata.csv")

In [48]:
# Export the metabolite table from the "_o" branch.
write.csv(x = metab_se_o, file = "outputs/metab_se_o.csv")