## Setup

In [1]:
# Run the shared setup script 00_setup.R before anything else.
# NOTE(review): this is an absolute path, while the data and output paths in
# this notebook are relative — it only resolves on a machine with this layout.
# Presumably this script is what makes p_load() available; confirm.
source("/research/lab_reiberger/2022_PSVD_metabolomics/2022_PSVD_metabolomics/R/00_setup.R")

In [25]:
# Attach the packages used by this notebook. purrr and tibble are not listed
# here; they are always called with an explicit `pkg::` prefix below.
# NOTE(review): p_load() is presumably pacman's — with install = FALSE a
# missing package would not be installed on the fly; confirm.
p_load(
  "dplyr",
  "ggplot2",
  "mice",
  "janitor",
  "POMA",
  "SummarizedExperiment",
  "DESeq2",
  install = FALSE
)

In [3]:
# Load the three raw panels (lipids, fatty acids, metabolites).
# The files are read as windows-1252, and check.names = FALSE keeps the
# original column headers untouched (they are cleaned explicitly later on).
lipid_raw <- read.csv(
  file = "data/lipid.csv",
  fileEncoding = "windows-1252",
  check.names = FALSE,
  na.strings = "NA"
)
facid_raw <- read.csv(
  file = "data/facid.csv",
  fileEncoding = "windows-1252",
  check.names = FALSE,
  na.strings = "NA"
)
metab_raw <- read.csv(
  file = "data/metab.csv",
  fileEncoding = "windows-1252",
  check.names = FALSE,
  na.strings = "NA"
)

In [61]:
# Display the raw fatty-acid table to inspect its columns and column types.
facid_raw

RunID,Sample Code,Sample Identification,Sample Description,Group_123,Sex,Age,Specimen,Unit,Formic acid (C1:0),⋯,Linolenic acid (C18:3),Nonadecanoic acid (C19:0),isononadecanoic acid (17-Me-18:0),Arachidic (C20:0),Arachidonic acid (20:4),Behenic acid (C22:0),Erucic acid (C22:1),"cis-4,7,10,13,16,19-Docosahexaenoic acid (C22:6)",Lignoceric acid (C24:0),Nervonic acid (C24:1)
<chr>,<int>,<chr>,<chr>,<int>,<int>,<dbl>,<chr>,<chr>,<lgl>,⋯,<dbl>,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
R100323,113402,2341,Cirrhosis,1,1,44.7,Human,µM,,⋯,1.59,,,,1.91,,0.23,0.26,,1.63
R100323,113403,2344,Cirrhosis,1,1,40.8,Human,µM,,⋯,2.5,,,,1.94,,0.2,0.39,2.32,1.64
R100323,113404,2358,PSVD,2,2,34.3,Human,µM,,⋯,2.17,,,,1.88,,0.12,0.33,,1.63
R100323,113405,2360,Cirrhosis,1,2,48.5,Human,µM,,⋯,1.67,,,2.27,1.99,,0.08,0.33,2.27,1.63
R100323,113406,2390,PSVD,2,2,40.3,Human,µM,,⋯,1.74,,,2.29,1.87,,0.15,0.33,2.28,1.65
R100323,113407,2427,PSVD,2,1,57.8,Human,µM,,⋯,1.62,,,2.3,1.97,,0.19,0.35,,1.65
R100323,113408,2432,PSVD,2,2,46.0,Human,µM,,⋯,1.73,,,,1.91,,0.07,0.3,,1.63
R100323,113409,2486,Cirrhosis,1,1,53.7,Human,µM,,⋯,1.76,,,2.3,1.85,,0.06,0.29,,1.63
R100323,113410,2536,Cirrhosis,1,1,64.9,Human,µM,,⋯,1.65,,,2.3,1.87,,0.04,0.32,,1.63
R100323,113411,2578,Cirrhosis,1,1,48.2,Human,µM,,⋯,1.59,,,2.26,1.86,,0.04,0.28,,1.64


In [5]:
# Sample metadata is taken from the fatty-acid table: columns 3 to 7 are
# Sample Identification, Sample Description, Group_123, Sex and Age.
# NOTE(review): positional selection — breaks silently if the CSV layout changes.
metadata <- facid_raw[, 3:7]
# Convert the headers to clean snake_case names.
metadata <- stats::setNames(metadata, make_clean_names(names(metadata)))
# Use `sample_id` as the sample key column name.
metadata <- dplyr::rename(metadata, "sample_id" = "sample_identification")

## Data cleaning

In [10]:
# Missing-value cut-off in percent: the column filter that follows discards a
# column when its share of NA values is greater than or equal to this value.
missing_threshold <- 5

In [11]:
# Remove columns with too many missing values from each raw panel and report
# how many columns were dropped.
#
# Fixes relative to the previous version of this cell:
#   * the report said ">" although the filter uses ">=";
#   * the report called the share of dropped columns "Percentage of outliers";
#   * the same filter/report code was repeated three times.

# Drop every column of `tbl` whose percentage of NA values is >= `threshold`.
drop_sparse_columns <- function(tbl, threshold) {
  purrr::discard(
    tbl,
    function(column) sum(is.na(column)) / length(column) * 100 >= threshold
  )
}

# Build the one-line summary comparing a table before and after filtering.
report_sparse_columns <- function(label, before, after, threshold) {
  n_removed <- ncol(before) - ncol(after)
  paste0(
    "Columns with >=", threshold, "% missing values in ", label, ": ",
    n_removed,
    ". New number of columns: ", ncol(after),
    ". Percentage of columns removed: ",
    round(n_removed * 100 / ncol(before), 2), "%"
  )
}

lipid_raw_na <- drop_sparse_columns(lipid_raw, missing_threshold)
report_sparse_columns("lipid_raw", lipid_raw, lipid_raw_na, missing_threshold)

facid_raw_na <- drop_sparse_columns(facid_raw, missing_threshold)
report_sparse_columns("facid_raw", facid_raw, facid_raw_na, missing_threshold)

metab_raw_na <- drop_sparse_columns(metab_raw, missing_threshold)
report_sparse_columns("metab_raw", metab_raw, metab_raw_na, missing_threshold)

In [12]:
# Drop columns that hold a single constant value in each panel.
lipid_raw_na <- lipid_raw_na %>% remove_constant()
facid_raw_na <- facid_raw_na %>% remove_constant()
metab_raw_na <- metab_raw_na %>% remove_constant()

In [13]:
# Replace the raw headers of each panel with cleaned names so that columns
# can be referred to by plain identifiers in the following cells.
lipid_raw_na <- stats::setNames(lipid_raw_na, make_clean_names(names(lipid_raw_na)))
facid_raw_na <- stats::setNames(facid_raw_na, make_clean_names(names(facid_raw_na)))
metab_raw_na <- stats::setNames(metab_raw_na, make_clean_names(names(metab_raw_na)))

In [14]:
# Strip the sample-description columns from each panel so that only the sample
# key and the measured features remain (the descriptions live in `metadata`).
#
# select() is qualified with dplyr:: in the same way this notebook already
# qualifies dplyr::rename(), so the call does not depend on the order in which
# packages were attached (other attached packages may export a `select` too).
#
# Note: in the lipid table the existing `sample_id` column is dropped here; the
# `label` column is the one kept as the sample key.
lipid_raw_na <- lipid_raw_na %>%
  dplyr::select(-c(sample_id, sample_description, group_123, sex, age))
facid_raw_na <- facid_raw_na %>%
  dplyr::select(-c(sample_code, sample_description, group_123, sex, age))
metab_raw_na <- metab_raw_na %>%
  dplyr::select(-c(group_123, sex, age))

In [15]:
# Give every panel the same sample key column name, `sample_id`.
# The lipid table carries its key in `label`; the other two panels carry it in
# `sample_identification`.
lipid_raw_na <- dplyr::rename(lipid_raw_na, "sample_id" = "label")
facid_raw_na <- dplyr::rename(facid_raw_na, "sample_id" = "sample_identification")
metab_raw_na <- dplyr::rename(metab_raw_na, "sample_id" = "sample_identification")

## Imputation, normalization, scaling and outliers

In [52]:
# Build one SummarizedExperiment per panel, all sharing the same `metadata`.
#
# `metadata` was extracted from the fatty-acid table only, yet it is paired
# with the lipid and metabolite tables as well. Feature rows are presumably
# matched to metadata rows by position (confirm against the POMA docs), so a
# different sample order in any panel would silently attach measurements to
# the wrong sample. The helper therefore warns when the IDs do not line up.
#
# The sample key is also removed by name rather than by assuming it is the
# first column.
build_panel_se <- function(features, target = metadata) {
  ids_match <- identical(
    as.character(features[["sample_id"]]),
    as.character(target[["sample_id"]])
  )
  if (!ids_match) {
    warning(
      "sample_id values of the feature table do not match metadata$sample_id ",
      "row by row; measurements may be paired with the wrong sample",
      call. = FALSE
    )
  }
  feature_columns <- setdiff(names(features), "sample_id")
  PomaSummarizedExperiment(
    target = target,
    features = features[, feature_columns, drop = FALSE]
  )
}

lipid_se <- build_panel_se(lipid_raw_na)
facid_se <- build_panel_se(facid_raw_na)
metab_se <- build_panel_se(metab_raw_na)


In [53]:
# Step 1 — per panel: knn imputation (zeros treated as NA, cutoff = 20),
# "log_pareto" normalisation, then outlier handling with coef = 3.
# NOTE(review): the recorded run of this cell emitted three "NaNs produced"
# warnings — presumably from taking logs of non-positive values; confirm.
lipid_se <- lipid_se %>%
  PomaImpute(ZerosAsNA = TRUE, cutoff = 20, method = "knn") %>%
  PomaNorm(method = "log_pareto") %>%
  PomaOutliers(coef = 3)
facid_se <- facid_se %>%
  PomaImpute(ZerosAsNA = TRUE, cutoff = 20, method = "knn") %>%
  PomaNorm(method = "log_pareto") %>%
  PomaOutliers(coef = 3)
metab_se <- metab_se %>%
  PomaImpute(ZerosAsNA = TRUE, cutoff = 20, method = "knn") %>%
  PomaNorm(method = "log_pareto") %>%
  PomaOutliers(coef = 3)

# Step 2 — pull the assay matrix out of each object, transpose it, and store
# the row names in a `sample_id` column. From here on the *_se names hold
# plain data frames, no longer SummarizedExperiment objects.
lipid_se <- lipid_se %>%
  assay() %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("sample_id")
facid_se <- facid_se %>%
  assay() %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("sample_id")
metab_se <- metab_se %>%
  assay() %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("sample_id")

“NaNs produced”
“NaNs produced”
“NaNs produced”


In [54]:
# Standardise each panel. The table is turned into a matrix keyed by
# sample_id, transposed, passed through scale() (which centres and scales
# column-wise), and transposed back — so each row of the original table
# (one per sample_id) is centred and scaled across its features.
# NOTE(review): confirm that per-sample rather than per-feature scaling is
# what is intended here.
lipid_se <- lipid_se %>%
  tibble::column_to_rownames("sample_id") %>%
  as.matrix() %>%
  t() %>%
  scale() %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("sample_id")

facid_se <- facid_se %>%
  tibble::column_to_rownames("sample_id") %>%
  as.matrix() %>%
  t() %>%
  scale() %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("sample_id")

metab_se <- metab_se %>%
  tibble::column_to_rownames("sample_id") %>%
  as.matrix() %>%
  t() %>%
  scale() %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column("sample_id")

## Integration

In [55]:
# Join the three processed panels into one wide table keyed by sample_id.
# The fold merges lipid with fatty acids first, then the result with the
# metabolites; merge() keeps only samples present in every panel.
metabolomics_merged <- Reduce(
  function(left, right) merge(left, right, by = "sample_id"),
  list(lipid_se, facid_se, metab_se)
)

In [37]:
# Persist the integrated feature table. row.names is left at its default, so
# the file also contains a leading row-name column.
write.csv(
  x = metabolomics_merged,
  file = "outputs/01_metabolomics_merged.csv"
)

In [38]:
# Persist the sample metadata next to the merged feature table. row.names is
# left at its default, so the file also contains a leading row-name column.
write.csv(
  x = metadata,
  file = "outputs/01_metadata.csv"
)