## I. Module (Functions)

In [1]:
library(mongolite) 
library(jsonlite)
library(data.table)
library(dplyr) 
library(tidyr)
library(readr)
library(stringi)

# Customization R functions
not_all_na <- function(x) any(!is.na(x))

parseInput <- function(string){
  # split new line
  str <- strsplit(string, '\\s')
  # remove empty string and NA
  str <- stri_remove_empty(str[[1]], TRUE)
  # remove , or . or 'NA'
  str <- str[! str %in% c(',', '.', 'NA')]
  # remove trailing ,
  str <- gsub(',$', "", str)
  # remove leading and trailing ' or " of each word in string
  str <- gsub("^\\'|\\'$", "", str)
  str <- gsub('^\\"|\\"$', "", str)

  return (str)
}

chunk <- function(x, n){split(x, sort(rep_len(1:n, length(x))))}

# parseInput(string="'CPI516'  , 
# 'CPI517' 'CPI519' NA")
# parseInput(string=' "CPI516"  , 
# "CPI517" "CPI519" NA')

"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:data.table':

    between, first, last

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'readr' was built under R version 3.6.3"

In [2]:

# PART 1. Read data from MongoDB
#-------------------------------------------------------------------------------
search_individuals <- function(db, vec){
   # Input
   #  db: MongoDB object
   #  vec: a vector of individualCode
   # Output:
   #  df
   # Usage:
   # df <- search_individuals(db, vec=c('CPI555', 'CPI515'))
   
   vec <- toJSON(vec)
   query = paste0('{"individualCode": {"$in": ', vec, '}}')
   df <- db$find(query)
   return (df)    
}


search_individuals_nin <- function(db, vec){
   # Input
   #  db: MongoDB object
   #  vec: a vector of individualCode
   # Output:
   #  df
   # Usage:
   # df <- search_individuals_nin(db, vec=c('CPI555', 'CPI515'))
   
   vec <- toJSON(vec)
   query = paste0('{"individualCode": {"$nin": ', vec, '}}')
   df <- db$find(query)
   return (df)    
}


search_control <- function(db, 
                           query='{"individualCode": {"$regex" : "^HBD|^APOC", "$options" : "i"}}'){
   df <- db$find(query)
   return (df)    
}


# PART 2: Dataframe manipulation
#-------------------------------------------------------------------------------
transform_df <- function(df_one_study_code){
   # Input 
   #  df_one_study_code: a dataframe of only one study_code
   # Output: 
   #  new dataframe after transforming df_one_study_code
   # Note: meaning of study_code and individualCode is the same.
   # Usage:
   #  df <- search_individuals(db, vec=c('CPI203', 'CPI248', 'CPI515'))
   #  df_one_study_code <- df %>%
   #                          filter(individualCode == 'CPI515')
   # df_markers <- transform_df(df_one_study_code)
 
   if (dim(df_one_study_code)[1] == 0){ # Empty dataframe
      return(df_one_study_code)
   }
   
   study_code <- unique(df_one_study_code$individualCode)[1]
   
   list_Samples <- df_one_study_code$Samples
   df_Samples <- rbindlist(list_Samples, fill=TRUE)
   list_markers <- df_Samples$markers
   df_markers <- rbindlist(list_markers, fill=TRUE)
   df_markers$studyCode <- study_code
   
   # remove "confidence" and "interpretation" columns
   df_markers <- subset(df_markers, select = -c(confidence, interpretation))
   return (df_markers)
}


filter_transform_df <- function(study_code, df){
   # Input
   #  study_code: a study code. Eg. study_code = 'CPI515'
   #  df: a dataframe
   # Output:
   #  new df
   # Usage:
   # df <- search_individuals(db, vec=c('CPI203', 'CPI248', 'CPI515'))
   # df_filter <- filter_transform_df(study_code = 'CPI515', df = df)
   
   if (dim(df)[1] == 0){ 
      return(df)
   }
   df_filter <- df %>%
      filter(individualCode == study_code) %>%
      transform_df()
   return (df_filter)
}


concat_pivot_df <- function(study_codes, df){
   # Input
   #  study_codes: a vector. Eg. study_codes <- c('CPI248', 'CPI515')
   #  df: a dataframe
   # Output:
   #  new df
   # Usage:
   #   df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))
   #   study_codes <- c('APO014', 'CPI248', 'CPI515')
   #   df_pivot <- concat_pivot_df(study_codes, df)
   
   list_dfs <- lapply(study_codes, filter_transform_df, df)
   df_pivot <- bind_rows(list_dfs) %>%
      pivot_wider(names_from = studyCode, values_from = value)    
   return (df_pivot)
}


concat_agg_mean_df <- function(study_codes, df){
   # Input
   #  study_codes: a vector. Eg. study_codes <- c('CPI248', 'CPI515')
   #  df: a dataframe
   # Output:
   #  new df
   # Usage:
   #   df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))
   #   study_codes <- c('APO014', 'CPI248', 'CPI515')
   #   df_agg <- concat_agg_mean_df(study_codes, df)
   
   list_dfs <- lapply(study_codes, filter_transform_df, df)
   df_agg <- bind_rows(list_dfs) %>%
      group_by_at(vars(one_of(c( "studyCode", "name")))) %>%
      summarize(value = mean(value))            
   return (df_agg)
}


concat_pivot_df <- function(study_codes, df){
   # Input
   #  study_codes: a vector. Eg. study_codes <- c('CPI248', 'CPI515')
   #  df: a dataframe
   # Output:
   #  new df
   # Usage:
   #   df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))
   #   study_codes <- c('APO014', 'CPI248', 'CPI515')
   #   df_pivot <- concat_pivot_df(study_codes, df)
   
   list_dfs <- lapply(study_codes, filter_transform_df, df)
   df_pivot <- bind_rows(list_dfs) %>%
      pivot_wider(names_from = studyCode, values_from = value)    
   return (df_pivot)
}


concat_agg_mean_pivot_df_studyCode <- function(study_codes, df){
   # Input
   #  study_codes: a vector. Eg. study_codes <- c('CPI248', 'CPI515')
   #  df: a dataframe
   # Output:
   #  new df
   # Usage:
   #   df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))
   #   study_codes <- c('APO014', 'CPI248', 'CPI515')
   #   df_pivot <- concat_agg_mean_pivot_df_studyCode(study_codes, df)
   
   list_dfs <- lapply(study_codes, filter_transform_df, df)
   df_pivot <- bind_rows(list_dfs) %>%
      group_by_at(vars(one_of(c( "studyCode", "name")))) %>%
      summarize(value = mean(value)) %>%  
      pivot_wider(names_from = studyCode, values_from = value)    
   return (df_pivot)
}


concat_agg_mean_pivot_df_name <- function(study_codes, df){
   
   list_dfs <- lapply(study_codes, filter_transform_df, df)
   df_pivot <- bind_rows(list_dfs) %>%
      group_by_at(vars(one_of(c( "studyCode", "name")))) %>%
      summarize(value = mean(value)) %>%  
      pivot_wider(names_from = name, values_from = value)    
   return (df_pivot)
}


# PART 3: Percentile calculation and normalization
#-------------------------------------------------------------------------------
# length of vector without NA
len <- function(x) { 
   # Input: 
   #  x: a vector
   # Output:
   #  length of x without NA
   # Usage:
   #  len(x = c(10, 20, NA, 99.99, NA))  -> Output: 3
   return (length(x[!is.na(x)])) 
   } 


# percentile by JCSMR
percentile_JCSMR <- function(vec){
   # Input: 
   #  vec: a vector
   # Output:
   #  JCSMR percentile of vec
   # Usage:
   #  vec = c(50.0, 80.0, 90.0, 119.0, 119.0, 120, NA)
   #  percentiles <- percentile_JCSMR(vec)
   
   percentiles <- rank(vec, na.last = 'keep', ties.method = c("average")) * (1. / (len(vec) + 1))
   return (percentiles)
}

# common percentile
percentile <- function(vec){
   # Input: 
   #  vec: a vector
   # Output:
   #  JCSMR percentile of vec
   # Usage:
   #  vec = c(50.0, 80.0, 90.0, 119.0, 119.0, 120, NA)
   #  percentiles <- percentile(vec)
   
   percentiles <- rank(vec, na.last = 'keep', ties.method = c("average")) * (1. / len(vec))
   return (percentiles)    
}


# Calculate percentile of a value w.r.t a vector
percentile_norm <- function(value, vec, method='JCSMR'){
   # Input: 
   #  value:  a value to calculate its percentile w.r.t vec
   #  vec: a vector of values
   #  method: 'JCSMR' or a normal way
   # Output:
   #  percentile of value
   # Usage:
   #  vec = c(50.0, 80.0, 90.0, 119.0, 119.0, 120, NA)
   #  per <- percentile_norm(39, vec, method='JCSMR') -> output=0
   #  per <- percentile_norm(90, vec, method='JCSMR') -> output=0.49
   #  per <- percentile_norm(120, vec, method='JCSMR') -> output=0.857
   #  per <- percentile_norm(121, vec, method='JCSMR') -> output=1
   
   # method == 'JCSMR' or a normal way
   if (method == 'JCSMR'){
      epsilon <- 1e-6
      maxVal <- max(vec, na.rm = TRUE)
      vec <- c(vec, maxVal + epsilon)
   }   
   return (ecdf(vec)(value))   
} 


# normalize percentile one column
norm_perc_one_col <- function(col_name, df_control, df_data){
   # Input: 
   #  col_name: A colunm name exists in both dataframes
   #  df_control: A dataframe of control
   #  df_data: A dataframe of treatment
   # Output:
   #  df_data in which col_name has been norm percentile
   # Usage:
   #  df_data <- norm_perc_one_col(col_name, df_control, df_data)
   
   df_data[[col_name]] <- unlist(lapply(df_data[[col_name]], percentile_norm, df_control[[col_name]]))
   return (df_data)
}

# normalize percentile of a df w.r.t other df
normalize_percentile <- function(df_control, df_data, col_not_cal_per="studyCode"){
   # Input: 
   #  df_control: A dataframe of control
   #  df_data: A dataframe of treatment
   #  col_not_cal_per: A colunm name to be ignore when calculating percentile
   # Output:
   #  data: a R list (a Python dict)
   #  Access to: 
   #     df_data: data$df
   #     columns (calculate percentile by itself): data$cols_cal_self_percentile
   # Usage:
   #  data <- normalize_percentile(df_control, df_data, col_not_cal_per)
   
   # columns existed both in df_control and df_data
   common_cols <- intersect(names(df_control), names(df_data))
   # Remove not related col 
   common_cols <- common_cols[common_cols != col_not_cal_per]
   
   # If df_control[col] has less than 2 real values, it cannot use for percentile_norm()
   cols_len_smaller_2 <- c()
   for (col in common_cols){
      if (len(df_control[[col]]) < 2){
         cols_len_smaller_2 <- c(cols_len_smaller_2, col)
      }
   }
   
   # Get columns and calculate normalized percentile (w.r.t control data) 
   selected_cols <- setdiff(common_cols, cols_len_smaller_2)
   for (col_name in selected_cols){
      df_data[[col_name]] <- unlist(lapply(df_data[[col_name]], percentile_norm, df_control[[col_name]]))
   }
   
   # columns exist in df_data ONLY or existed in both but it is cols_len_smaller_2 
   # -> Calculate percentile by itself
   cols_in_data_only <- setdiff(names(df_data), names(df_control))
   cols_cal_self_percentile <- c(cols_in_data_only, cols_len_smaller_2)    
   for (col_name in cols_cal_self_percentile){
      df_data[[col_name]] <- percentile_JCSMR(df_data[[col_name]])
   }
   
   return (list(df=df_data, cols_cal_self_percentile=cols_cal_self_percentile))  
}


# normalize percentile of a df by itself
self_normalize_percentile <- function(df_data, col_not_cal_per="studyCode"){
   # Input: 
   #  df_data: A dataframe of treatment
   #  col_not_cal_per: A colunm name to be ignore when calculating percentile
   # Output:
   #  data: a R list (a Python dict)
   #  Access to: 
   #     df_data: data$df
   #     columns (calculate percentile by itself): data$cols_cal_self_percentile
   # Usage:
   #  data <- normalize_percentile(df_control, df_data, col_not_cal_per)
   
   cols <- names(df_data)
   # Remove not related col 
   cols <- cols[cols != col_not_cal_per]
      
   for (col_name in cols){
      df_data[[col_name]] <- percentile_JCSMR(df_data[[col_name]])
   }
   
   return (list(df=df_data, cols_cal_self_percentile=cols))
}


get_heatmap_data <- function(db, df_control, df_data){
   # Input:
   #   db: MongoDB object
   #   df_control: dataframe of control group
   #   df_data: dataframe of treatment group
   # Ouput:
   #   data: a list with "df" and "cols_cal_self_percentile" keys
   # Usage example:
   # data <- get_heatmap_data(db, 
   #           df_control = search_individuals(db, vec = c('HBD056', 'HBD044', 'HBD039')),
   #           df_data = search_individuals(db, vec = c('CPI515', 'CPI545')))
   # df <- data$df
   # cols_cal_self_percentile <- data$cols_cal_self_percentile
   
   # Control data  
   study_codes_control <- unique(df_control$individualCode)
   df_control <- concat_agg_mean_pivot_df_name(study_codes_control, df_control)
   
   # Treatment data
   study_codes_data <- unique(df_data$individualCode)      
   df_data <- concat_agg_mean_pivot_df_name(study_codes_data, df_data)
   
   # Percentile
   data <- normalize_percentile(df_control, df_data, col_not_cal_per="studyCode")
   df <- transpose(data$df, keep.names="marker", make.names="studyCode", fill=NA, ignore.empty=FALSE)
   return (list(df = df, cols_cal_self_percentile = data$cols_cal_self_percentile))   
}


get_self_heatmap_data <- function(db, df_data){
   # Input:
   #   db: MongoDB object
   #   df_data: dataframe of treatment group
   # Ouput:
   #   data: a list with "df" and "cols_cal_self_percentile" keys
   # Usage example:
   #   data <- get_self_heatmap_data(db,                          
   #              df_data = search_individuals(db, vec = c('CPI515', 'CPI545')))
   #   df <- data$df
   #   cols_cal_self_percentile <- data$cols_cal_self_percentile
  
   study_codes_data <- unique(df_data$individualCode)      
   df_data <- concat_agg_mean_pivot_df_name(study_codes_data, df_data)
   
   # Percentile
   data <- self_normalize_percentile(df_data, col_not_cal_per="studyCode")
   df <- transpose(data$df, keep.names="marker", make.names="studyCode", fill=NA, ignore.empty=FALSE)
   return (list(df = df, cols_cal_self_percentile = data$cols_cal_self_percentile))   
}


get_heatmap_data_by_vecs <- function(db, vec_control, vec_data){
   # This function is very similar to get_heatmap_data() -> keep it for reference 
   # Input:
   #   db: MongoDB object
   #   vec_control: array of study codes in control group
   #   vec_data: array of study codes in data group
   # Ouput:
   #   data: a list with "df" and "cols_cal_self_percentile" keys
   # Usage example:
   # data <- get_heatmap_data(db, 
   #                          vec_control = c('HBD056', 'HBD044', 'HBD039', 'HBD040'), 
   #                          vec_data = c('CPI515', 'CPI545')) 
   # df <- data$df
   # cols_cal_self_percentile <- data$cols_cal_self_percentile
   
   # Control data  
   df_control <- search_individuals(db, vec_control)
   study_codes_control <- unique(df_control$individualCode)
   df_control <- concat_agg_mean_pivot_df_name(study_codes_control, df_control)
   
   # Real data
   df_data <- search_individuals(db, vec_data)
   study_codes_data <- unique(df_data$individualCode)      
   df_data <- concat_agg_mean_pivot_df_name(study_codes_data, df_data)
   
   # Percentile
   data <- normalize_percentile(df_control, df_data, col_not_cal_per="studyCode")
   df <- transpose(data$df, keep.names="marker", make.names="studyCode", fill=NA, ignore.empty=FALSE)
   return (list(df = df, cols_cal_self_percentile = data$cols_cal_self_percentile))   
}

### Connect to MongoDB

In [3]:
# Template: mongodb://[username:password@]host1[:port1][,host2[:port2],...[/[database][?options]]
# Eg. m <- mongo("mtcars", url = "mongodb://a_user_name:a_password@mongo.org:2021/test")
# Ref: https://jeroen.github.io/mongolite/connecting-to-mongodb.html
# Eg. Simple way
# db <- mongo(collection = "markers", 
#             db = "facs",
#             url = "mongodb://localhost:27017")

# Envirovment file (.env) example
# MONGODB_HOST="127.0.0.1"  
# MONGODB_PORT=27017
# MONGODB_USER="an username or an empty string"
# MONGODB_PASSWORD="a password or an empty string"
# MONGODB_DB_NAME='facs'
# MONGODB_COLLECTION_NAME='markers'

readRenviron(".env")  #  read Environment file

mongo_host <- Sys.getenv("MONGODB_HOST", "127.0.0.1")
mongo_port <- Sys.getenv("MONGODB_PORT", 27017)
mongo_db <- Sys.getenv("MONGODB_DB_NAME", "facs")
mongo_collection <- Sys.getenv("MONGODB_COLLECTION_NAME", "markers")
mongo_user <- Sys.getenv("MONGODB_USER", "")
mongo_password <- Sys.getenv("MONGODB_PASSWORD", "")

if (!stri_isempty(mongo_user) & !stri_isempty(mongo_password)){
    db <- mongo(url = paste("mongodb://", 
                            mongo_user, ":", mongo_password, "@", 
                            mongo_host, ":", toString(mongo_port), sep = ""),
                db = mongo_db,
                collection = mongo_collection)
} else {
    db <- mongo(url = paste("mongodb://", 
                            mongo_host, ":", toString(mongo_port), sep = ""),
                db = mongo_db,
                collection = mongo_collection)
}

mongo_user <- Sys.getenv("MONGODB_USER")
mongo_user
db

Registered S3 method overwritten by 'openssl':
  method      from
  print.bytes Rcpp


<Mongo collection> 'markers' 
 $aggregate(pipeline = "{}", options = "{\"allowDiskUse\":true}", handler = NULL, pagesize = 1000, iterate = FALSE) 
 $count(query = "{}") 
 $disconnect(gc = TRUE) 
 $distinct(key, query = "{}") 
 $drop() 
 $export(con = stdout(), bson = FALSE, query = "{}", fields = "{}", sort = "{\"_id\":1}") 
 $find(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0, handler = NULL, pagesize = 1000) 
 $import(con, bson = FALSE) 
 $index(add = NULL, remove = NULL) 
 $info() 
 $insert(data, pagesize = 1000, stop_on_error = TRUE, ...) 
 $iterate(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0) 
 $mapreduce(map, reduce, query = "{}", sort = "{}", limit = 0, out = NULL, scope = NULL) 
 $remove(query, just_one = FALSE) 
 $rename(name, db = NULL) 
 $replace(query, update = "{}", upsert = FALSE) 
 $run(command = "{\"ping\": 1}", simplify = TRUE) 
 $update(query, update = "{\"$set\":{}}", filters = NULL, upsert = FALSE, multiple = FALSE

In [4]:
db$count('{}')

### Test get all runID

In [5]:
df_all <- db$find("{}")
dim(df_all)
run_id_alls <- unique(df_all$runId)
run_id_alls
length(run_id_alls)

### Test: get no data in MongoDB

In [6]:
df_test_1 <- search_individuals(db, vec=c('TTTTTT'))
dim(df_test_1)
df_test_1

In [7]:
# a <- character(0)
df_test_2 <- search_individuals(db, vec=character(0))
dim(df_test_2)
# df_test_2

## Get all control in DB

In [8]:
df_control <- search_control(db)
dim(df_control)

In [9]:
study_codes_control <- unique(df_control$individualCode) # Get study code from df getting from DB, 
study_codes_control
length(study_codes_control)
class(study_codes_control)

In [10]:
df_one_study_code_HBD040 <- df_control %>% filter(individualCode == "HBD040")
df_one_study_code_HBD040 <- transform_df(df_one_study_code_HBD040)
dim(df_one_study_code_HBD040)
head(df_one_study_code_HBD040, 2)
tail(df_one_study_code_HBD040, 2)
unique(df_one_study_code_HBD040$studyCode)

name,value,studyCode
B cells (%Lymphocytes/live),11.1,HBD040
Anergic B (%B),12.8,HBD040


name,value,studyCode
TEM (% CD8),22.3,HBD040
TEMRA (% CD8),42.4,HBD040


In [11]:
# vec_control_test <- df_control[["Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freqâ€¤ of LC"]]
# Only HBD063 has value =0.9 at "Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freqâ€¤ of LC"
# others are NA
vec_control_test <- df_control[["Bm (%B)"]]
vec_control_test

NULL

In [12]:
write_csv(df_control, "df_control.csv")

ERROR: Error: Flat files can't store the list column `Samples`


## Get all treatment data in DB

In [None]:
df_data <- search_individuals_nin(db, study_codes_control)
dim(df_data)
# head(df_data, 2)

In [None]:
study_codes_data <- unique(df_data$individualCode)  # 'GEM177',..., 'CPI018',...
study_codes_data <- study_codes_data[! study_codes_data %in% c('AMCS20001A', 'AMCS20006A', 'AMCS21027A', 'AMCS20002A')]
                # study_codes = c("CPI515", "CPI464", "APO180", "GEM177", "NotExisted")
sort(study_codes_data)
length(study_codes_data)

In [None]:
# At "Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freqâ€¤ of LC"
# has many values 0.33, 0.97, 0.41, 0.31, 2.15, 1.21, 0.64, 0.7, 1.09, 0.93, 0.85, 1.82, 0.93, 6.35, 0.68

In [None]:
write_csv(df_data, "df_data.csv")

# PART II. Test each function

### search_individuals()

In [None]:
df <- search_individuals(db, vec=c('CPI555', 'CPI515'))
dim(df)

#### search_individuals_nin

In [None]:
df <- search_individuals_nin(db, vec=c('CPI555', 'CPI515'))
dim(df)

### search_control()

In [None]:
df_control <- search_control(db, query='{"individualCode": {"$regex" : "^HBD|^APOC", "$options" : "i"}}')
dim(df_control)

### transform_df()

In [None]:
df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))
df_one_study_code <- df %>% filter(individualCode == 'CPI515')

df_markers <- transform_df(df_one_study_code)
dim(df_markers)
head(df_markers, 2)
tail(df_markers, 2)

In [None]:
df_one_study_code <- df %>% filter(individualCode == 'APO014')

df_markers <- transform_df(df_one_study_code)
dim(df_markers)
head(df_markers, 2)
tail(df_markers, 2)

### filter_transform_df()

In [None]:
df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))

df_filter <- filter_transform_df(study_code = 'CPI515', df = df)
dim(df_filter)
head(df_filter, 2)
tail(df_filter, 2)

### concat_pivot_df

In [None]:
df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))
study_codes <- c('APO014', 'CPI248', 'CPI515')

df_pivot <- concat_pivot_df(study_codes, df)

dim(df_pivot)
head(df_pivot, 2)
tail(df_pivot, 2)

### concat_agg_mean_df

In [None]:
df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))
study_codes <- c('APO014', 'CPI248', 'CPI515')

df_agg <- concat_agg_mean_df(study_codes, df)

dim(df_agg)
head(df_agg, 2)
tail(df_agg, 2)

### concat_pivot_df()

In [None]:
df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))
study_codes <- c('APO014', 'CPI248', 'CPI515')

df_pivot <- concat_pivot_df(study_codes, df)

dim(df_pivot)
head(df_pivot, 2)
tail(df_pivot, 2)

### concat_agg_mean_pivot_df_studyCode()

In [None]:
df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))
study_codes <- c('APO014', 'CPI248', 'CPI515')

df_pivot <- concat_agg_mean_pivot_df_studyCode(study_codes, df)

dim(df_pivot)
head(df_pivot, 2)
tail(df_pivot, 2)

### concat_agg_mean_pivot_df_name()

In [None]:
df <- search_individuals(db, vec=c('APO014', 'CPI203', 'CPI248', 'CPI515'))
study_codes <- c('APO014', 'CPI248', 'CPI515')

df_pivot <- concat_agg_mean_pivot_df_name(study_codes, df)

dim(df_pivot)
head(df_pivot, 2)
tail(df_pivot, 2)

### len()

In [None]:
# len <- function(x) { return (length(x[!is.na(x)])) } 
len(c(10, 20, NA, 99.99, NA))

```
Eg. df = pd.DataFrame({'Values': [119, np.nan, 80, 50, 120, 90, 119]}).sort_values('Values')
        Values	Rank	Rank_Percentile_Pandas	Rank_Percentile_Manual	Rank_Percentile_JCSMR
    0	50.0	1.0	            0.166667	            0.166667	        0.142857
    1	80.0	2.0	            0.333333	            0.333333	        0.285714
    2	90.0	3.0	            0.500000	            0.500000	        0.428571
    3	119.0	4.5	            0.750000	            0.750000	        0.642857
    4	119.0	4.5	            0.750000	            0.750000	        0.642857
    5	120.0	6.0	            1.000000	            1.000000	        0.857143
    6	NaN	    NaN	            NaN	                    NaN	                NaN
```

### percentile_JCSMR()

In [None]:
vec = c(50.0, 80.0, 90.0, 119.0, 119.0, 120, NA)

percentiles <- percentile_JCSMR(vec)
percentiles

### percentile()

In [None]:
vec = c(50.0, 80.0, 90.0, 119.0, 119.0, 120, NA)

percentiles <- percentile(vec)
percentiles

### percentile_norm()

In [None]:
vec = c(50.0, 80.0, 90.0, 119.0, 119.0, 120, NA)

per <- percentile_norm(value = 39, vec = vec, method='JCSMR')
per

In [None]:
per <- percentile_norm(value = 90, vec = vec, method='JCSMR')
per

In [None]:
per <- percentile_norm(value = 120, vec = vec, method='JCSMR')
per

In [None]:
per <- percentile_norm(value = 121, vec = vec, method='JCSMR')
per

### norm_perc_one_col(): Normalize percentile at one column only

In [None]:
col_name <- "Treg (%CD4)"

# control
df_control <- search_individuals(db, vec = c('HBD056', 'HBD044', 'HBD039', 'HBD040'))
study_codes_control <- unique(df_control$individualCode)
df_control <- concat_agg_mean_pivot_df_name(study_codes_control, df_control)

# treatment
df_data <- search_individuals(db, vec = c('CPI515', 'CPI545'))
   study_codes_data <- unique(df_data$individualCode)      
   df_data <- concat_agg_mean_pivot_df_name(study_codes_data, df_data)

print('At control')
df_control[, col_name]
print('At treatment: Before calling norm_perc_one_col')
df_data[, col_name]  # output is a dataframe
df_data[[col_name]]  # output is a vector

In [None]:
df_data <- norm_perc_one_col(col_name, df_control, df_data)
print('At control')
df_control[, col_name]
print('At treatment: AFTER calling norm_perc_one_col')
df_data[, col_name]

### normalize_percentile(): Normalize percentile at all columns execept one

In [None]:
col_not_cal_per <- "studyCode"  # column not calculate percentile

# control
df_control <- search_individuals(db, vec = c('HBD056', 'HBD044', 'HBD039', 'HBD040'))
study_codes_control <- unique(df_control$individualCode)
df_control <- concat_agg_mean_pivot_df_name(study_codes_control, df_control)

# treatment
df_data <- search_individuals(db, vec = c('CPI515', 'CPI545'))
   study_codes_data <- unique(df_data$individualCode)      
   df_data <- concat_agg_mean_pivot_df_name(study_codes_data, df_data)

print('At control')
head(df_control, 3)
print('At treatment: Before calling normalize_percentile')
head(df_data, 3)

In [None]:
data <- normalize_percentile(df_control, df_data, col_not_cal_per)

print('At treatment: AFTER calling normalize_percentile')
head(data$df, 3)

print('List column names to be calculated percentile by itself')
data$cols_cal_self_percentile

### self_normalize_percentile()

In [None]:
col_not_cal_per <- "studyCode"  # column not calculate percentile

# treatment
df_data <- search_individuals(db, vec = c('CPI515', 'CPI545'))
study_codes_data <- unique(df_data$individualCode)      
df_data <- concat_agg_mean_pivot_df_name(study_codes_data, df_data)

print('At treatment: Before calling self_normalize_percentile()')
head(df_data, 3)

In [None]:
df_data <- self_normalize_percentile(df_data, col_not_cal_per="studyCode")
print('At treatment: AFTER calling self_normalize_percentile()')
head(df_data, 3)

### get_heatmap_data()

In [None]:
data <- get_heatmap_data(db, 
                         df_control = search_individuals(db, vec = c('HBD056', 'HBD044', 'HBD039', 'HBD040')), 
                         df_data = search_individuals(db, vec = c('CPI515', 'CPI545')))
print('After calling get_heatmap_data')
head(data$df, 3)
print('Columns')
data$cols_cal_self_percentile

### get_self_heatmap_data()

In [None]:
data <- get_self_heatmap_data(db,                          
                         df_data = search_individuals(db, vec = c('CPI515', 'CPI545')))
print('After calling get_heatmap_data')
head(data$df, 3)
print('Columns')
data$cols_cal_self_percentile