## Manipulation functions

In [1]:
library(mongolite) 
library(jsonlite)
library(data.table)
library(dplyr) 
library(tidyr)
library(readr)


# PART 1. Read data from MongoDB
#-------------------------------------------------------------------------------
# Fetch the documents whose individualCode is one of the given study codes.
#   db  : object exposing a find(query) method (a mongolite collection here)
#   vec : vector of study codes; serialised to a JSON array with toJSON()
# Returns whatever db$find() yields for the "$in" query.
search_individuals <- function(db, vec) {
   codes_json <- toJSON(vec)
   in_query <- paste0('{"individualCode": {"$in": ', codes_json, '}}')
   matches <- db$find(in_query)
   matches
}


# Complement of search_individuals(): fetch the documents whose
# individualCode is NOT among the given study codes ("$nin" query).
#   db  : object exposing a find(query) method
#   vec : vector of study codes to exclude; serialised with toJSON()
# Returns whatever db$find() yields for that query.
search_individuals_nin <- function(db, vec) {
   excluded_json <- toJSON(vec)
   nin_query <- paste0('{"individualCode": {"$nin": ', excluded_json, '}}')
   remaining <- db$find(nin_query)
   remaining
}


# Run `query` against `db` and hand back the result.
#   db    : object exposing a find(query) method
#   query : JSON query string; the default matches individualCode values
#           starting with "HBD" or "APOC", case-insensitively
search_control <- function(db, 
                           query='{"individualCode": {"$regex" : "^HBD|^APOC", "$options" : "i"}}'){
   control_rows <- db$find(query)
   control_rows
}


# PART 2: Dataframe manipulation
#-------------------------------------------------------------------------------
# Flatten the nested Samples -> markers structure of one study code into a
# single table of markers.
#   df_one_study_code : rows belonging to ONE study code (individualCode).
#                       Assumes `Samples` is a list column of tables that each
#                       carry a `markers` list column -- TODO confirm against
#                       the collection schema.
# Returns the input unchanged when it has no rows; otherwise a data.table of
# all markers with an added `studyCode` column and without the "confidence"
# and "interpretation" columns.
# Note: study_code and individualCode mean the same thing.
transform_df <- function(df_one_study_code){
   if (nrow(df_one_study_code) == 0){ # Empty dataframe
      return(df_one_study_code)
   }
   
   study_code <- unique(df_one_study_code$individualCode)[1]
   
   # Stack all samples, then stack the markers of every sample
   df_Samples <- rbindlist(df_one_study_code$Samples, fill=TRUE)
   df_markers <- rbindlist(df_Samples$markers, fill=TRUE)
   df_markers$studyCode <- study_code
   
   # Drop "confidence" and "interpretation" only if they are present, so
   # records that never carried those fields do not raise an error.
   keep_cols <- setdiff(names(df_markers), c("confidence", "interpretation"))
   df_markers <- df_markers[, keep_cols, with = FALSE]
   return (df_markers)
}


# Keep the rows of `df` whose individualCode equals `study_code` and pass
# them through transform_df().
#   study_code : a single study code
#   df         : data frame with an individualCode column
# An input with no rows is returned as is.
filter_transform_df <- function(study_code, df){
   if (nrow(df) == 0) {
      return(df)
   }
   rows_for_code <- filter(df, individualCode == study_code)
   transform_df(rows_for_code)
}


# Transform every study code with filter_transform_df(), stack the results
# and spread them wide: one column per studyCode, filled from `value`.
# No aggregation is applied before pivoting.
concat_pivot_df <- function(study_codes, df){
   per_code <- lapply(study_codes, filter_transform_df, df)
   stacked <- bind_rows(per_code)
   pivot_wider(stacked, names_from = studyCode, values_from = value)
}


# Transform every study code with filter_transform_df(), stack the results
# and take the mean of `value` within each (studyCode, name) group.
# Returns the long (un-pivoted) summary.
concat_agg_mean_df <- function(study_codes, df){
   long_markers <- bind_rows(lapply(study_codes, filter_transform_df, df))
   by_code_and_name <- group_by_at(long_markers, vars(one_of(c( "studyCode", "name"))))
   summarize(by_code_and_name, value = mean(value))
}


# NOTE: concat_pivot_df() is already defined earlier in this cell with the
# same behaviour; this later definition is the one that stays bound.
# Stacks the per-study-code tables and pivots them wide (one column per
# studyCode, values taken from `value`), without aggregating first.
concat_pivot_df <- function(study_codes, df){
   stacked <- bind_rows(lapply(study_codes, filter_transform_df, df))
   wide <- pivot_wider(stacked, names_from = studyCode, values_from = value)
   wide
}


# Stack the per-study-code tables, average `value` per (studyCode, name),
# then pivot wide with one column per studyCode.
concat_agg_mean_pivot_df_studyCode <- function(study_codes, df){
   long_markers <- bind_rows(lapply(study_codes, filter_transform_df, df))
   grouped <- group_by_at(long_markers, vars(one_of(c( "studyCode", "name"))))
   means <- summarize(grouped, value = mean(value))
   pivot_wider(means, names_from = studyCode, values_from = value)
}


# Stack the per-study-code tables, average `value` per (studyCode, name),
# then pivot wide with one column per marker name.
concat_agg_mean_pivot_df_name <- function(study_codes, df){
   long_markers <- bind_rows(lapply(study_codes, filter_transform_df, df))
   grouped <- group_by_at(long_markers, vars(one_of(c( "studyCode", "name"))))
   means <- summarize(grouped, value = mean(value))
   pivot_wider(means, names_from = name, values_from = value)
}


# PART 3: Percentile calculation and normalization
#-------------------------------------------------------------------------------
# Number of non-NA entries in x
len <- function(x) {
   sum(!is.na(x))
}


# JCSMR percentile: average rank scaled by 1 / (number of non-NA values + 1).
# NA entries stay NA in the result.
percentile_JCSMR <- function(vec){
   scale <- 1. / (len(vec) + 1)
   avg_ranks <- rank(vec, na.last = 'keep', ties.method = "average")
   avg_ranks * scale
}

# Common percentile: average rank scaled by 1 / (number of non-NA values).
# NA entries stay NA in the result.
percentile <- function(vec){
   scale <- 1. / len(vec)
   avg_ranks <- rank(vec, na.last = 'keep', ties.method = "average")
   avg_ranks * scale
}


# Percentile of `value` with respect to the reference vector `vec`, taken
# from the empirical CDF of `vec`.
#   method == 'JCSMR' : one extra point, max(vec) + 1e-6, is appended to the
#                       reference before the CDF is built
#   any other method  : the CDF of `vec` is used as is
percentile_norm <- function(value, vec, method='JCSMR'){
   reference <- vec
   if (method == 'JCSMR') {
      top <- max(reference, na.rm = TRUE)
      reference <- c(reference, top + 1e-6)
   }
   cdf <- ecdf(reference)
   cdf(value)
}


# Replace column `col_name` of df_data with its percentile relative to the
# same column of df_control (via percentile_norm()).
#   col_name   : name of the column to normalise
#   df_control : reference data frame
#   df_data    : data frame whose column is replaced
# Returns the modified df_data.
norm_perc_one_col <- function(col_name, df_control, df_data){
   # `[[` always extracts the column as a vector; `df[, col]` would return a
   # one-column table for tibbles. percentile_norm() evaluates an ecdf, which
   # is vectorised over `value`, so one call handles the whole column.
   df_data[[col_name]] <- percentile_norm(df_data[[col_name]], df_control[[col_name]])
   return (df_data)
}


# Normalise the columns of df_data to percentiles.
#   df_control      : reference (control) data frame
#   df_data         : data frame to normalise
#   col_not_cal_per : label column that must be left untouched
# Columns present in both frames whose control column has at least 2 non-NA
# values are expressed as percentiles relative to the control column
# (percentile_norm()). All other columns of df_data -- those missing from
# df_control, or whose control column has fewer than 2 non-NA values -- are
# ranked against themselves (percentile()).
# Returns list(df = <normalised df_data>,
#              cols_cal_self_percentile = <names of self-ranked columns>).
normalize_percentile <- function(df_control, df_data, col_not_cal_per="marker"){
   
   # Columns that exist in both df_control and df_data, minus the label column
   common_cols <- intersect(names(df_control), names(df_data))
   common_cols <- common_cols[common_cols != col_not_cal_per]
    
   # A control column with fewer than 2 non-NA values cannot serve as a
   # reference for percentile_norm()
   n_control <- vapply(common_cols, function(col) len(df_control[[col]]), numeric(1))
   cols_len_smaller_2 <- common_cols[n_control < 2]
   
   # Normalise against the control data. The ecdf inside percentile_norm() is
   # vectorised, so each column needs a single call.
   selected_cols <- setdiff(common_cols, cols_len_smaller_2)
   for (col_name in selected_cols){
      df_data[[col_name]] <- percentile_norm(df_data[[col_name]], df_control[[col_name]])
   }
   
   # Columns that exist in df_data only, or whose control column is too sparse
   # -> percentile computed within df_data itself. The label column is
   # excluded even when df_control does not contain it.
   cols_in_data_only <- setdiff(names(df_data), c(names(df_control), col_not_cal_per))
   cols_cal_self_percentile <- c(cols_in_data_only, cols_len_smaller_2)    
   for (col_name in cols_cal_self_percentile){
      # `[[` yields a plain vector for data frames and tibbles alike
      df_data[[col_name]] <- percentile(df_data[[col_name]])
   }
   
   return (list(df=df_data, cols_cal_self_percentile=cols_cal_self_percentile))  
}

"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:data.table':

    between, first, last

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'readr' was built under R version 3.6.3"

### Connect to MongoDB

In [2]:
db <- mongo(collection = "markers", 
            db = "facs",
            url = "mongodb://localhost:27017"
            )
db
# mongo<-mongolite::mongo(collection = "Sample", db = "Test", url = 
#                           "mongodb://User:123@Wyyuyu:13333/ty2_U",verbose = TRUE)

Registered S3 method overwritten by 'openssl':
  method      from
  print.bytes Rcpp


<Mongo collection> 'markers' 
 $aggregate(pipeline = "{}", options = "{\"allowDiskUse\":true}", handler = NULL, pagesize = 1000, iterate = FALSE) 
 $count(query = "{}") 
 $disconnect(gc = TRUE) 
 $distinct(key, query = "{}") 
 $drop() 
 $export(con = stdout(), bson = FALSE, query = "{}", fields = "{}", sort = "{\"_id\":1}") 
 $find(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0, handler = NULL, pagesize = 1000) 
 $import(con, bson = FALSE) 
 $index(add = NULL, remove = NULL) 
 $info() 
 $insert(data, pagesize = 1000, stop_on_error = TRUE, ...) 
 $iterate(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0) 
 $mapreduce(map, reduce, query = "{}", sort = "{}", limit = 0, out = NULL, scope = NULL) 
 $remove(query, just_one = FALSE) 
 $rename(name, db = NULL) 
 $replace(query, update = "{}", upsert = FALSE) 
 $run(command = "{\"ping\": 1}", simplify = TRUE) 
 $update(query, update = "{\"$set\":{}}", filters = NULL, upsert = FALSE, multiple = FALSE

In [3]:
db$count('{}')

## df control

In [4]:
df_control <- search_control(db)
dim(df_control)

In [5]:
study_codes_control <- unique(df_control$individualCode) # Get study code from df getting from DB, 
study_codes_control
length(study_codes_control)
class(study_codes_control)

In [6]:
df_one_study_code_HBD040 <- df_control %>% filter(individualCode == "HBD040")
df_one_study_code_HBD040 <- transform_df(df_one_study_code_HBD040)
dim(df_one_study_code_HBD040)
head(df_one_study_code_HBD040, 2)
tail(df_one_study_code_HBD040, 2)
unique(df_one_study_code_HBD040$studyCode)

name,value,studyCode
B cells (%Lymphocytes/live),11.1,HBD040
Anergic B (%B),12.8,HBD040


name,value,studyCode
TEM (% CD8),22.3,HBD040
TEMRA (% CD8),42.4,HBD040


In [7]:
df_control <- concat_agg_mean_pivot_df_name(study_codes_control, df_control)
dim(df_control)
head(df_control, 2)
tail(df_control, 2)

`summarise()` has grouped output by 'studyCode'. You can override using the `.groups` argument.


studyCode,Activated CD4+ T cells(% CD4),Activated CD8+ T cells (% CD8),Anergic B (%B),B cells (%Lymphocytes/live),Bm (%B),Bm (%Lymphocytes/live),CD16+ mDCs (%APC),CD16neg mDCs (%APC),CD3 T cells (% Lymphocytes/live),...,Single Cells/Single Cells/live/Lymphocytes/CD19+/Bm/MZ | Freq․ of Lymphocytes,Single Cells/Single Cells/live/Lymphocytes/CD19+/Bm/smB | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Bm/smB | Freq․ of Lymphocytes,Single Cells/Single Cells/live/Lymphocytes/CD19+/Bm/smB/IgA+ smB | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Bm/smB/IgG+ smB | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/PBs | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Trans-a | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Trans-b | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Trans-c | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Trans-d | Freq․ of CD19+
HBD001,0.8175,0.5425,7.035,13.725,5.79,0.78,8.3975,6.9425,62.875,...,,,,,,,,,,
HBD003,,,,,,,,,,...,,,,,,,,,,


studyCode,Activated CD4+ T cells(% CD4),Activated CD8+ T cells (% CD8),Anergic B (%B),B cells (%Lymphocytes/live),Bm (%B),Bm (%Lymphocytes/live),CD16+ mDCs (%APC),CD16neg mDCs (%APC),CD3 T cells (% Lymphocytes/live),...,Single Cells/Single Cells/live/Lymphocytes/CD19+/Bm/MZ | Freq․ of Lymphocytes,Single Cells/Single Cells/live/Lymphocytes/CD19+/Bm/smB | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Bm/smB | Freq․ of Lymphocytes,Single Cells/Single Cells/live/Lymphocytes/CD19+/Bm/smB/IgA+ smB | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Bm/smB/IgG+ smB | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/PBs | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Trans-a | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Trans-b | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Trans-c | Freq․ of CD19+,Single Cells/Single Cells/live/Lymphocytes/CD19+/Trans-d | Freq․ of CD19+
HBD177,,,,,,,,,,...,,,,,,,,,,
HBD63,,,,,,,,,,...,0.23,15.2,1.3,6.34,7.05,2.5,4.81,4.16,4.28,6.81


In [8]:
# length(names(df_control))
# names(df_control)
# vec_control_test <- df_control[["Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freq․ of LC"]]
# Only HBD063 has value =0.9 at "Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freq․ of LC"
# others are NA


vec_control_test <- df_control[["Bm (%B)"]]
vec_control_test

In [9]:
write_csv(df_control, "df_control_R.csv")

## df_data

In [10]:
df_data <- search_individuals_nin(db, study_codes_control)
dim(df_data)
# head(df_data, 2)


In [11]:
study_codes_data <- unique(df_data$individualCode)  # 'GEM177',..., 'CPI018',...
study_codes_data <- study_codes_data[! study_codes_data %in% c('AMCS20001A', 'AMCS20006A', 'AMCS21027A', 'AMCS20002A')]
                # study_codes = c("CPI515", "CPI464", "APO180", "GEM177", "NotExisted")
study_codes_data
length(study_codes_data)

In [12]:
df_data <- concat_agg_mean_pivot_df_name(study_codes_data, df_data)
dim(df_data)
head(df_data, 3)

`summarise()` has grouped output by 'studyCode'. You can override using the `.groups` argument.


studyCode,Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freq․ of LC,Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 2 | Freq․ of LC,Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/Nk 3 | Freq․ of LC,Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 4 | Freq․ of LC,Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/Classical monocytes | Freq․ of APC,Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/mDC | Freq․ of APC,Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/mDC/CD16- mDC | Freq․ of APC,Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/mDC/CD16+ mDC | Freq․ of APC,Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/pDC | Freq․ of APC,...,Th1-17 (CD4),Th1 (%CD4),Th17 (%CD4),Th2 (%CD4),Th2 (CD4),Trans-a (%B),Trans-b (%B),Trans-c (%B),Trans-d (%B),Treg (%CD4)
APO014,0.33,13.8,0.69,5.83,23.9,18.6,15.8,2.77,4.42,...,,,,,,,,,,
APO042,0.97,13.4,0.95,0.5,53.2,26.3,6.66,19.6,2.73,...,,,,,,,,,,
APO180,0.41,4.3,0.76,0.37,60.5,22.9,6.23,16.6,2.97,...,4.345556,6.8995,4.175,10.42818,8.813333,5.827778,5.235556,4.334444,6.248889,4.9415


In [13]:
# At "Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freq․ of LC"
# has many values 0.33, 0.97, 0.41, 0.31, 2.15, 1.21, 0.64, 0.7, 1.09, 0.93, 0.85, 1.82, 0.93, 6.35, 0.68

In [14]:
write_csv(df_data, "df_data_R.csv")

In [15]:
data <- normalize_percentile(df_control, df_data, col_not_cal_per="studyCode")

In [16]:
head(data$df,3)
dim(data$df)
class(data$df)

studyCode,Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freq․ of LC,Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 2 | Freq․ of LC,Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/Nk 3 | Freq․ of LC,Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 4 | Freq․ of LC,Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/Classical monocytes | Freq․ of APC,Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/mDC | Freq․ of APC,Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/mDC/CD16- mDC | Freq․ of APC,Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/mDC/CD16+ mDC | Freq․ of APC,Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/pDC | Freq․ of APC,...,Th1-17 (CD4),Th1 (%CD4),Th17 (%CD4),Th2 (%CD4),Th2 (CD4),Trans-a (%B),Trans-b (%B),Trans-c (%B),Trans-d (%B),Treg (%CD4)
APO014,0.1333333,0.8,0.2666667,1.0,0.4666667,0.3333333,0.9333333,0.1333333,0.6666667,...,,,,,,,,,,
APO042,0.6666667,0.7333333,0.6,0.5666667,0.8,0.6,0.5333333,0.6,0.3333333,...,,,,,,,,,,
APO180,0.2,0.2,0.4333333,0.3333333,0.8666667,0.4666667,0.4666667,0.5333333,0.4666667,...,0.2142857,0.1875,0.2708333,0.6315789,0.5,0.5789474,0.6315789,0.5789474,0.8421053,0.2352941


In [17]:
# rownames(data$df)
# colnames(data$df)

In [18]:
df_final <- transpose(data$df, keep.names="marker", make.names="studyCode", fill=NA, ignore.empty=FALSE)
dim(df_final)
# rownames(data$df)
# colnames(data$df)
head(df_final, 2)
write_csv(df_final, "df_final_R.csv")

marker,APO014,APO042,APO180,APO189,APO249,APO279,APO342,APO360,APO511,...,TC271,TC272,TCH047,TCH048,TCH271,TCH272,WH008,WH025,WH043,WH044
Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freq· of LC,0.1333333,0.6666667,0.2,,,,,0.06666667,,...,,,,,,,,,,
Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 2 | Freq· of LC,0.8,0.7333333,0.2,,,,,0.13333333,,...,,,,,,,,,,


In [21]:
# data$cols_cal_self_percentile

### Search control group

In [None]:
# Search the control group.
#   db    : MongoDB collection object exposing find(query)
#   query : JSON query string,
#           e.g. '{"individualCode": {"$regex" : "^HBD|^APOC", "$options" : "i"}}'
# Returns the data frame produced by db$find(query).
search_control <- function(db, query='{"individualCode": {"$regex" : "^HBD|^APOC", "$options" : "i"}}'){
    control_rows <- db$find(query)
    control_rows
}

df_control <- search_control(db)
dim(df_control)
unique(df_control$individualCode)
# head(df_control, 1)

### Search individuals

In [None]:
study_codes = c("CPI515", "CPI464", "APO180", "NotExisted")  # c("APO180", "CPI515", "NotExisted") 

In [None]:
# Search documents by individualCode.
#   db  : MongoDB collection object exposing find(query)
#   vec : vector of study codes, e.g. c("APO180", "CPI515", "NotExisted"),
#         which gives the query
#         '{"individualCode": {"$in": ["APO180","CPI515","NotExisted"]}}'
# Returns the data frame produced by the search.
search_individuals <- function(db, vec){
    # Serialise the codes to a JSON array and embed them in the "$in" query
    codes_json <- toJSON(vec)
    in_query <- paste0('{"individualCode": {"$in": ', codes_json, '}}')
    matches <- db$find(in_query)
    matches
}

df <- search_individuals(db, vec=study_codes)
dim(df)
class(df)
# head(df, 3)
# tail(df, 3)

### Transform df

In [None]:
# Flatten the nested Samples -> markers structure of one individualCode into
# a single table of markers.
#   df : rows belonging to ONE individualCode. Assumes `Samples` is a list
#        column of tables that each carry a `markers` list column -- TODO
#        confirm against the collection schema.
# Returns df unchanged when it has no rows; otherwise a data.table of all
# markers with an added `studyCode` column and without the "confidence" and
# "interpretation" columns.
transform_df <- function(df){
    if (nrow(df) == 0){ # Empty dataframe
        return(df)
    }
    
    study_code <- unique(df$individualCode)[1]
    
    # Stack all samples, then stack the markers of every sample
    df_Samples <- rbindlist(df$Samples, fill=TRUE)
    df_markers <- rbindlist(df_Samples$markers, fill=TRUE)
    df_markers$studyCode <- study_code
    
    # Drop "confidence" and "interpretation" only if they are present, so
    # records that never carried those fields do not raise an error.
    keep_cols <- setdiff(names(df_markers), c("confidence", "interpretation"))
    df_markers <- df_markers[, keep_cols, with = FALSE]
    return (df_markers)
}

df_markers_test <- transform_df(df)
# dim(df_markers_test)
head(df_markers_test, 2)
tail(df_markers_test, 2)
dim(df)

### Get all data after transforming df at each study code

In [None]:
# Filter `df` down to one study code and transform it.
#   study_code : a single study code, e.g. "CPI515"
#   df         : data frame obtained from search_individuals()
# Returns the rows with individualCode == study_code after transform_df();
# a data frame with no rows is returned untouched.
filter_trans_df <- function(study_code, df){
    if (nrow(df) == 0) {
        return(df)
    }
    rows_for_code <- filter(df, individualCode == study_code)
    transform_df(rows_for_code)
}

study_codes=c("CPI515", "CPI464", "NotExisted")
df_filter <- filter_trans_df(study_codes[1], df)
head(df_filter, 2)
tail(df_filter, 2)
# head(df_filter, 3)
# tail(df_filter, 3)
# dim(df_filter)
# dim(df)

#### Concat list of df

In [None]:
study_codes <- unique(df$individualCode) # Get study code from df getting from DB

# Apply filter_trans_df() to every study code and stack the resulting tables
# row-wise into one data frame.
concat_df <- function(study_codes, df){
    per_code <- lapply(study_codes, filter_trans_df, df)
    stacked <- bind_rows(per_code)
    stacked
}

df_concat <- concat_df(study_codes, df)
head(df_concat, 3)

#### Concat list of df and aggregate mean

In [None]:
study_codes <- unique(df$individualCode) # Get study code from df getting from DB

# Apply filter_trans_df() to every study code, stack the results and take
# the mean of `value` within each (studyCode, name) group.
concat_agg_mean_df <- function(study_codes, df){
    long_markers <- bind_rows(lapply(study_codes, filter_trans_df, df))
    by_code_and_name <- group_by_at(long_markers, vars(one_of(c( "studyCode", "name"))))
    summarize(by_code_and_name, value = mean(value))
}

# Mean marker value per (studyCode, name). The helper defined in this cell is
# concat_agg_mean_df(); no function named concat_agg_df() exists in the file.
df_agg <- concat_agg_mean_df(study_codes, df)
head(df_agg, 3)

#### Concat list of df and pivot (without aggregation before it)

In [None]:
# Get study code from df getting from DB
study_codes <- unique(df$individualCode)

# Apply filter_trans_df() to every study code, stack the results and pivot
# wide: one column per studyCode, filled from `value` (no aggregation first).
concat_pivot_df <- function(study_codes, df){
    stacked <- bind_rows(lapply(study_codes, filter_trans_df, df))
    pivot_wider(stacked, names_from = studyCode, values_from = value)
}

df_pivot <- concat_pivot_df(study_codes, df)
#df_pivot

#### Concat list of df, aggregation and pivot

In [None]:
study_codes <- unique(df$individualCode) # Get study code from df getting from DB

# Apply filter_trans_df() to every study code, stack the results, average
# `value` per (studyCode, name) and pivot wide with one column per studyCode.
concat_agg_pivot_df <- function(study_codes, df){
    long_markers <- bind_rows(lapply(study_codes, filter_trans_df, df))
    grouped <- group_by_at(long_markers, vars(one_of(c( "studyCode", "name"))))
    means <- summarize(grouped, value = mean(value))
    pivot_wider(means, names_from = studyCode, values_from = value)
}
df_pivot <- concat_agg_pivot_df(study_codes, df)
df_pivot

In [None]:
library(dplyr)

df <- cbind(expand.grid(sciName=list("A", "B", "C"), family=list("X", "Y"), stage=list("S1", "S2", "S3", "S4")), count=1)
condition1 <- list(sciName="A", stageVector=c("S2", "S3"))
condition2 <- list(sciName="C", stageVector=c("S3", "S4"))
conditionList <- list(condition1, condition2)
# Keep the rows of `df` whose sciName equals condition$sciName and whose
# stage is one of condition$stageVector.
#   condition : list with elements `sciName` and `stageVector`
#   df        : data frame with `sciName` and `stage` columns
filterStages <- function(condition, df) {
    by_name <- dplyr::filter(df, sciName == condition$sciName)
    by_stage <- dplyr::filter(by_name, stage %in% condition$stageVector)
    return(by_stage)
}
# filterStages(condition1, df)
# filterStages(condition2, df)
resultDataList <- lapply(conditionList, filterStages, df)
resultDataList

#### Save to csv for testing

In [None]:
write_csv(df_final, "df_final.csv")

In [None]:
class(df_final[1])

In [None]:
df_final %>%
    pivot_wider(names_from = studyCode, values_from = value)

## Code to keep just for references

### Create list of list: NOT used now

In [None]:
vec <- c(names(df_control))  # 'HBD001',....''HBD63',...
vec <- vec[! vec %in% c("name")] # remove "name"
# Build a list of conditions, one per study code.
#   study_codes : a vector of study codes,
#                 e.g. c("CPI515", "CPI464", "NotExisted")
# Returns a list of lists; element i is list(individualCode = study_codes[i]).
# It is equivalent to building the conditions by hand:
#   condition1 <- list(individualCode="CPI515")
#   condition2 <- list(individualCode="CPI464")
#   conditionList <- list(condition1, condition2)
# which prints as
#   [[1]]
#   [[1]]$individualCode
#   [1] "CPI515"
#
#   [[2]]
#   [[2]]$individualCode
#   [1] "CPI464"
# An empty input gives an empty list.
create_list_individualCode <- function(study_codes){
    # append(x, list(individualCode = code)) would splice the element into one
    # flat named list; wrapping each code in its own list keeps the nesting.
    # unname() keeps names of the input vector from leaking into the result.
    conditionList <- lapply(unname(study_codes), function(study_code) {
        list(individualCode = study_code)
    })
    return (conditionList)
}

# Test
conditionList <- create_list_individualCode(study_codes=c("CPI515", "CPI464", "NotExisted"))
conditionList

In [None]:
# df

# 1. Create a query
# # https://www.r-bloggers.com/2016/10/difference-between-paste-and-paste0/
# # paste(): concatenate a series of strings
# # The difference between paste() and paste0() is that the argument sep by default is ” ” (paste) and “” (paste0).
# create_query <- function(vec){
#     query = paste0('{"individualCode": {"$in": ', vec, '}}')
#     return (query)
# }
# study_codes = c("HBD001", "GEM177", "APO180", "NotExistedInDB")
# study_codes
# study_codes <- toJSON(study_codes)
# study_codes
# query = paste0('{"individualCode": {"$in": ', study_codes, '}}')
# query
# query_1 = create_query(vec=study_codes)
# query_1

# 2. Find()
# individuals <- db$find(query)
# class(individuals)
# colnames(individuals)

# # Explicitly way
# df <- db$find('{"individualCode" : { "$in" : ["APO180", "CPI515", "NotExisted"] } }') # "APO180",
# # df <- db$find('{"individualCode" : "APO180"}') # CPI515
# colnames(df) 
# unique(df$individualCode)
# class(unique(df$individualCode))

---
# install.packages("mongolite")
# https://jeroen.github.io/mongolite/query-data.html#query-syntax
# install.packages("DBI")
# install.packages("RPostgres")
---
```{r}
library(mongolite) 
# library(DBI)
```
```{r}
# Connect to the database and the desired collection as root:
db <- mongo(collection = "markers", 
            db = "facs",
            url = "mongodb://localhost:27017"
            )
db$count('{}')
```

```{r}
ind_code <- db$find('{"individualCode" : "APO180"}')
print(ind_code)
```
<!-- 2. Mongolite basics -->
```{r}
# library(tidyverse)
library(knitr)     # help run code
library(markdown)  # create markdown files i.e. pdf
library(mongolite) # Create connection/Interface R<-> Mongodb
```
```{r}
# Create Connection: 'localhost'
mng_conn<-mongo(collection = 'vidPrac',db='video_practice')
```
```{r}
# INSERT
fun_dta <- c('{"first_name":"Mr Bilbo","last_name": "Baggins","hobbies":["find rings",
"adventure","magic"]}','{"first_name":"Golumn","hobbies":["steal rings","bite people",
"talk to myself"]}')

mng_conn$insert(fun_dta)
```
```{r}
# FIND
mng_conn$find('{}')
```
```{r}
# UPDATE
mng_conn$update('{"first_name":"Golumn"}', 
                '{"$set":{"last_name": "The Lonely"}}',upsert=TRUE)
```
```{r}
# FIND
mng_conn$find('{}')
```
```{r}
# Add Element to specific document array
# Cut and paste your unique OID Number!
mng_conn$find('{}',fields='{"_id":1}')
```
```{r}
mng_conn$aggregate('[{"$match":{"_id":{"$oid":"60f96426ab5100001f0067d3"}}},
{"$addFields":{"hobbies":{"$concatArrays":["$hobbies",["Eat Fish"]]}}}]')

mng_conn$find('{}')
```
```{r}
# Add Element to List
mng_conn$update('{"_id":{"$oid":"60f96426ab5100001f0067d3"}}',
'{"$push":{"hobbies":"Eat Fish"}}',upsert=TRUE)
```
```{r}
mng_conn$find('{}')
```

```{r}
# Sorting: ascending by first_name, then by last_name
mng_conn$find(sort='{"first_name":1,"last_name":1}')
```
```{r}
# Select by ID
mng_conn$find('{"_id":{"$oid":"60f96426ab5100001f0067d3"}}')
```
```{r}
# Download files
# create directory, within our CWD to store output file
dir.create('output_files_practice')

# send file to our directory:
mng_conn$export(file("output_files_practice/hobbits.json"))
```
```{r}
# Delete
mng_conn$remove('{"_id":{"$oid":"60f96426ab5100001f0067d3"}}')

# Find
mng_conn$find('{}')
```
```{r}
#Drop All Records But NOT collection
mng_conn$remove('{}')

# Find
mng_conn$find('{}')
```

```{r}
# Drop Collection:
mng_conn$drop()
```
```{r}
mng_conn$find('{}')
```
```{r}
# Shell Commands:
# ------------------------
# 
# Show Databases: show dbs
# 
# Check What Database You're Currently In: db
# 
# Enter/Use a Database: use dbnamehere
# 
# If this database is not currently used, it will be created otherwise it will just connect
# Show Collections: show collections
# 
# If your Collection Name has a Weird Name like spaces or hyphens: db.getCollection(" your weird name").find()
# 
# Switch Database without Leaving Current Database: db.getSiblingDB('fromCurrentDB')
# 
# This is particularly useful when writing a script and you cannot access a database using the use db method.
# Exit MongoDB: quit()
# 
# Help: db.help
# 
# This will show all of the functions/methods available to you
```

<!-- 3. INSERT df to Mongodb -->
```{r}
library(knitr)     # help run code
library(markdown)  # create markdown files i.e. pdf
library(dplyr)
library(mongolite) # Create connection/Interface R<-> Mongodb
library(jsonlite)  # send files to Mongo
```
```{r}
db <- mongo(collection = "test",
                  db = "R_test",
                  url = "mongodb://localhost:27017")

```
```{r}
# Queries: From Mongo to R
# find all rows:
n<-db$find('{}')

# get a glimpse (idea)
dplyr::glimpse(n)
```
```{r}
head(n)
```
```{r}

#Count all rows
db$count()
```
```{r}
name = c("AA", "BB", "CC")
age = c(10, 20, 18)
df = data.frame(name, age)
df
```
```{r}
db$insert(df)
```
```{r}
# Get df from csv file
df1 = data.table::fread("restaurant.csv")
df1
```
```{r} 
# get column names
names(df1)   
```
```{r}
# remove space from column  names
names(df1) = gsub(" ", "", names(df1))