In [52]:
library(mongolite) 
library(jsonlite)
library(data.table)
library(dplyr) # to get filter()
# library(tidyverse)
library(tidyr)
library(readr)

# Fetch the control-group documents from a collection.
#   db:    collection handle exposing a find() method
#   query: JSON query string; the default is a case-insensitive regex on
#          individualCode matching values that start with "HBD" or "APOC"
# Returns the result of db$find() for that query.
search_control <- function(db,
                           query='{"individualCode": {"$regex" : "^HBD|^APOC", "$options" : "i"}}') {
  db$find(query)
}

# Fetch the documents whose individualCode is one of the given codes.
#   db:  collection handle exposing a find() method
#   vec: vector of study codes; serialised with toJSON() into the "$in" list
# Returns the result of db$find() for the generated query.
search_individuals <- function(db, vec) {
  codes_json <- toJSON(vec)
  db$find(paste0('{"individualCode": {"$in": ', codes_json, '}}'))
}


# Fetch the documents whose individualCode is NOT one of the given codes.
#   db:  collection handle exposing a find() method
#   vec: vector of study codes; serialised with toJSON() into the "$nin" list
# Returns the result of db$find() for the generated query.
search_individuals_nin <- function(db, vec) {
  excluded_json <- toJSON(vec)
  db$find(paste0('{"individualCode": {"$nin": ', excluded_json, '}}'))
}


# Flatten the nested sample/marker records of one individual into a long
# marker table.
#
# Input
#   df: data frame holding the documents of a single individualCode. It is
#       read through its `individualCode` column and its `Samples` list
#       column, whose stacked elements must provide a `markers` list column.
# Output
#   An empty `df` is returned unchanged. Otherwise a data.table with one row
#   per marker plus a `studyCode` column holding the first individualCode of
#   `df`; the "confidence" and "interpretation" columns are removed when
#   they are present.
transform_df <- function(df) {
  if (nrow(df) == 0) { # Empty dataframe
    return(df)
  }

  study_code <- unique(df$individualCode)[1]

  # Stack the per-document sample tables, then the per-sample marker tables;
  # fill = TRUE pads columns that are missing from some of them.
  df_samples <- rbindlist(df$Samples, fill = TRUE)
  df_markers <- rbindlist(df_samples$markers, fill = TRUE)
  df_markers$studyCode <- study_code

  # Select the columns to keep by name: select = -c(confidence, interpretation)
  # fails as soon as a marker set does not carry one of those two columns.
  keep_cols <- setdiff(names(df_markers), c("confidence", "interpretation"))
  subset(df_markers, select = keep_cols)
}


# Keep the rows of `df` that belong to `study_code`, then flatten them with
# transform_df().
#   study_code: a single individualCode value
#   df:         data frame of documents with an individualCode column
# A data frame without rows is handed back as-is.
filter_transform_df <- function(study_code, df) {
  if (nrow(df) == 0) {
    return(df)
  }
  rows_for_code <- filter(df, individualCode == study_code)
  transform_df(rows_for_code)
}


# Build one long marker table covering several study codes.
#   study_codes: vector of individualCode values to extract
#   df:          data frame of documents, e.g. from search_individuals()
# Returns the per-code tables stacked with bind_rows().
concat_df <- function(study_codes, df) {
  # Use filter_transform_df(), the helper defined alongside this function;
  # filter_trans_df() only exists once a later notebook cell has been run.
  list_dfs <- lapply(study_codes, filter_transform_df, df)
  bind_rows(list_dfs)
}


# Build the long marker table for several study codes and average the
# repeated measurements.
#   study_codes: vector of individualCode values to extract
#   df:          data frame of documents, e.g. from search_individuals()
# Returns one row per (studyCode, name) pair with `value` set to the mean of
# that pair's values; the result stays grouped by studyCode.
concat_agg_mean_df <- function(study_codes, df) {
  # Use filter_transform_df(), the helper defined alongside this function;
  # filter_trans_df() only exists once a later notebook cell has been run.
  list_dfs <- lapply(study_codes, filter_transform_df, df)
  bind_rows(list_dfs) %>%
    group_by(studyCode, name) %>%
    # "drop_last" is the grouping summarize() would pick by default; stating
    # it silences the "has grouped output by" message.
    summarize(value = mean(value), .groups = "drop_last")
}


# Build the long marker table for several study codes and spread it to one
# column per study code, without aggregating first.
#   study_codes: vector of individualCode values to extract
#   df:          data frame of documents, e.g. from search_individuals()
# Returns the pivot_wider() result: one column per studyCode, filled from
# `value`.
concat_pivot_df <- function(study_codes, df) {
  # Use filter_transform_df(), the helper defined alongside this function;
  # filter_trans_df() only exists once a later notebook cell has been run.
  list_dfs <- lapply(study_codes, filter_transform_df, df)
  bind_rows(list_dfs) %>%
    pivot_wider(names_from = studyCode, values_from = value)
}


# Build the long marker table for several study codes, average repeated
# measurements, and spread the result to one column per study code.
#   study_codes: vector of individualCode values to extract
#   df:          data frame of documents, e.g. from search_individuals()
# Returns the pivot_wider() result: a `name` column plus one column per
# studyCode holding the mean value of that marker.
concat_agg_pivot_df <- function(study_codes, df) {
  # Use filter_transform_df(), the helper defined alongside this function;
  # filter_trans_df() only exists once a later notebook cell has been run.
  list_dfs <- lapply(study_codes, filter_transform_df, df)
  bind_rows(list_dfs) %>%
    group_by(studyCode, name) %>%
    # "drop_last" is the grouping summarize() would pick by default; stating
    # it silences the "has grouped output by" message.
    summarize(value = mean(value), .groups = "drop_last") %>%
    pivot_wider(names_from = studyCode, values_from = value)
}

### Connect to MongoDB

In [2]:
# Handle on the "markers" collection of the "facs" database on localhost.
db <- mongo(
  collection = "markers",
  db = "facs",
  url = "mongodb://localhost:27017"
)
# Template for a connection URL that carries a user and password:
# mongo<-mongolite::mongo(collection = "Sample", db = "Test", url = 
#                           "mongodb://User:123@Wyyuyu:13333/ty2_U",verbose = TRUE)

Registered S3 method overwritten by 'openssl':
  method      from
  print.bytes Rcpp


In [39]:
# Count the documents matched by the empty query '{}'.
db$count('{}')

## df control

In [44]:
# Fetch the control-group documents using search_control()'s default query.
df_control <- search_control(db)
study_codes <- unique(df_control$individualCode) # study codes present in the fetched documents
# df_control is rebound here: from raw documents to the aggregated, pivoted table.
df_control <- concat_agg_pivot_df(study_codes, df_control)
dim(df_control)
df_control

`summarise()` has grouped output by 'studyCode'. You can override using the `.groups` argument.


name,HBD001,HBD003,HBD004,HBD011,HBD012,HBD025,HBD026,HBD033,HBD035,...,HBD122,HBD123,HBD124,HBD133,HBD141,HBD143,HBD153,HBD162,HBD177,HBD63
Activated CD4+ T cells(% CD4),0.8175,,0.51,,,,,1.86,,...,,,,,,,,,,
Activated CD8+ T cells (% CD8),0.5425,,0.48,,,,,2.59,,...,,,,,,,,,,
Anergic B (%B),7.0350,,8.04,,,,,44.40,,...,,,,,,,,,,
B cells (%Lymphocytes/live),13.7250,,14.40,,,,,13.60,,...,,,,,,,,,,
Bm (%B),5.7900,,11.60,,,,,7.18,,...,,,,,,,,,,
Bm (%Lymphocytes/live),0.7800,,1.67,,,,,0.98,,...,,,,,,,,,,
CD16+ mDCs (%APC),8.3975,,5.65,,,,,,,...,,,,,,,,,,
CD16neg mDCs (%APC),6.9425,,8.57,,,,,,,...,,,,,,,,,,
CD3 T cells (% Lymphocytes/live),62.8750,,74.50,,,,,75.80,,...,,,,,,,,,,
CD4+ T cells (%Lymphocytes/live),43.9750,,46.30,,,,,59.00,,...,,,,,,,,,,


## df_data

In [65]:
# Control study codes: the pivoted column names minus the "name" key column.
vec <- setdiff(names(df_control), "name")  # 'HBD001',....''HBD63',...

# Everything in the collection that is not one of the control codes.
df_data <- search_individuals_nin(db, vec)
study_codes <- setdiff(
  unique(df_data$individualCode),  # 'GEM177',..., 'CPI018',...
  c('AMCS20001A', 'AMCS20006A', 'AMCS21027A', 'AMCS20002A')
)
# study_codes = c("CPI515", "CPI464", "APO180", "GEM177", "NotExisted")
df_data <- concat_agg_pivot_df(study_codes, df_data)
dim(df_data)
df_data

`summarise()` has grouped output by 'studyCode'. You can override using the `.groups` argument.


name,APO014,APO042,APO180,APO189,APO249,APO279,APO342,APO360,APO511,...,TC271,TC272,TCH047,TCH048,TCH271,TCH272,WH008,WH025,WH043,WH044
Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freq· of LC,0.33,0.97,0.410,,,,,0.310,,...,,,,,,,,,,
Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 2 | Freq· of LC,13.80,13.40,4.300,,,,,4.040,,...,,,,,,,,,,
Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/Nk 3 | Freq· of LC,0.69,0.95,0.760,,,,,0.130,,...,,,,,,,,,,
Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 4 | Freq· of LC,5.83,0.50,0.370,,,,,0.810,,...,,,,,,,,,,
Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/Classical monocytes | Freq· of APC,23.90,53.20,60.500,,,,,71.900,,...,,,,,,,,,,
Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/mDC | Freq· of APC,18.60,26.30,22.900,,,,,16.700,,...,,,,,,,,,,
Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/mDC/CD16- mDC | Freq· of APC,15.80,6.66,6.230,,,,,12.100,,...,,,,,,,,,,
Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/mDC/CD16+ mDC | Freq· of APC,2.77,19.60,16.600,,,,,4.560,,...,,,,,,,,,,
Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/DC/pDC | Freq· of APC,4.42,2.73,2.970,,,,,3.240,,...,,,,,,,,,,
Single Cells/Single Cells/live/LC and Mono/non BT/CD56-/APC/LDN | Freq· of APC,31.80,0.26,0.034,,,,,0.058,,...,,,,,,,,,,


### Search control group

In [4]:
search_control <- function(db, query='{"individualCode": {"$regex" : "^HBD|^APOC", "$options" : "i"}}') {
    # Search the control group.
    # Input
    #   db:    collection handle exposing a find() method
    #   query: JSON query string. The default matches individualCode values
    #          starting with "HBD" or "APOC", ignoring case:
    #          '{"individualCode": {"$regex" : "^HBD|^APOC", "$options" : "i"}}'
    # Output: the result of db$find() for that query
    control_docs <- db$find(query)
    control_docs
}

# Run the default control query and inspect its size and the codes it returned.
df_control <- search_control(db)
dim(df_control)
unique(df_control$individualCode)
# head(df_control, 1)

### Search individuals

In [5]:
# Codes to look up; the name "NotExisted" suggests a code absent from the DB.
study_codes <- c("CPI515", "CPI464", "APO180", "NotExisted")  # c("APO180", "CPI515", "NotExisted") 

In [6]:
search_individuals <- function(db, vec) {
    # Search documents by individualCode.
    # Input
    #   db:  collection handle exposing a find() method
    #   vec: a vector of study codes
    #        Eg. vec <- c("APO180", "CPI515", "NotExisted") gives the query
    #        '{"individualCode": {"$in": ["APO180","CPI515","NotExisted"]}}'
    # Output: the result of db$find() for that query

    # Serialise the codes to a JSON array and splice it into the "$in" filter
    codes_json <- toJSON(vec)
    db$find(paste0('{"individualCode": {"$in": ', codes_json, '}}'))
}

# Fetch the documents for study_codes and inspect the size and class of the result.
df <- search_individuals(db, vec=study_codes)
dim(df)
class(df)
# head(df, 3)
# tail(df, 3)

### Transform df

In [7]:
transform_df <- function(df) {
    # Flatten the nested sample/marker records of one individual.
    # Input df: a dataframe of only one individualCode. It is read through
    #   its individualCode column and its Samples list column, whose stacked
    #   elements must provide a markers list column.
    # Output: an empty df is returned unchanged. Otherwise a data.table with
    #   one row per marker plus a studyCode column holding the first
    #   individualCode of df; "confidence" and "interpretation" are removed
    #   when they are present.

    if (nrow(df) == 0) { # Empty dataframe
        return(df)
    }

    study_code <- unique(df$individualCode)[1]

    # Stack the per-document sample tables, then the per-sample marker
    # tables; fill = TRUE pads columns missing from some of them.
    df_samples <- rbindlist(df$Samples, fill = TRUE)
    df_markers <- rbindlist(df_samples$markers, fill = TRUE)
    df_markers$studyCode <- study_code

    # Select the columns to keep by name: select = -c(confidence,
    # interpretation) fails when a marker set lacks one of those columns.
    keep_cols <- setdiff(names(df_markers), c("confidence", "interpretation"))
    subset(df_markers, select = keep_cols)
}

# NOTE(review): df was fetched for several study codes, but transform_df()
# labels every row with the first individualCode only, so studyCode is the
# same on every row here. Use filter_trans_df() to process one code at a time.
df_markers_test <- transform_df(df)
# dim(df_markers_test)
head(df_markers_test, 2)
tail(df_markers_test, 2)
dim(df)

name,value,studyCode
CD3 T cells (% Lymphocytes/live),65.3,APO180
CD4+ T cells (%Lymphocytes/live),28.8,APO180


name,value,studyCode
singlets/Single Cells/live/Lymphocytes/CD3+/CD8+ T cells/TEM | Freq· of CD8+ T cells,28.6,APO180
singlets/Single Cells/live/Lymphocytes/CD3+/CD8+ T cells/TEMRA | Freq· of CD8+ T cells,10.1,APO180


### Get all data after transforming df at each study code

In [8]:
filter_trans_df <- function(study_code, df) {
    # Keep the rows of one study code and flatten them with transform_df().
    # Input
    #   study_code: a single study code, Eg. study_code <- "CPI515"
    #   df: dataframe returned by search_individuals()
    # Output: the transformed rows of that study code; a dataframe without
    #   rows is handed back as-is

    if (nrow(df) == 0) {
        return(df)
    }
    rows_for_code <- filter(df, individualCode == study_code)
    transform_df(rows_for_code)
}

# Try the helper on the first code only and look at both ends of the result.
study_codes=c("CPI515", "CPI464", "NotExisted")
df_filter <- filter_trans_df(study_codes[1], df)
head(df_filter, 2)
tail(df_filter, 2)
# head(df_filter, 3)
# tail(df_filter, 3)
# dim(df_filter)
# dim(df)

name,value,studyCode
NK (%LC),22.13,CPI515
NK- 1 (%LC),0.75,CPI515


name,value,studyCode
TEM (% CD8),24.4,CPI515
TEMRA (% CD8),25.0,CPI515


#### Concat list of df

In [23]:
# Study codes present in the documents fetched from the DB
study_codes <- unique(df$individualCode)

# Stack the per-study-code marker tables into one long data frame.
#   study_codes: vector of individualCode values
#   df:          dataframe returned by search_individuals()
concat_df <- function(study_codes, df) {
    per_code <- lapply(study_codes, function(code) filter_trans_df(code, df))
    bind_rows(per_code)
}

df_concat <- concat_df(study_codes, df)
head(df_concat, 3)

name,value,studyCode
CD3 T cells (% Lymphocytes/live),65.3,APO180
CD4+ T cells (%Lymphocytes/live),28.8,APO180
Activated CD4+ T cells(% CD4),0.74,APO180


#### Concat list of df and aggregate mean

In [32]:
study_codes <- unique(df$individualCode) # study codes present in the fetched documents

# Stack the per-study-code marker tables and average repeated measurements.
#   study_codes: vector of individualCode values
#   df:          dataframe returned by search_individuals()
# Returns one row per (studyCode, name) pair with `value` set to the mean of
# that pair's values; the result stays grouped by studyCode.
concat_agg_mean_df <- function(study_codes, df) {
    list_dfs <- lapply(study_codes, filter_trans_df, df)
    bind_rows(list_dfs) %>%
        group_by(studyCode, name) %>%
        # "drop_last" is the grouping summarize() would pick by default;
        # stating it silences the "has grouped output by" message.
        summarize(value = mean(value), .groups = "drop_last")
}

# The function defined above is concat_agg_mean_df(); no concat_agg_df()
# is defined in this file.
df_agg <- concat_agg_mean_df(study_codes, df)
head(df_agg, 3)

`summarise()` has grouped output by 'studyCode'. You can override using the `.groups` argument.


studyCode,name,value
APO180,Activated CD4+ T cells(% CD4),0.65
APO180,Activated CD8+ T cells (% CD8),0.9744444
APO180,Anergic B (%B),6.9222222


#### Concat list of df and pivot (without aggregation before it)

In [36]:
# Get study code from df getting from DB
# Study codes present in the documents fetched from the DB
study_codes <- unique(df$individualCode)

# Stack the per-study-code marker tables and spread them to one column per
# study code, with no aggregation beforehand.
#   study_codes: vector of individualCode values
#   df:          dataframe returned by search_individuals()
concat_pivot_df <- function(study_codes, df) {
    per_code <- lapply(study_codes, function(code) filter_trans_df(code, df))
    long_df <- bind_rows(per_code)
    pivot_wider(long_df, names_from = studyCode, values_from = value)
}

df_pivot <- concat_pivot_df(study_codes, df)
#df_pivot

"Values are not uniquely identified; output will contain list-cols.
* Use `values_fn = length` to identify where the duplicates arise
* Use `values_fn = {summary_fun}` to summarise duplicates"

#### Concat list of df, aggregation and pivot

In [37]:
study_codes <- unique(df$individualCode) # study codes present in the fetched documents

# Stack the per-study-code marker tables, average repeated measurements, and
# spread the result to one column per study code.
#   study_codes: vector of individualCode values
#   df:          dataframe returned by search_individuals()
# Returns a `name` column plus one column per studyCode holding the mean
# value of that marker.
concat_agg_pivot_df <- function(study_codes, df) {
    list_dfs <- lapply(study_codes, filter_trans_df, df)
    bind_rows(list_dfs) %>%
        group_by(studyCode, name) %>%
        # "drop_last" is the grouping summarize() would pick by default;
        # stating it silences the "has grouped output by" message.
        summarize(value = mean(value), .groups = "drop_last") %>%
        pivot_wider(names_from = studyCode, values_from = value)
}
df_pivot <- concat_agg_pivot_df(study_codes, df)
df_pivot

`summarise()` has grouped output by 'studyCode'. You can override using the `.groups` argument.


name,APO180,CPI464,CPI515
Activated CD4+ T cells(% CD4),0.6500000,0.93,0.37
Activated CD8+ T cells (% CD8),0.9744444,2.19,0.31
Anergic B (%B),6.9222222,20.80,13.90
B-CD21lo (%B),8.2275000,,
B-mem (%B),15.7500000,,
B-mem (%LC),1.3558333,,
B-MZ (%B),7.0191667,,
B-MZ (%LC),0.6033333,,
B-SM-IgA (%B),2.5841667,,
B-SM-IgG (%B),3.7158333,,


In [None]:
library(dplyr)

# Toy data: the sciName x family x stage grid, each row with count = 1.
df <- cbind(
    expand.grid(
        sciName = list("A", "B", "C"),
        family = list("X", "Y"),
        stage = list("S1", "S2", "S3", "S4")
    ),
    count = 1
)
# Each condition pairs one sciName with the stages to keep for it.
condition1 <- list(sciName = "A", stageVector = c("S2", "S3"))
condition2 <- list(sciName = "C", stageVector = c("S3", "S4"))
conditionList <- list(condition1, condition2)

# Rows of `df` matching the condition's sciName and one of its stages.
filterStages <- function(condition, df) {
    df %>%
        dplyr::filter(
            sciName == condition$sciName,
            stage %in% condition$stageVector
        )
}
# filterStages(condition1, df)
# filterStages(condition2, df)
# lapply() passes `df` through as the second argument of filterStages().
resultDataList <- lapply(conditionList, filterStages, df)
resultDataList

#### Save to csv for testing

In [None]:
# NOTE(review): df_final is not defined in the visible part of this file; confirm where it is created.
write_csv(df_final, "df_final.csv")

In [None]:
# Single-bracket indexing: shows the class of the first-column subset, not of the column's values.
class(df_final[1])

In [None]:
# Spread df_final to one column per studyCode, filled from `value`.
# NOTE(review): df_final is not defined in the visible part of this file.
df_final %>%
    pivot_wider(names_from = studyCode, values_from = value)

## Code kept for reference only

### Create list of list: NOT used now

In [None]:
create_list_individualCode <- function(study_codes){
    # Wrap each study code in its own one-element named list.
    # Input study_codes: a vector
    #   Eg. study_codes <- c("CPI515", "CPI464", "NotExisted")
    # Output: a list of lists, one inner list per study code
    # Example: equivalent to building it by hand like below
    #   condition1 <- list(individualCode="CPI515")
    #   condition2 <- list(individualCode="CPI464")
    #   conditionList <- list(condition1, condition2)
    #   print(conditionList)
    #  [[1]]
    #  [[1]]$individualCode
    #  [1] "CPI515"
    #
    # [[2]]
    # [[2]]$individualCode
    # [1] "CPI464"

    # append(lst, list(individualCode = code)) splices the element into one
    # flat list instead of nesting it, so each code is wrapped explicitly.
    # as.vector() drops names, keeping the outer list unnamed.
    lapply(as.vector(study_codes), function(study_code) {
        list(individualCode = study_code)
    })
}

# Test: build the condition list for three codes and print it
conditionList <- create_list_individualCode(study_codes=c("CPI515", "CPI464", "NotExisted"))
conditionList

In [None]:
# df

# 1. Create a query
# # https://www.r-bloggers.com/2016/10/difference-between-paste-and-paste0/
# # paste(): concatenate a series of strings
# # The difference between paste() and paste0() is that the argument sep by default is ” ” (paste) and “” (paste0).
# create_query <- function(vec){
#     query = paste0('{"individualCode": {"$in": ', vec, '}}')
#     return (query)
# }
# study_codes = c("HBD001", "GEM177", "APO180", "NotExistedInDB")
# study_codes
# study_codes <- toJSON(study_codes)
# study_codes
# query = paste0('{"individualCode": {"$in": ', study_codes, '}}')
# query
# query_1 = create_query(vec=study_codes)
# query_1

# 2. Find()
# individuals <- db$find(query)
# class(individuals)
# colnames(individuals)

# # Explicitly way
# df <- db$find('{"individualCode" : { "$in" : ["APO180", "CPI515", "NotExisted"] } }') # "APO180",
# # df <- db$find('{"individualCode" : "APO180"}') # CPI515
# colnames(df) 
# unique(df$individualCode)
# class(unique(df$individualCode))

---
# install.packages("mongolite")
# https://jeroen.github.io/mongolite/query-data.html#query-syntax
# install.packages("DBI")
# install.packages("RPostgres")
---
```{r}
library(mongolite) 
# library(DBI)
```
```{r}
# Connect to the "markers" collection of the "facs" database on localhost
# (the URL carries no credentials), then count all documents.
db <- mongo(collection = "markers", 
            db = "facs",
            url = "mongodb://localhost:27017"
            )
db$count('{}')
```

```{r}
# Fetch the documents whose individualCode is exactly "APO180" and print them.
ind_code <- db$find('{"individualCode" : "APO180"}')
print(ind_code)
```
<!-- 2. Mongolite basics -->
```{r}
# library(tidyverse)
library(knitr)     # help run code
library(markdown)  # create markdown files i.e. pdf
library(mongolite) # Create connection/Interface R<-> Mongodb
```
```{r}
# Create Connection: 'localhost'
# No url is passed, so mongo()'s default url is used for collection
# 'vidPrac' in database 'video_practice'.
mng_conn<-mongo(collection = 'vidPrac',db='video_practice')
```
```{r}
# INSERT
# Two documents given as JSON strings; the second one has no last_name field.
fun_dta <- c('{"first_name":"Mr Bilbo","last_name": "Baggins","hobbies":["find rings",
"adventure","magic"]}','{"first_name":"Golumn","hobbies":["steal rings","bite people",
"talk to myself"]}')

mng_conn$insert(fun_dta)
```
```{r}
# FIND: the empty filter '{}' matches every document in the collection
mng_conn$find('{}')
```
```{r}
# UPDATE: set last_name on the document whose first_name is "Golumn".
# upsert=TRUE presumably inserts a document when nothing matches; confirm in the mongolite docs.
mng_conn$update('{"first_name":"Golumn"}', 
                '{"$set":{"last_name": "The Lonely"}}',upsert=TRUE)
```
```{r}
# FIND: list every document again to check the update
mng_conn$find('{}')
```
```{r}
# Add Element to specific document array
# Step 1: list the _id of every document.
# Cut and paste your unique OID Number!
mng_conn$find('{}',fields='{"_id":1}')
```
```{r}
# Match one document by its ObjectId and return it with "Eat Fish"
# concatenated onto its hobbies array.
# NOTE(review): the pipeline has no write stage, so this presumably changes
# only the returned result, not the stored document; the find() below shows
# the stored state. Replace the hard-coded OID with one from your collection.
mng_conn$aggregate('[{"$match":{"_id":{"$oid":"60f96426ab5100001f0067d3"}}},
{"$addFields":{"hobbies":{"$concatArrays":["$hobbies",["Eat Fish"]]}}}]')

mng_conn$find('{}')
```
```{r}
# Add Element to List
# $push appends "Eat Fish" to the hobbies array of the document with this
# ObjectId (hard-coded; replace it with an OID from your collection).
mng_conn$update('{"_id":{"$oid":"60f96426ab5100001f0067d3"}}',
'{"$push":{"hobbies":"Eat Fish"}}',upsert=TRUE)
```
```{r}
# List every document to check the pushed hobby
mng_conn$find('{}')
```

```{r}
# Sorting: ascending by first_name, then by last_name.
# The documents store the field as "last_name"; the misspelt key
# "latst_name" matched no field.
mng_conn$find(sort='{"first_name":1,"last_name":1}')
```
```{r}
# Select by ID: filter on _id with an {"$oid": ...} value
# (hard-coded; replace it with an OID from your collection)
mng_conn$find('{"_id":{"$oid":"60f96426ab5100001f0067d3"}}')
```
```{r}
# Download files
# create directory, within our CWD to store output file
# (dir.create() warns if the directory already exists, e.g. on a re-run)
dir.create('output_files_practice')

# send file to our directory: export the collection through a file connection
mng_conn$export(file("output_files_practice/hobbits.json"))
```
```{r}
# Delete: remove the document with this ObjectId
# (hard-coded; replace it with an OID from your collection)
mng_conn$remove('{"_id":{"$oid":"60f96426ab5100001f0067d3"}}')

# Find: list what is left
mng_conn$find('{}')
```
```{r}
# Drop all records but NOT the collection: the empty filter '{}' matches every document
mng_conn$remove('{}')

# Find: list what is left
mng_conn$find('{}')
```

```{r}
# Drop Collection: unlike remove('{}') above, this drops the collection itself
mng_conn$drop()
```
```{r}
# Query again after the collection has been dropped
mng_conn$find('{}')
```
```{r}
# Shell Commands:
# ------------------------
# 
# Show Databases: show dbs
# 
# Check What Database You're Currently In: db
# 
# Enter/Use a Database: use dbnamehere
# 
# If this database is not currently used, it will be created otherwise it will just connect
# Show Collections: show collections
# 
# If your Collection Name has a Weird Name like spaces or hyphens: db.getCollection(" your weird name").find()
# 
# Switch Database without Leaving Current Database: db.getSiblingDB('fromCurrentDB')
# 
# This is particularly useful when writing a script and you cannot access a database using the use db method.
# Exit MongoDB: quit()
# 
# Help: db.help
# 
# This will show all of the functions/methods available to you
```

<!-- 3. INSERT df to Mongodb -->
```{r}
library(knitr)     # help run code
library(markdown)  # create markdown files i.e. pdf
library(dplyr)
library(mongolite) # Create connection/Interface R<-> Mongodb
library(jsonlite)  # send files to Mongo
```
```{r}
# Handle on the "test" collection of the "R_test" database on localhost.
db <- mongo(
  collection = "test",
  db = "R_test",
  url = "mongodb://localhost:27017"
)

```
```{r}
# Queries: From Mongo to R
# find all rows (the empty filter '{}' matches every document):
n<-db$find('{}')

# get a glimpse (idea) of the columns and their types
dplyr::glimpse(n)
```
```{r}
# First rows of the query result
head(n)
```
```{r}

# Count all rows: count() is called without a query
db$count()
```
```{r}
# Small example data frame with a name and an age column, to insert into MongoDB.
name <- c("AA", "BB", "CC")
age <- c(10, 20, 18)
df <- data.frame(name = name, age = age)
df
```
```{r}
# Insert the data frame into the collection
db$insert(df)
```
```{r}
# Get df from csv file ("restaurant.csv" is resolved against the working directory)
df1 = data.table::fread("restaurant.csv")
df1
```
```{r} 
# get column names (checked before the spaces are stripped from them)
names(df1)   
```
```{r}
# Strip every space character from the column names.
names(df1) <- gsub(" ", "", names(df1), fixed = TRUE)