In [1]:
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



## I. Dataframe basics

In [11]:
df <- data.frame(label = c('a1', 'a2', 'a3', 'a4', 'a5'),
                 x1 = c(1, 2, 3, NA, NA), 
                 x2 = c(100, NA, 300, 400, NA))
df

label,x1,x2
a1,1.0,100.0
a2,2.0,
a3,3.0,300.0
a4,,400.0
a5,,


### Get unique values of a column

In [13]:
labels = unique(df$label)
labels
class(labels)

### Access df by column names

In [3]:
df_sub2 <- df[c('x1', 'x2')]
df_sub2

x1,x2
1.0,100.0
2.0,
3.0,300.0
,400.0
,


In [4]:
df_sub1 <- df['x1']
df_sub1
class(df_sub1)

x1
1.0
2.0
3.0
""
""


### Set index for df instead of integer index

In [5]:
row.names(df) <- df$label
df

Unnamed: 0,label,x1,x2
a1,a1,1.0,100.0
a2,a2,2.0,
a3,a3,3.0,300.0
a4,a4,,400.0
a5,a5,,


### Remove a column in df

In [6]:
df <- subset(df, select = -c(label))
df

Unnamed: 0,x1,x2
a1,1.0,100.0
a2,2.0,
a3,3.0,300.0
a4,,400.0
a5,,


### remove "rows" in a dataframe with a NA value in one of the columns

In [7]:
df1 = na.omit(df)
df1

Unnamed: 0,x1,x2
a1,1,100
a3,3,300


### remove "rows" in a dataframe with all NA

In [8]:
df %>% filter_all(any_vars(!is.na(.)))

x1,x2
1.0,100.0
2.0,
3.0,300.0
,400.0


### Get column names

In [10]:
# col_names = colnames(df)
col_names = names(df)
col_names
class(col_names)

## II. Transform multi-level JSON to Dataframe

### 1. Make a raw json string for testing

In [37]:
library(jsonlite)

json_str <- '{
    "_id" : "AAA1",
    "individualCode" : "HBD001",
    "Samples" : [ 
        {
            "filename" : "PBMCs_APC HBD001 ACD-A 17apr18_088․fcs",
            "project" : "CPI",
            "markers" : [ 
                {
                    "name" : "NK (%LC)",
                    "value" : 17.92,
                    "confidence" : "",
                    "interpretation" : ""
                }, 
                {
                    "name" : "NK- 1 (%LC)",
                    "value" : 1.35,
                    "confidence" : "",
                    "interpretation" : ""
                }              
            ]
        }, 
        {
            "filename" : "PBMCs_Th cell HBD001 ACD-A 18apr18_045․fcs",
            "project" : "CPI",
            "markers" : [ 
                {
                    "name" : "Exhausted (%CD4)",
                    "value" : 0.29,
                    "confidence" : "",
                    "interpretation" : ""
                }, 
                {
                    "name" : "R5 Th1 (%CD4)",
                    "value" : 1.23,
                    "confidence" : "",
                    "interpretation" : ""
                }
            ]
        }
    ]
}'

json_str

### 2. Convert json string to R object

In [38]:
list_json <- fromJSON(json_str)
list_json

filename,project,markers
PBMCs_APC HBD001 ACD-A 17apr18_088·fcs,CPI,"NK (%LC) , NK- 1 (%LC), 17.92 , 1.35 , , , ,"
PBMCs_Th cell HBD001 ACD-A 18apr18_045·fcs,CPI,"Exhausted (%CD4), R5 Th1 (%CD4) , 0.29 , 1.23 , , , ,"


### 3. Convert R list object to DataFrame

In [43]:
df <- do.call("cbind", list_json)
df

_id,individualCode,Samples.filename,Samples.project,Samples.markers
AAA1,HBD001,PBMCs_APC HBD001 ACD-A 17apr18_088·fcs,CPI,"NK (%LC) , NK- 1 (%LC), 17.92 , 1.35 , , , ,"
AAA1,HBD001,PBMCs_Th cell HBD001 ACD-A 18apr18_045·fcs,CPI,"Exhausted (%CD4), R5 Th1 (%CD4) , 0.29 , 1.23 , , , ,"


### 4.  DataFrame of markers (Optional)

In [46]:
# Check
list_markers <- df$Samples.markers
list_markers

name,value,confidence,interpretation
NK (%LC),17.92,,
NK- 1 (%LC),1.35,,

name,value,confidence,interpretation
Exhausted (%CD4),0.29,,
R5 Th1 (%CD4),1.23,,


In [47]:
library(data.table)

df_markers <- rbindlist(list_markers, fill=TRUE)
df_markers


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, first, last

The following object is masked from 'package:purrr':

    transpose



name,value,confidence,interpretation
NK (%LC),17.92,,
NK- 1 (%LC),1.35,,
Exhausted (%CD4),0.29,,
R5 Th1 (%CD4),1.23,,


In [50]:
# at df_markers: change column name to HBD001
colnames(df_markers)[colnames(df_markers) == 'value'] <- 'HBD001'
df_markers

name,HBD001,confidence,interpretation
NK (%LC),17.92,,
NK- 1 (%LC),1.35,,
Exhausted (%CD4),0.29,,
R5 Th1 (%CD4),1.23,,


### Create a function

In [65]:
transform_df <- function(df, newColname){
    list_Samples <- df$Samples
    df_Samples <- rbindlist(list_Samples, fill=TRUE)
    list_markers <- df_Samples$markers
    df_markers <- rbindlist(list_markers, fill=TRUE)
    colnames(df_markers)[colnames(df_markers) == 'value'] <- newColname
    return (df_markers)
}

## IV. Merge two Dataframes  
```
cbind() – combining the columns of two data frames side-by-side
rbind() – stacking two data frames on top of each other, appending one to the other
merge() – joining two data frames using a common column
```

### 1. Merge

In [57]:
df_authors <- data.frame(
    surname = c("AA", "BB", "EE"),
    nationality = c("US", "Australia", "US"),
    retired = c("yes", rep("no", 2)))
df_authors

surname,nationality,retired
AA,US,yes
BB,Australia,no
EE,US,no


In [60]:
df_books <- data.frame(
    name = c("AA", "DD", "BB"),
    title = c("Title1", NA, "Title3"))
df_books

name,title
AA,Title1
DD,
BB,Title3


In [62]:
df_merge <- merge(x=df_authors, 
                  y=df_books, 
                  by.x="surname", 
                  by.y="name",
                  all=TRUE  # ALL: Outer Join, all.x= TRUE: Left Join, all.y=TRUE: Right join, No specify: Inner join
                            # NOTE: Cross join: merge(x = df1, y = df2, by = NULL)
                 )
df_merge

surname,nationality,retired,title
AA,US,yes,Title1
BB,Australia,no,Title3
EE,US,no,
DD,,,


### 2. rbind and bind_rows

In [76]:
df1 <- data.frame(a=c(0,1,2), b=c(3,4,5), c=c(6,7,8))
df1

a,b,c
0,3,6
1,4,7
2,5,8


In [77]:
df2 <- data.frame(a=c(9,10), c=c(12,13))
df2

a,c
9,12
10,13


In [71]:
df2$b <- NA  # make a new column with all NA
df2

a,c,b
9,12,
10,13,


In [73]:
rbind(df1, df2)   # rbind() requires df1, df2 have the same columns

a,b,c
0,3.0,6
1,4.0,7
2,5.0,8
9,,12
10,,13


In [79]:
df3 <- data.frame(a=c(9,10), c=c(12,13))
df3

a,c
9,12
10,13


In [80]:
bind_rows(df1, df3)  # from the dplyr library

a,b,c
0,3.0,6
1,4.0,7
2,5.0,8
9,,12
10,,13


In [15]:
sample.df <- data.frame(id = c(101, 102, 103, 104),
                    json_col = c('[{"foo_a":"bar"}]',
                                 '[{"foo_a":"bar","foo_b":"bar"}]',
                                 '[{"foo_a":"bar","foo_c":2}]',
                                 '[{"foo_a":"bar","foo_b":"bar","foo_c":2,"nested_col":{"foo_d":"bar","foo_e":3}}]'),
                    startdate = as.Date(c('2010-11-1','2008-3-25','2007-3-14','2006-2-21')))
sample.df

id,json_col,startdate
101,"[{""foo_a"":""bar""}]",2010-11-01
102,"[{""foo_a"":""bar"",""foo_b"":""bar""}]",2008-03-25
103,"[{""foo_a"":""bar"",""foo_c"":2}]",2007-03-14
104,"[{""foo_a"":""bar"",""foo_b"":""bar"",""foo_c"":2,""nested_col"":{""foo_d"":""bar"",""foo_e"":3}}]",2006-02-21


In [17]:
library(dplyr)
library(tidyr)
library(purrr)
library(jsonlite)

sample.df %>%
  mutate(
    json_parsed = map(json_col, ~ fromJSON(., flatten=TRUE))
  ) %>%
  unnest(json_parsed)

id,json_col,startdate,foo_a,foo_b,foo_c,nested_col.foo_d,nested_col.foo_e
101,"[{""foo_a"":""bar""}]",2010-11-01,bar,,,,
102,"[{""foo_a"":""bar"",""foo_b"":""bar""}]",2008-03-25,bar,bar,,,
103,"[{""foo_a"":""bar"",""foo_c"":2}]",2007-03-14,bar,,2.0,,
104,"[{""foo_a"":""bar"",""foo_b"":""bar"",""foo_c"":2,""nested_col"":{""foo_d"":""bar"",""foo_e"":3}}]",2006-02-21,bar,bar,2.0,bar,3.0


In [20]:
library(dplyr)
library(jsonlite)

new.df <- sample.df %>% 
          rowwise() %>%
          do(data.frame(fromJSON(.$json_col, flatten = T))) %>%
          ungroup() %>%
          bind_cols(sample.df %>% select(-json_col))

new.df

foo_a,foo_b,foo_c,nested_col.foo_d,nested_col.foo_e,id,startdate
bar,,,,,101,2010-11-01
bar,bar,,,,102,2008-03-25
bar,,2.0,,,103,2007-03-14
bar,bar,2.0,bar,3.0,104,2006-02-21


In [22]:
df1 <- data.frame(
    "individualCode" = "HBD001",
    "runId" = "CPI_28062018",
   
    "folder" = "/jcsmr/CPI/Fulcher/FACS files/CPI_20180628",
    "Samples" : [ 
        {
            "filename" : "PBMCs_APC HBD001 ACD-A 17apr18_088․fcs",
            "datetime" : "2018-06-29",
            "technician" : "RobT",
            "instrumentModel" : "LSRII",
            "instrumentID" : "1",
            "project" : "CPI",
            "software" : "BD FACSDiva Software Version 8.0.1",
            "markers" : [ 
                {
                    "name" : "NK (%LC)",
                    "value" : 17.92,
                    "confidence" : "",
                    "interpretation" : ""
                }, 
                {
                    "name" : "Classical Monocytes(%APC)",
                    "value" : 47.9,
                    "confidence" : "",
                    "interpretation" : ""
                }
              
            ]
        }
        ])
       
df1

ERROR: Error in parse(text = x, srcfile = src): <text>:7:17: unexpected '['
6:     "folder" : "/jcsmr/CPI/Fulcher/FACS files/CPI_20180628",
7:     "Samples" : [
                   ^
