In [None]:
library(dplyr)

## I. Dataframe basics

In [None]:
# Toy data frame: a label column plus two numeric columns containing NAs.
df <- data.frame(
  label = paste0("a", 1:5),
  x1 = c(1, 2, 3, NA, NA),
  x2 = c(100, NA, 300, 400, NA)
)
df

### Get unique values of a column

In [None]:
# Distinct values of the label column, and the class of the result.
labels <- unique(df[["label"]])
labels
class(labels)

### Access df by column names

In [None]:
# Selecting several columns by name returns a data frame.
df_sub2 <- df[, c("x1", "x2")]
df_sub2

In [None]:
# A single column selected this way is still a data frame, not a vector;
# drop = FALSE makes that explicit in the matrix-style form.
df_sub1 <- df[, "x1", drop = FALSE]
df_sub1
class(df_sub1)

### Use a column as row names instead of the default integer index

In [None]:
# Use the label values as the row names of df.
rownames(df) <- df[["label"]]
df

### Remove a column in df

In [None]:
# Keep every column except "label"; rows and row names are untouched.
df <- df[, names(df) != "label", drop = FALSE]
df

### Remove rows that have an NA in any column

In [None]:
# na.omit() drops every row that contains at least one NA.
df1 <- na.omit(object = df)
df1

### Remove rows where every column is NA

In [None]:
# Keep rows that have at least one non-missing value, i.e. drop only the rows
# where every column is NA. if_any() replaces the superseded
# filter_all(any_vars(...)) form.
df %>% filter(if_any(everything(), ~ !is.na(.x)))

### Get column names

In [None]:
# Column names as a character vector; names(df) gives the same result.
col_names <- colnames(df)
col_names
class(col_names)

### Create list of dataframe

In [None]:
df1 <- data.frame(value = c(1, 4, 3), name = rep(1, 3))
df2 <- data.frame(value = c(4, 3, 7), name = c(1, 0, 1))

# Build the list directly from its elements
list_df <- list(df1, df2)
list_df

In [None]:
# Grow a list of data frames one element at a time with append().
# Each data frame is wrapped in list() so it is added as a single element
# rather than being spliced in column by column.
vec <- list()
vec <- append(vec, list(df1))
vec
vec <- append(vec, list(df2))
vec

In [None]:
# Stack all data frames in the list into one. rbind() requires every data
# frame to have the same column names.
do.call(rbind, vec)

## II. Transform multi-level JSON to Dataframe

### 1. Make a raw json string for testing

In [None]:
library(jsonlite)

# Raw multi-level JSON used as test input for the rest of this section:
# one document with top-level "_id" and "individualCode" fields and a
# "Samples" array; each sample carries a "markers" array of
# name/value/confidence/interpretation records.
json_str <- '{
    "_id" : "AAA1",
    "individualCode" : "HBD001",
    "Samples" : [ 
        {
            "filename" : "PBMCs_APC HBD001 ACD-A 17apr18_088․fcs",
            "project" : "CPI",
            "markers" : [ 
                {
                    "name" : "NK (%LC)",
                    "value" : 17.92,
                    "confidence" : "",
                    "interpretation" : ""
                }, 
                {
                    "name" : "NK- 1 (%LC)",
                    "value" : 1.35,
                    "confidence" : "",
                    "interpretation" : ""
                }              
            ]
        }, 
        {
            "filename" : "PBMCs_Th cell HBD001 ACD-A 18apr18_045․fcs",
            "project" : "CPI",
            "markers" : [ 
                {
                    "name" : "Exhausted (%CD4)",
                    "value" : 0.29,
                    "confidence" : "",
                    "interpretation" : ""
                }, 
                {
                    "name" : "R5 Th1 (%CD4)",
                    "value" : 1.23,
                    "confidence" : "",
                    "interpretation" : ""
                }
            ]
        }
    ]
}'

# Show the raw string as stored
json_str

### 2. Convert json string to R object

In [None]:
# Parse the JSON text into an R object.
list_json <- fromJSON(txt = json_str)
list_json

### 3. Convert R list object to DataFrame

In [None]:
# Bind the parsed elements side by side as columns.
df <- do.call(cbind, list_json)
df

### 4.  DataFrame of markers (Optional)

In [None]:
# Inspect the markers column on its own
list_markers <- df[["Samples.markers"]]
list_markers

In [None]:
library(data.table)

# Stack the marker tables into one; fill = TRUE pads columns missing from
# some tables with NA.
df_markers <- rbindlist(l = list_markers, fill = TRUE)
df_markers

In [None]:
# Rename the "value" column of df_markers to "HBD001"
is_value_col <- names(df_markers) == "value"
names(df_markers)[is_value_col] <- "HBD001"
df_markers

### Create a function

In [None]:
# Flatten the markers nested under `df$Samples` into one table and rename its
# "value" column.
#   df:         object with a `Samples` element that rbindlist() can stack
#               (presumably a list of per-document sample tables, each with a
#               `markers` list-column -- TODO confirm against the callers).
#   newColname: name given to the "value" column of the result.
# Returns the stacked markers table produced by rbindlist().
transform_df <- function(df, newColname) {
  samples <- rbindlist(df$Samples, fill = TRUE)
  markers <- rbindlist(samples$markers, fill = TRUE)
  # No-op when there is no column called "value".
  is_value_col <- names(markers) == "value"
  names(markers)[is_value_col] <- newColname
  markers
}

## III. Merge two Dataframes  
```
cbind() – combining the columns of two data frames side-by-side
rbind() – stacking two data frames on top of each other, appending one to the other
merge() – joining two data frames using a common column
```

### 1. Merge

In [None]:
# Authors table; joined to the books table on `surname`.
df_authors <- data.frame(
  surname = c("AA", "BB", "EE"),
  nationality = c("US", "Australia", "US"),
  retired = c("yes", "no", "no")
)
df_authors

In [None]:
# Books table; `name` holds the author's surname and one title is missing.
df_books <- data.frame(
  name = c("AA", "DD", "BB"),
  title = c("Title1", NA, "Title3")
)
df_books

In [None]:
# Join authors to books where surname == name.
#   all = TRUE    -> outer join (used here)
#   all.x = TRUE  -> left join;  all.y = TRUE -> right join
#   none of these -> inner join
#   by = NULL     -> cross join, e.g. merge(x = df1, y = df2, by = NULL)
df_merge <- merge(
  df_authors, df_books,
  by.x = "surname", by.y = "name",
  all = TRUE
)
df_merge

### 2. rbind and bind_rows

In [None]:
# Three-column frame used as the top half of the row-binding examples.
df1 <- data.frame(
  a = c(0, 1, 2),
  b = c(3, 4, 5),
  c = c(6, 7, 8)
)
df1

In [None]:
# Two-column frame: it has no "b" column.
df2 <- data.frame(
  a = c(9, 10),
  c = c(12, 13)
)
df2

In [None]:
# Add the missing "b" column, filled with NA, so df2 has the same columns as df1
df2[["b"]] <- NA
df2

In [None]:
# rbind() requires df1 and df2 to have the same columns
do.call(rbind, list(df1, df2))

In [None]:
# Same data as the original df2, left without a "b" column on purpose.
df3 <- data.frame(
  a = c(9, 10),
  c = c(12, 13)
)
df3

In [None]:
# bind_rows() comes from dplyr
df1 %>% bind_rows(df3)

## IV. filter dataframe

In [None]:
library(dplyr)

# Every combination of species name, family and stage, each with a count of 1.
# Character vectors (instead of list() arguments) give ordinary character
# columns rather than list-columns, so the `==` and `%in%` comparisons in the
# filters compare plain strings. stringsAsFactors = FALSE keeps them as
# character instead of factor.
df <- cbind(
  expand.grid(
    sciName = c("A", "B", "C"),
    family = c("X", "Y"),
    stage = c("S1", "S2", "S3", "S4"),
    stringsAsFactors = FALSE
  ),
  count = 1
)
df

In [None]:
# Filter conditions: each one pairs a species name with the stages to keep.
condition1 <- list(
  sciName = "A",
  stageVector = c("S2", "S3")
)
condition2 <- list(
  sciName = "C",
  stageVector = c("S3", "S4")
)
conditionList <- list(condition1, condition2)
conditionList

In [None]:
# Return the rows of `df` that satisfy one filter condition.
#   condition: list with `sciName` (compared to the sciName column with ==)
#              and `stageVector` (values the stage column is matched against).
#   df:        data frame with `sciName` and `stage` columns.
filterStages <- function(condition, df) {
  filter(
    df,
    sciName == condition$sciName,
    stage %in% condition$stageVector
  )
}

In [None]:
# Apply the filter function to one condition at a time
filterStages(condition = condition1, df = df)
filterStages(condition = condition2, df = df)

In [None]:
# Apply the filter function to every condition in the list; the result is a
# list with one filtered data frame per condition.
resultDataList <- lapply(conditionList, function(cond) filterStages(cond, df))
resultDataList

## V. Pivot dataframe

In [None]:
# Long-format input for the pivot examples: one row per (NUM, Type) pair.
NUM <- c(rep("45", 4), "48", "50", rep("66", 3), "68")
Type <- c("A", "F", "C", "B", "D", "A", "E", "C", "F", "D")
Points <- c(9.2, 60.8, 22.9, 1012.7, 18.7, 11.1, 67.2, 63.1, 16.7, 58.4)
df <- data.frame(NUM = NUM, Type = Type, Points = Points)
df

#### pivot_wider() function

In [None]:
# Install tidyr only when it is missing, so re-running the notebook does not
# download and reinstall the package every time.
if (!requireNamespace("tidyr", quietly = TRUE)) {
  install.packages("tidyr")
}

In [None]:
library(tidyr)
# R version, then the full session details and the platform/version record.
getRversion()
print(sessionInfo())
print(version)

In [None]:
library(dplyr)
library(readr)
# library(tidyverse)

# One row per NUM, one column per Type, filled with the Points values.
pivot_wider(df, names_from = Type, values_from = Points)

In [None]:
# Sample table whose json_col holds one JSON array (with a single object) per
# row; the objects do not all have the same keys.
sample.df <- data.frame(
  id = c(101, 102, 103, 104),
  json_col = c(
    '[{"foo_a":"bar"}]',
    '[{"foo_a":"bar","foo_b":"bar"}]',
    '[{"foo_a":"bar","foo_c":2}]',
    '[{"foo_a":"bar","foo_b":"bar","foo_c":2,"nested_col":{"foo_d":"bar","foo_e":3}}]'
  ),
  startdate = as.Date(c('2010-11-1', '2008-3-25', '2007-3-14', '2006-2-21'))
)
sample.df

In [None]:
library(dplyr)
library(tidyr)
library(purrr)
library(jsonlite)

# Parse each JSON string into a list-column of data frames, then expand that
# column into regular columns.
sample.df %>%
  mutate(json_parsed = map(json_col, fromJSON, flatten = TRUE)) %>%
  unnest(cols = json_parsed)

In [None]:
library(dplyr)
library(jsonlite)

# Parse each JSON string into a data frame, stack the results (keys absent
# from a record become NA), then place the remaining columns of sample.df
# alongside. This replaces the superseded rowwise() + do() idiom and spells
# out TRUE instead of the reassignable shorthand `T`.
new.df <- sample.df$json_col %>%
  lapply(fromJSON, flatten = TRUE) %>%
  bind_rows() %>%
  as_tibble() %>%
  bind_cols(sample.df %>% select(-json_col))

new.df

In [None]:
# The nested record is kept as a JSON string and parsed with jsonlite. The
# original cell pasted JSON object/array syntax ("key" : [ { ... } ]) directly
# into data.frame(), which is not valid R and fails to parse.
json_record <- '{
    "individualCode" : "HBD001",
    "runId" : "CPI_28062018",
    "folder" : "/jcsmr/CPI/Fulcher/FACS files/CPI_20180628",
    "Samples" : [
        {
            "filename" : "PBMCs_APC HBD001 ACD-A 17apr18_088․fcs",
            "datetime" : "2018-06-29",
            "technician" : "RobT",
            "instrumentModel" : "LSRII",
            "instrumentID" : "1",
            "project" : "CPI",
            "software" : "BD FACSDiva Software Version 8.0.1",
            "markers" : [
                {
                    "name" : "NK (%LC)",
                    "value" : 17.92,
                    "confidence" : "",
                    "interpretation" : ""
                },
                {
                    "name" : "Classical Monocytes(%APC)",
                    "value" : 47.9,
                    "confidence" : "",
                    "interpretation" : ""
                }
            ]
        }
    ]
}'

# Same conversion as in section II: parse the JSON, then bind the parsed
# elements side by side as columns of one data frame.
df1 <- do.call("cbind", jsonlite::fromJSON(json_record))

df1

#### Code to keep

In [None]:
---
title: "R Notebook"
output: html_notebook
---
<!-- Merge conflict resolved: both sides were identical except for the
     row-name chunks under "Helper functions"; all of those chunks are kept. -->
<!-- iris dataframe -->
```{r}
iris
```
```{r}
dt <- iris
```

```{r}
View(dt)  # to view a dataframe (View(data) would show the data() function itself)
```

```{r}
data()  # list all of built-in datasets in R
```
<!-- CREATE A DATAFRAME -->
```{r}
df <- data.frame(Name=c('Ali', 'Bob'), # a column name and value
                 Age=c(20, 30),        # a column name and value
                 PassExam=c(TRUE, FALSE)
                 )
df
```
```{r}
df['Name']   # return a dataframe
```
```{r}
class(df['Name'])   # === type() in Python
```

```{r}
df$Name  # === df[['Name']]: return a vector. 
```

```{r}
class(df$Name) # === class(df[['Name']])
```
```{r}
df[1:2]   # access column 1 to 2
```
```{r}
df[c(1, 3)]   # access column 1 and 3 only
```
```{r}
df[2, ]  # access row 2 and all columns
```
<!-- Helper functions -->
```{r}
df <- data.frame(id = letters[1:4], x = 1:4, y = 5:8)
df
```
```{r}
length(df)
```
```{r}
head(df, n = 3)  # get first 3 rows
```
```{r}
tail(df, n = 3)
```
```{r}
dim(df)  # nrow(df), ncol(df)
```
```{r}
str(df)  # structure of df
```
```{r}
names(df)  # === list(df.columns) in pandas
```
```{r}
# Use an existing column ("id" here) as the row names, like setting an index
# column in pandas. The earlier version referred to a "marker" column that
# this df does not have.
row.names(df) <- df$id
df <- subset(df, select = -c(id)) # drop the "id" column since it is now the row names
```
```{r}
rownames(df)   # like df.index in pandas
```

```{r}
# Remove row if having any NA
na.omit(df)
```

```{r}
name_vector <- sapply(df, class)  # apply a function (here is class) for each columns
name_vector
```
```{r}
class(name_vector)
```
```{r}
summary(df)
```
<!-- DATAFRAMES: TIBBLES: install.packages("tibble") -->
```{r}
library(tibble)   
```
```{r}
tb <- as_tibble(iris)
tb
```
```{r}
class(tb)
```

```{r}
df <- tibble(id = letters[1:4], 
             x = 1:4, 
             y = 9,  # tibble will make it a vector with the same length
             z = x ^ 2 + y,  # calculate "z" column,
             test = rep(5, 4)
             )
df
```
```{r}
df1 <- tribble(
   ~x, ~y, ~z,   # make column name x, y, z
   'a', 1, TRUE,
   'b', 2, FALSE
)
df1
```
```{r}
# Lubridate:  install.packages("tidyverse") or install.packages("lubridate")
df2 <- tibble(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),  # runif: random uniform [0, 1]
  e = sample(letters, 1e3, replace=TRUE)
)
df2
```
```{r}
print(df2, 
      n=2,   # number of rows - will take effect in console
      width=Inf  # width of table. Eg. width=100
      )   
```


```{r}
# Install once if needed:
# install.packages("readr")  # Eg. read_csv(), write_csv()
# install.packages("readxl") # Eg. read_xlsx()
# install.packages("httr")   # Eg. GET()
```
```{r}
# Packages for file and HTTP input/output
library(readr)
library(readxl)
library(httr)
```
```{r}
# Read a CSV file with read.csv(); expects "test.csv" in the working directory
df <- read.csv("test.csv")
df
```
```{r}
# Write the built-in iris data set to "iris.csv" in the working directory
write.csv(iris, 'iris.csv')
```
```{r}
# Read an Excel workbook; expects "test.xlsx" in the working directory
read_xlsx("test.xlsx")
```
```{r}
# Send an HTTP GET request and show the response
GET("tidyverse.org")
```