In [1]:
library(dplyr)
library(data.table)

"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'data.table' was built under R version 3.6.3"
Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, first, last



## I. Dataframe basics

In [2]:
df <- data.frame(label = c('a1', 'a2', 'a3', 'a4', 'a5'),
                 x1 = c(1, 2, 3, NA, NA), 
                 x2 = c(100, NA, 300, 400, NA))
df

label,x1,x2
a1,1.0,100.0
a2,2.0,
a3,3.0,300.0
a4,,400.0
a5,,


### Get unique values of a column

In [3]:
labels = unique(df$label)
labels
class(labels)

### Access df by column names

In [4]:
df_sub2 <- df[c('x1', 'x2')]
df_sub2

x1,x2
1.0,100.0
2.0,
3.0,300.0
,400.0
,


In [5]:
df_sub1 <- df['x1']
df_sub1
class(df_sub1)

x1
1.0
2.0
3.0
""
""


### Set index for df instead of integer index

In [6]:
row.names(df) <- df$label
df

Unnamed: 0,label,x1,x2
a1,a1,1.0,100.0
a2,a2,2.0,
a3,a3,3.0,300.0
a4,a4,,400.0
a5,a5,,


### Remove a column in df

In [7]:
df <- subset(df, select = -c(label))
df

Unnamed: 0,x1,x2
a1,1.0,100.0
a2,2.0,
a3,3.0,300.0
a4,,400.0
a5,,


### remove "rows" in a dataframe with a NA value in one of the columns

In [8]:
df1 = na.omit(df)
df1

Unnamed: 0,x1,x2
a1,1,100
a3,3,300


### remove "rows" in a dataframe with all NA

In [9]:
df %>% filter_all(any_vars(!is.na(.)))

Unnamed: 0,x1,x2
a1,1.0,100.0
a2,2.0,
a3,3.0,300.0
a4,,400.0


### Get column names

In [10]:
# col_names = colnames(df)
col_names = names(df)
col_names
class(col_names)

### Create list of dataframe

In [11]:
df1 <- data.frame(value = c(1,4,3), name = c(1,1,1))
df2 <- data.frame(value = c(4,3,7), name = c(1,0,1))

# Explicit way to make a list of df
list_df <- list(df1, df2)
list_df

value,name
1,1
4,1
3,1

value,name
4,1
3,0
7,1


In [12]:
# Append() to make a list of df
vec = c()
vec <- append(vec, list(df1))
vec
vec <- append(vec, list(df2))
vec

value,name
1,1
4,1
3,1


value,name
1,1
4,1
3,1

value,name
4,1
3,0
7,1


In [13]:
# Concat list of df to be one df. rbind() require all df has the same column names
do.call("rbind", vec)

value,name
1,1
4,1
3,1
4,1
3,0
7,1


### get column names where it has less than 2 real values

In [14]:
len <- function(x) { return (length(x[!is.na(x)])) } 

df <- data.frame(value1 = c(4,3,7), value2 = c(NA, 0,NA), value3 = c(4,NA,7), value4=c(NA,NA,NA))
df

value1,value2,value3,value4
4,,4.0,
3,0.0,,
7,,7.0,


In [15]:
vec_len_smaller_2 <- c()
for (col in colnames(df)){
    if (len(df[[col]]) < 2){
        vec_len_smaller_2 <- c(vec_len_smaller_2, col)
    }
}
vec_len_smaller_2

## II. Transform multi-level JSON to Dataframe

### 1. Make a raw json string for testing

In [16]:
library(jsonlite)

json_str <- '{
    "_id" : "AAA1",
    "individualCode" : "HBD001",
    "Samples" : [ 
        {
            "filename" : "PBMCs_APC HBD001 ACD-A 17apr18_088․fcs",
            "project" : "CPI",
            "markers" : [ 
                {
                    "name" : "NK (%LC)",
                    "value" : 17.92,
                    "confidence" : "",
                    "interpretation" : ""
                }, 
                {
                    "name" : "NK- 1 (%LC)",
                    "value" : 1.35,
                    "confidence" : "",
                    "interpretation" : ""
                }              
            ]
        }, 
        {
            "filename" : "PBMCs_Th cell HBD001 ACD-A 18apr18_045․fcs",
            "project" : "CPI",
            "markers" : [ 
                {
                    "name" : "Exhausted (%CD4)",
                    "value" : 0.29,
                    "confidence" : "",
                    "interpretation" : ""
                }, 
                {
                    "name" : "R5 Th1 (%CD4)",
                    "value" : 1.23,
                    "confidence" : "",
                    "interpretation" : ""
                }
            ]
        }
    ]
}'

json_str

### 2. Convert json string to R object

In [17]:
list_json <- fromJSON(json_str)
list_json

filename,project,markers
PBMCs_APC HBD001 ACD-A 17apr18_088·fcs,CPI,"NK (%LC) , NK- 1 (%LC), 17.92 , 1.35 , , , ,"
PBMCs_Th cell HBD001 ACD-A 18apr18_045·fcs,CPI,"Exhausted (%CD4), R5 Th1 (%CD4) , 0.29 , 1.23 , , , ,"


### 3. Convert R list object to DataFrame

In [18]:
df <- do.call("cbind", list_json)
df

_id,individualCode,Samples.filename,Samples.project,Samples.markers
AAA1,HBD001,PBMCs_APC HBD001 ACD-A 17apr18_088·fcs,CPI,"NK (%LC) , NK- 1 (%LC), 17.92 , 1.35 , , , ,"
AAA1,HBD001,PBMCs_Th cell HBD001 ACD-A 18apr18_045·fcs,CPI,"Exhausted (%CD4), R5 Th1 (%CD4) , 0.29 , 1.23 , , , ,"


### 4.  DataFrame of markers (Optional)

In [19]:
# Check
list_markers <- df$Samples.markers
list_markers

name,value,confidence,interpretation
NK (%LC),17.92,,
NK- 1 (%LC),1.35,,

name,value,confidence,interpretation
Exhausted (%CD4),0.29,,
R5 Th1 (%CD4),1.23,,


In [20]:
library(data.table)

df_markers <- rbindlist(list_markers, fill=TRUE)
df_markers

name,value,confidence,interpretation
NK (%LC),17.92,,
NK- 1 (%LC),1.35,,
Exhausted (%CD4),0.29,,
R5 Th1 (%CD4),1.23,,


In [21]:
# at df_markers: change column name to HBD001
colnames(df_markers)[colnames(df_markers) == 'value'] <- 'HBD001'
df_markers

name,HBD001,confidence,interpretation
NK (%LC),17.92,,
NK- 1 (%LC),1.35,,
Exhausted (%CD4),0.29,,
R5 Th1 (%CD4),1.23,,


### Create a function

In [22]:
transform_df <- function(df, newColname){
    list_Samples <- df$Samples
    df_Samples <- rbindlist(list_Samples, fill=TRUE)
    list_markers <- df_Samples$markers
    df_markers <- rbindlist(list_markers, fill=TRUE)
    colnames(df_markers)[colnames(df_markers) == 'value'] <- newColname
    return (df_markers)
}

## III. Merge two Dataframes  
```
cbind() – combining the columns of two data frames side-by-side
rbind() – stacking two data frames on top of each other, appending one to the other
merge() – joining two data frames using a common column
```

### 1. Merge

In [23]:
df_authors <- data.frame(
    surname = c("AA", "BB", "EE"),
    nationality = c("US", "Australia", "US"),
    retired = c("yes", rep("no", 2)))
df_authors

surname,nationality,retired
AA,US,yes
BB,Australia,no
EE,US,no


In [24]:
df_books <- data.frame(
    name = c("AA", "DD", "BB"),
    title = c("Title1", NA, "Title3"))
df_books

name,title
AA,Title1
DD,
BB,Title3


In [25]:
df_merge <- merge(x=df_authors, 
                  y=df_books, 
                  by.x="surname", 
                  by.y="name",
                  all=TRUE  # ALL: Outer Join, all.x= TRUE: Left Join, all.y=TRUE: Right join, No specify: Inner join
                            # NOTE: Cross join: merge(x = df1, y = df2, by = NULL)
                 )
df_merge

surname,nationality,retired,title
AA,US,yes,Title1
BB,Australia,no,Title3
EE,US,no,
DD,,,


### 2. rbind and bind_rows

In [26]:
df1 <- data.frame(a=c(0,1,2), b=c(3,4,5), c=c(6,7,8))
df1

a,b,c
0,3,6
1,4,7
2,5,8


In [27]:
df2 <- data.frame(a=c(9,10), c=c(12,13))
df2

a,c
9,12
10,13


In [28]:
df2$b <- NA  # make a new column with all NA
df2

a,c,b
9,12,
10,13,


In [29]:
rbind(df1, df2)   # rbind() requires df1, df2 have the same columns

a,b,c
0,3.0,6
1,4.0,7
2,5.0,8
9,,12
10,,13


In [30]:
df3 <- data.frame(a=c(9,10), c=c(12,13))
df3

a,c
9,12
10,13


In [31]:
bind_rows(df1, df3)  # from the dplyr library

a,b,c
0,3.0,6
1,4.0,7
2,5.0,8
9,,12
10,,13


## IV. filter dataframe

In [32]:
library(dplyr)

df <- cbind(expand.grid(sciName=list("A", "B", "C"), family=list("X", "Y"), stage=list("S1", "S2", "S3", "S4")), count=1)
df

sciName,family,stage,count
A,X,S1,1
B,X,S1,1
C,X,S1,1
A,Y,S1,1
B,Y,S1,1
C,Y,S1,1
A,X,S2,1
B,X,S2,1
C,X,S2,1
A,Y,S2,1


In [33]:
# set up our filter conditions
condition1 <- list(sciName="A", stageVector=c("S2", "S3"))
condition2 <- list(sciName="C", stageVector=c("S3", "S4"))
conditionList <- list(condition1, condition2)
conditionList

In [34]:
# Define the filtering function
filterStages <- function(condition, df) {
    subset.data <- df %>%
        filter(sciName == condition$sciName) %>%
        filter(stage %in% condition$stageVector)
    return(subset.data)
}

In [35]:
# demo the filter function working on a single condition at a time
filterStages(condition1, df)
filterStages(condition2, df)

sciName,family,stage,count
A,X,S2,1
A,Y,S2,1
A,X,S3,1
A,Y,S3,1


sciName,family,stage,count
C,X,S3,1
C,Y,S3,1
C,X,S4,1
C,Y,S4,1


In [36]:
# demo the filter function working over a list of conditions
resultDataList <- lapply(conditionList, filterStages, df)
resultDataList

sciName,family,stage,count
A,X,S2,1
A,Y,S2,1
A,X,S3,1
A,Y,S3,1

sciName,family,stage,count
C,X,S3,1
C,Y,S3,1
C,X,S4,1
C,Y,S4,1


## V. Aggregate df

In [37]:
df <- data.frame(Company = c('A', 'A', 'B', 'C', 'A', 'B', 'B', 'C', 'C'), 
                 Name = c("Wayne", "Wayne", "William", "Rafael", "John", "Eric", "James", "Pablo", "Tammy"), 
                 Age = c(26, 27, 28, 32, 28, 24, 34, 30, 25), 
                 Wages = c(50000, 70000, 70000, 60000, 50000, 70000, 65000, 50000, 50000), 
                 Education.University = c(1, 1, 1, 0, 0, 1, 1, 0, 1), 
                 Productivity = c(100, 120, 120, 95, 88, 115, 100, 90, 120))
df

Company,Name,Age,Wages,Education.University,Productivity
A,Wayne,26,50000,1,100
A,Wayne,27,70000,1,120
B,William,28,70000,1,120
C,Rafael,32,60000,0,95
A,John,28,50000,0,88
B,Eric,24,70000,1,115
B,James,34,65000,1,100
C,Pablo,30,50000,0,90
C,Tammy,25,50000,1,120


### group_by and summarise

In [38]:
df_summary <- df %>% 
                group_by(Company) %>% 
                summarise(Age = mean(Age), 
                          Wages = mean(Wages), 
                          Education.University = sum(Education.University), 
                          Productivity = mean(Productivity))
df_summary

Company,Age,Wages,Education.University,Productivity
A,27.0,56666.67,2,102.6667
B,28.66667,68333.33,3,111.6667
C,29.0,53333.33,1,101.6667


### group_by_at, vars and summarise

In [39]:
df_summary_1 <- df %>%
                group_by_at(vars(one_of(c("Company", "Name")))) %>%
                summarize(Age = mean(Age))
df_summary_1

`summarise()` has grouped output by 'Company'. You can override using the `.groups` argument.


Company,Name,Age
A,John,28.0
A,Wayne,26.5
B,Eric,24.0
B,James,34.0
B,William,28.0
C,Pablo,30.0
C,Rafael,32.0
C,Tammy,25.0


### Aggregate

In [40]:
aggregate(x = df[c("Age","Wages","Education.University","Productivity")], 
          by = df[c("Company", "Name")], 
          FUN = mean)

Company,Name,Age,Wages,Education.University,Productivity
B,Eric,24.0,70000,1,115
B,James,34.0,65000,1,100
A,John,28.0,50000,0,88
C,Pablo,30.0,50000,0,90
C,Rafael,32.0,60000,0,95
C,Tammy,25.0,50000,1,120
A,Wayne,26.5,60000,1,110
B,William,28.0,70000,1,120


## VI. Pivot and Transpose dataframe

In [41]:
# install.packages("tidyverse")
# library(tidyverse)

In [42]:
available.packages()["tidyr",]
package_version(R.version)
# print(sessionInfo())
# print(version)

[1] '3.6.1'

In [43]:
# install.packages("tidyr")
library(tidyr)
library(dplyr)

"package 'tidyr' was built under R version 3.6.3"

In [44]:
NUM <- c("45", "45", "45", "45", "48", "50", "66", "66", "66", "68")
Type <- c("A", "F", "C", "B", "D", "A", "E", "C", "F", "D")
Points <- c(9.2,60.8,22.9,1012.7,18.7,11.1,67.2,63.1,16.7,58.4)
df <- data.frame(NUM, Type, Points)
df

NUM,Type,Points
45,A,9.2
45,F,60.8
45,C,22.9
45,B,1012.7
48,D,18.7
50,A,11.1
66,E,67.2
66,C,63.1
66,F,16.7
68,D,58.4


### pivot_wider() function

In [45]:
df %>%
    pivot_wider(names_from = Type, values_from = Points)

NUM,A,F,C,B,D,E
45,9.2,60.8,22.9,1012.7,,
48,,,,,18.7,
50,11.1,,,,,
66,,16.7,63.1,,,67.2
68,,,,,58.4,


### Transpose df

In [46]:
data <- read.table(text="X Y    Z
                   ID12   2012-06    566
                   ID1    2012-06  10239
                   ID6    2012-06    524
                   ID12   2012-07   2360
                   ID1    2012-07   13853
                   ID6    2012-07    2352
                   ID12   2012-08   3950
                   ID1    2012-08   14738
                   ID6    2012-08   4104",header=TRUE)
rownames(data)
colnames(data)
data

X,Y,Z
ID12,2012-06,566
ID1,2012-06,10239
ID6,2012-06,524
ID12,2012-07,2360
ID1,2012-07,13853
ID6,2012-07,2352
ID12,2012-08,3950
ID1,2012-08,14738
ID6,2012-08,4104


In [47]:
data[c("Y", "Z")]

Y,Z
2012-06,566
2012-06,10239
2012-06,524
2012-07,2360
2012-07,13853
2012-07,2352
2012-08,3950
2012-08,14738
2012-08,4104


In [48]:
transpose(data, fill=NA, ignore.empty=FALSE, keep.names="marker", make.names="X")

marker,ID12,ID1,ID6,ID12.1,ID1.1,ID6.1,ID12.2,ID1.2,ID6.2
Y,2012-06,2012-06,2012-06,2012-07,2012-07,2012-07,2012-08,2012-08,2012-08
Z,566,10239,524,2360,13853,2352,3950,14738,4104


# VI. Percentile

### Percentile example

```
Note: method == 'Pandas' is equivalent to the below code.
    Just difference with JCSMR in pct_increment = 1. / length instead of  1. / (length + 1)
    length = len([v for v in df['Values'] if not nan_none_empty_str(v)])
    pct_increment = 1. / length
    df['Rank_Percentile_Manual'] = df.Rank * pct_increment
Eg. df = pd.DataFrame({'Values': [119, np.nan, 80, 50, 120, 90, 119]}).sort_values('Values')
    # Output below is rank max for ties: df['Rank'] = df.rank(method='max')  # average, min, max
        Values	Rank	Rank_Percentile_Pandas	Rank_Percentile_Manual	Rank_Percentile_JCSMR
    0	50.0	1.0	            0.166667	            0.166667	        0.142857
    1	80.0	2.0	            0.333333	            0.333333	        0.285714
    2	90.0	3.0	            0.500000	            0.500000	        0.428571
    3	119.0	5.0	            0.833333	            0.833333	        0.714286
    4	119.0	5.0	            0.833333	            0.833333	        0.714286
    5	120.0	6.0	            1.000000	            1.000000	        0.857143
    6	NaN	   NaN	            NaN	                    NaN	                NaN
```

In [49]:
# Calculate percentile of a value w.r.t a vector
percentile_norm <- function(value, vec, method='JCSMR'){
   # method == 'JCSMR' or a normal way
   if (method == 'JCSMR'){
       epsilon <- 1e-6
       maxVal <- max(vec, na.rm = TRUE)
       vec <- c(vec, maxVal + epsilon)
   }   
   return (ecdf(vec)(value))   
} 

vec <- c(50, 80, 90, 119, 119, 120, NA)
# min(vec, na.rm = TRUE) # max(vec, na.rm = TRUE)
lapply(c(50, 80, 90, 119, 119, 120, NA), percentile_norm, vec)

In [50]:
# Calculate percentile of a value w.r.t a vector
percentile_norm <- function(value, vec, method='JCSMR'){
   # method == 'JCSMR' or a normal way
   if (method == 'JCSMR'){
       epsilon <- 1e-6
       maxVal <- max(vec, na.rm = TRUE)
       vec <- c(vec, maxVal + epsilon)
   }   
   return (ecdf(vec)(value))   
} 

# df_data: At "Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freqâ€¤ of LC"
#          has many values 0.33, 0.97, 0.41, 0.31, 2.15, 1.21, 0.64, 0.7, 1.09, 0.93, 
#                          0.85, 1.82, 0.93, 6.35, 0.68

# df_control: at "Single Cells/Single Cells/live/LC and Mono/LC/non BT/CD15neg/NK 1 | Freqâ€¤ of LC"
#             Only HBD063 has value =0.9 others are NA

vec <- c(NA, 0.9, NA)  # control data
lapply(c(0.33, 0.97, 0.41, 0.31, 2.15, 1.21, 0.64, 0.7, 1.09, 0.93, 0.85, 1.82, 0.93, 6.35, 0.68), # real data
       percentile_norm, 
       vec  # # control data
      )

In [51]:
df_test <- data.frame(Values = c(50, 80, 90, 119, 119, 120, NA))
df_test.percentile <- df_test %>% 
    mutate(Rank = rank(Values, na.last = 'keep', ties.method = c("average")))  %>% # NA rank last
    # rank(x, na.last = TRUE,
    # ties.method = c("average", "first", "last", "random", "max", "min"))

    # (rank of row in its partition - 1) / (number of rows in the partition - 1)
    mutate(Rank_Percentile = percent_rank(Values)) %>%
    mutate(Rank_Percentile_manual = percent_rank(Rank)) %>%
    mutate(PCT_wo_ties = ntile(Values, 100)) %>% # # percentiles
    mutate(Rank_Percentile_JCSMR = rank(Values, na.last = 'keep', ties.method = c("average")) * (1. / (len(df_control$Values) + 1))) 
    # mutate(PCT = ntile(Values, 4)) # quartiles 
    # mutate(PCT = ntile(Values, 10))  # deciles
df_test.percentile

ERROR: Error: Problem with `mutate()` column `Rank_Percentile_JCSMR`.
i `Rank_Percentile_JCSMR = *...`.
x object 'df_control' not found


## Control data

In [None]:
Values <- c(50, 80, 90, 119, 119, 120, NA)
Comment <- rep('Control', length(Values))
df_control <- data.frame(Comment, 
                         Values,
                         Values1 = Values,
                         Not_existed_in_df_data = Values
                        )
#df_control
summary(df_control)

In [None]:
# df_control.per <- df_control %>% 
#     mutate(Rank = rank(Values, na.last = 'keep', ties.method = c("average")))  %>% # NA rank last
#     mutate(Rank_Percentile_JCSMR = per_JCSMR(df_control$Values)) 
# df_control.per

## real data

In [None]:
Values <- c(40, 50, 100, 119, 120, NA, 150) 
Comment <- rep('Data', length(Values))
df_data <- data.frame(Comment, 
                      Values,
                      Values1 = Values,
                      Values_new_1 = Values,
                      Values_new_2 = Values
                     )
df_data
summary(df_data)

In [None]:
len <- function(x) { return (length(x[!is.na(x)])) } 

percentile_JCSMR <- function(vec){
    percentiles <- rank(vec, na.last = 'keep', ties.method = c("average")) * (1. / (len(vec) + 1))
    return (percentiles)
}

percentile <- function(vec){
    percentiles <- rank(vec, na.last = 'keep', ties.method = c("average")) * (1. / len(vec))
    return (percentiles)    
}

# Calculate percentile of a value w.r.t a vector
percentile_norm <- function(value, vec){
  return (ecdf(vec)(value))
} 


# normalize percentile one column
norm_perc_one_col <- function(col_name, df_control, df_data){
    df_data[[col_name]] <- unlist(lapply(df_data[, col_name], percentile_norm, df_control[, col_name]))
    return (df_data)
}


normalize_percentile <- function(df_control, df_data, col_not_cal_per="marker"){
    
    # columns existed both in df_control and df_data
    common_cols <- intersect(names(df_control), names(df_data))
    
    # Get column to calculate normalized percentile (w.r.t control data) 
    selected_cols <- common_cols[common_cols != col_not_cal_per]
    for (col_name in selected_cols){
        df_data[[col_name]] <- unlist(lapply(df_data[, col_name], percentile_norm, df_control[, col_name]))
    }
    
    # columns existed df_data ONLY. Calculate percentile by itself
    cols_in_data_only <- setdiff(names(df_data), names(df_control))
    for (col_name in cols_in_data_only){
        # df_data[[col_name]] <- per_JCSMR(df_data[, col_name])
        df_data[[col_name]] <- percentile(df_data[, col_name])
    }
    
    return (list(df=df_data, cols_in_data_only=cols_in_data_only))  
}

df_data <- normalize_percentile(df_control, df_data)
df_data
df_data$df
df_data$cols_in_data_only

### Draft Code: keeping for references

In [None]:
# remove columns of df_data that is not existed in df_control (since it cannot be normalized)
df_data <- subset(df_data, select = common_cols)

df_data %>%
    mutate(Per1 = lapply(df_data$Values, percentile, df_control$Values)) %>%
    mutate(PerOfValues = unlist(lapply(df_data$Values, percentile, df_control$Values)))  # calculate and overidden to a colum name

# percentile <- ecdf(50:120)
percentile <- ecdf(c(50, 80, 90, 119, 119, 120, NA))
percentile(119) # 0.83
percentile <- ecdf(c(50, 80, 90, 119, 119, 120))
percentile(119) # 0.83
percentile <- ecdf(c(50, 80, 90, 119, 120))
percentile(119) # 0.83

percentile <- ecdf(c(50, 80, 90, 119, 119, 120, NA))
percentile(100) 

percentile <- ecdf(c(50, 80, 90, 119, 119, 120, NA))
percentile(0) 

percentile <- ecdf(c(50, 80, 90, 119, 119, 120, NA))
percentile(-10) 

percentile <- ecdf(c(50, 80, 90, 119, 119, 120, NA))
percentile(121) 

percentile <- ecdf(c(50, 80, 90, 119, 119, 120, NA))
percentile(140) 

findInterval(17, c(4, 8, 15, 16, 23, 42))
findInterval(20, c(4, 8, 15, 16, 23, 42))
findInterval(16, c(4, 8, 15, 16, 23, 42))

df_control.per <- df_control %>% 
    mutate(Rank = rank(Values, na.last = 'keep', ties.method = c("average")))  %>% # NA rank last
    # rank(x, na.last = TRUE,
    # ties.method = c("average", "first", "last", "random", "max", "min"))

    # (rank of row in its partition - 1) / (number of rows in the partition - 1)
    mutate(Rank_Percentile = percent_rank(Values)) %>%
    mutate(Rank_Percentile_manual = percent_rank(Rank)) %>%
    mutate(PCT_wo_ties = ntile(Values, 100)) %>% # # percentiles
    mutate(Rank_Percentile_JCSMR = rank(Values, na.last = 'keep', ties.method = c("average")) * (1. / (len(df_control$Values) + 1))) 
    # mutate(PCT = ntile(Values, 4)) # quartiles 
    # mutate(PCT = ntile(Values, 10))  # deciles

<!-- CREATE A DATAFRAME -->
```{r}
df <- data.frame(Name=c('Ali', 'Bob'), # a column name and value
                 Age=c(20, 30),        # a column name and value
                 PassExam=c(TRUE, FALSE)
                 )
df
```
```{r}
df['Name']   # return a dataframe
```
```{r}
class(df['Name'])   # === type() in Python
```

```{r}
df$Name  # === df[['Name']]: return a vector. 
```

```{r}
class(df$Name) # === class(df[['Name']])
```
```{r}
df[1:2]   # access column 1 to 2
```
```{r}
df[c(1, 3)]   # access column 1 and 3 only
```
```{r}
df[2, ]  # access row 2 and all columns
```
<!-- Helper functions -->
```{r}
df <- data.frame(id = letters[1:4], x = 1:4, y = 5:8)
df
```
```{r}
length(df)
```
```{r}
head(df, n = 3)  # get first 3 rows
```
```{r}
tail(df, n = 3)
```
```{r}
dim(df)  # nrow(df), ncol(df)
```
```{r}
str(df)  # structure of df
```
```{r}
names(df)  # === list(df.columns) in pandas
```
```{r}
# For example having a df with a column name "marker"
row.names(df) <- df$marker # set "marker" as a index column in Pandas
df <- subset(df, select = -c(marker)) # drop the "marker" column since it has been use for index
```

```{r}
# Remove row if having any NA
na.omit(df)
```

```{r}
nameVetor = sapply(df, class)  # apply a function (here is class) for each columns
nameVetor  
```
```{r}
class(nameVetor)
```
```{r}
summary(df)
```
<!-- DATAFRAMES: TIBBLES: install.packages("tibble") -->
```{r}
library(tibble)   
```
```{r}
tb <- as_tibble(iris)
tb
```
```{r}
class(tb)
```

```{r}
df <- tibble(id = letters[1:4], 
             x = 1:4, 
             y = 9,  # tibble will make it a vector with the same length
             z = x ^ 2 + y,  # calculate "z" column,
             test = rep(5, 4)
             )
df
```
```{r}
df1 = tribble(
   ~x, ~y, ~z,   # make column name x, y, z
   'a', 1, TRUE,
   'b', 2, FALSE
)
df1
```
```{r}
# Lubridate:  install.packages("tidyverse") or install.packages("lubridate")
df2 <- tibble(
  a = lubridate:: now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1: 1e3,
  d = runif(1e3),  # runif: random uniform [0, 1]
  e = sample(letters, 1e3, replace=TRUE)
)
df2
```
```{r}
print(df2, 
      n=2,   # number of rows - will take effect in console
      width=Inf  # width of table. Eg. width=100
      )   
```

library(readr)
library(readxl)
library(httr)
```
```{r}
df <- read.csv("test.csv")
df
```
```{r}
write.csv(iris, 'iris.csv')
```
```{r}
read_xlsx("test.xlsx")
```
```{r}
GET("tidyverse.org")
```