This repository has been archived by the owner on Oct 24, 2023. It is now read-only.
forked from Edouard-Legoupil/koboloadeR
/
kobo_split_multiple.R
123 lines (96 loc) · 4.36 KB
/
kobo_split_multiple.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#' @name kobo_split_multiple
#' @rdname kobo_split_multiple
#' @title Split variables resulting from select_multiple questions
#'
#' @description To be used when the data was extracted from an ODK server
#'   that does not offer the option to split select_multiple questions.
#'
#' @details Every question flagged as \code{"select_multiple_d"} in the
#'   dictionary and present in \code{data} is expanded into one column per
#'   modality found in the data, named \code{<fullname>.<modality>}. Answers
#'   are expected to be space separated lists of modalities. The new columns
#'   are of type character: they hold the number of times the modality was
#'   ticked by the respondent (\code{"0"} or \code{"1"} in practice), or
#'   \code{"Not replied"} when the question was left empty. In the original
#'   column, empty answers (\code{NA}, \code{""} or whitespace only) are
#'   replaced by the marker \code{"zNotAnswered"}. Columns of \code{data}
#'   whose name already starts with \code{<fullname>.} (matched
#'   case-insensitively) are dropped before splitting so that the result does
#'   not contain two versions of the same split. Progress is reported on the
#'   console.
#'
#' @param data Data frame with the select_multiple columns to split
#' @param dico Data dictionary generated from kobo_dico. It needs at least
#'   the columns \code{type} and \code{fullname}.
#'
#' @return A "data.frame": \code{data} with the split select_multiple
#'   columns appended at the end.
#'
#' @author Edouard Legoupil
#'
#'
#' @export kobo_split_multiple
#'
#' @examples
#' \dontrun{
#' kobo_split_multiple(data, dico)
#' }
#'
kobo_split_multiple <- function(data, dico) {
  if (!all(c("type", "fullname") %in% names(dico))) {
    stop("`dico` must contain the columns 'type' and 'fullname'", call. = FALSE)
  }

  ## Marker for empty answers; the leading "z" is historical and kept because
  ## the marker is written back into the original column.
  not_answered_code <- "zNotAnswered"

  ## List fields flagged as select multiple in the dico.
  ## %in% never returns NA, so dictionary rows without a type are ignored.
  to_split <- as.character(dico$fullname[dico$type %in% "select_multiple_d"])
  to_split <- to_split[!is.na(to_split)]

  data <- as.data.frame(data)

  ## Drop columns that are already the result of a split (<fullname>.<xxx>).
  ## Looping over the names (instead of pasting first) makes sure nothing is
  ## dropped when the dictionary has no select_multiple at all.
  column_names <- tolower(names(data))
  already_split <- rep(FALSE, length(column_names))
  for (question in to_split) {
    prefix <- tolower(paste0(question, "."))
    already_split <- already_split | startsWith(column_names, prefix)
  }
  data <- data[, !already_split, drop = FALSE]

  ## Keep only the questions that do exist in the dataset, with their position
  column_ids <- match(to_split, names(data))
  in_data <- !is.na(column_ids)
  to_split <- to_split[in_data]
  column_ids <- column_ids[in_data]

  ## Stop here if no select_multiple to split
  if (length(to_split) == 0) {
    cat("No match for Select multiple variable in your dataset! \n")
    cat("You may double check that the form and the data are matching \n")
    cat("Double check as well that you did download the data with the correct header (i.e. full path with point delimiters) \n")
    return(data)
  }

  ## New columns are always appended at the end, so the positions computed
  ## above stay valid during the whole loop.
  for (i in seq_along(to_split)) {
    fullname <- to_split[i]
    id <- column_ids[i]
    cat(paste0(i, " - Splitting variable ", fullname, " in column: ", id, "\n"))

    ## Account non answered - could be recognised either as null or na...
    answers <- as.character(data[[id]])
    answers[is.na(answers) | trimws(answers) == ""] <- not_answered_code
    data[[id]] <- answers
    no_reply <- answers == not_answered_code

    ## One element per ticked modality, together with the row it comes from.
    ## Trimming and splitting on runs of spaces avoids an empty modality when
    ## an answer has leading, trailing or doubled spaces.
    selected <- strsplit(trimws(answers), " +")
    respondent <- rep(seq_along(selected), lengths(selected))
    choice <- as.character(unlist(selected, use.names = FALSE))
    modalities <- setdiff(sort(unique(choice)), not_answered_code)

    if (length(modalities) == 1) {
      cat(paste0("There was only one modality selected for this select_multiple question in the whole dataset. \n"))
    } else {
      cat(paste0("There was ", length(modalities), " different modalities for that question. \n"))
    }

    ## Handling the case where no one replied to that question
    if (length(modalities) == 0) {
      cat("There was no recorded reponses for this question...\n")
      next
    }
    cat("Spliting now!\n")

    ## One character column per modality. The "Not replied" rows are taken
    ## from a mask computed once, so every column is flagged the same way
    ## whatever the alphabetical order of the modalities.
    split_columns <- lapply(modalities, function(modality) {
      counts <- tabulate(respondent[choice == modality], nbins = length(answers))
      flags <- as.character(counts)
      flags[no_reply] <- "Not replied"
      flags
    })
    ## Rename the variables to match with the dictionary: <fullname>.<modality>
    names(split_columns) <- paste(fullname, modalities, sep = ".")
    ## check.names = FALSE keeps modality names that are not syntactic
    ## (numeric codes for instance) untouched.
    split_frame <- data.frame(split_columns, check.names = FALSE, stringsAsFactors = FALSE)

    ## Bind to original data
    cat(paste0("Number of columns: ", ncol(data), ", number of additional splitted variables:", ncol(split_frame), "\n"))
    data <- cbind(data, split_frame)
    cat(paste0("After binding Number of columns: ", ncol(data), "\n"))
  }

  data
}
NULL