/
question_type.R
288 lines (286 loc) · 12.2 KB
/
question_type.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#' Count of Question Type
#'
#' Transcript apply question counts.
#'
#' @param text.var The text variable
#' @param grouping.var The grouping variables. Default NULL generates one
#' output for all text. Also takes a single grouping variable or a list of 1 or
#' more grouping variables.
#' @param neg.cont logical. IF TRUE provides separate counts for the negative
#' contraction forms of the interrogative words.
#' @param percent logical. If TRUE output given as percent. If FALSE the
#' output is proportion.
#' @param zero.replace Value to replace 0 values with.
#' @param digits Integer; number of decimal places to round when printing.
#' @return Returns a list of:
#' \item{raw}{A dataframe of the questions used in the transcript and their
#' type.}
#' \item{count}{A dataframe of total questions (\code{tot.quest}) and counts of
#' question types (initial interrogative word) by grouping variable(s).}
#' \item{rnp}{Dataframe of the frequency and proportions of question types by
#' grouping variable.}
#' \item{missing}{The row numbers of the missing data (excluded from analysis).}
#' \item{percent}{The value of percent used for plotting purposes.}
#' \item{zero.replace}{The value of zero.replace used for plotting purposes.}
#' @details The algorithm searches for the following interrogative words (and
#' optionally, their negative contraction form as well):
#'
#' 1) whose 2) whom 3) who 4) where 5) what 6) which 7) why 8) when 9) were
#' 10) was 11) does 12) did 13) do 14) is 15) are 16) will 17) how 18) should
#' 19) could 20) would 21) shall 22) may 23) might 24) must 25) can 26) has
#' 27) have 28) had 29) ok 30) right 31) correct 32) implied do/does
#'
#' The interrogative word that is found first (with the exception of "ok", "right"
#' and "correct") in the question determines the sentence type. "ok", "right" and
#' "correct" sentence types are determined if the sentence is a question with no
#' other interrogative words found and "ok", "right" or "correct" is the last
#' word of the sentence. Those interrogative sentences beginning with the word
#' "you" are categorized as implying do or does question type, though the use of
#' do/does is not explicit. Those with undetermined sentence type are labeled
#' unknown.
#' @keywords question, question-count
#' @export
#' @examples
#' \dontrun{
#' (x <- question_type(DATA$state, DATA$person))
#' x$raw
#' x$count
#' plot(x)
#' plot(x, label = TRUE)
#' plot(x, label = TRUE, text.color = "red")
#' question_type(DATA$state, DATA$person, percent = FALSE)
#' DATA[8, 4] <- "Won't I distrust you?"
#' question_type(DATA$state, DATA$person)
#' DATA <- qdap::DATA
#' with(DATA, question_type(state, list(sex, adult)))
#'
#' with(mraja1spl, question_type(dialogue, person))
#' with(mraja1spl, question_type(dialogue, list(sex, fam.aff)))
#' (x <- with(mraja1spl, question_type(dialogue, list(sex, fam.aff),
#' percent = FALSE)))
#' plot(x)
#' }
question_type <- function(text.var, grouping.var = NULL,
neg.cont = FALSE, percent = TRUE, zero.replace = 0, digits = 2) {
if(is.null(grouping.var)) {
G <- "all"
} else {
if (is.list(grouping.var)) {
m <- unlist(as.character(substitute(grouping.var))[-1])
m <- sapply(strsplit(m, "$", fixed=TRUE), function(x) {
x[length(x)]
}
)
G <- paste(m, collapse="&")
} else {
G <- as.character(substitute(grouping.var))
G <- G[length(G)]
}
}
if(is.null(grouping.var)){
grouping <- rep("all", length(text.var))
} else {
if (is.list(grouping.var) & length(grouping.var)>1) {
grouping <- paste2(grouping.var)
} else {
grouping <- unlist(grouping.var)
}
}
text.var <- replace_contraction(as.character(text.var),
qdap::contractions[grepl("you", qdap::contractions[, 1]), ])
DF <- data.frame(grouping, text.var, check.names = FALSE,
stringsAsFactors = FALSE, orig.row.num = seq_len(length(text.var)))
DF$grouping <- factor(DF$grouping)
if (is.dp(text.var=DF[, "text.var"])){
warning(paste0("\n Some rows contain double punctuation.",
" Suggested use of sentSplit function."))
}
DF[, "end.mark"] <- substring(DF[, "text.var"], nchar(DF[, "text.var"]))
DF[, "stext.var"] <- spaste(strip(gsub("'s ", " ", DF[, "text.var"])))
if (sum(DF$end.mark == "?", na.rm = TRUE) == 0) stop("No questions found")
rows.removed <- which(is.na(DF$end.mark))
DF <- DF[!is.na(DF$end.mark), ]
DF <- DF[DF$end.mark == "?", ]
L1 <- split(DF, DF[, "grouping"])
missing <- names(L1)[sapply(L1, nrow) == 0]
L1 <- L1[sapply(L1, nrow) != 0]
x <- c("whose", "whom", "who", "where", "what",
"which", "why", "when", "werent", "were", "wasnt", "was", "doesnt",
"does", "didnt", "did", "dont", "do", "isnt","is", "arent", "are",
"will", "wont", "how", "shouldnt", "should", "couldnt", "could",
"wouldnt", "would", "shall", "may", "might", "must", "cant", "can",
"hasnt", "has", "havent", "have", "hadnt", "had")
y <- paste0(" XXXXX", sprintf("%02d", seq_along(x)), " ")
key <- data.frame(x = spaste(x), y = y, stringsAsFactors = FALSE)
L1 <- lapply(L1, function(x){
z <- x[, "stext.var"]
y <- nchar(z)
a1 <- (y-4) == sapply(gregexpr("okay", z), "[", 1)
a2 <- (y-2) == sapply(gregexpr("ok", z), "[", 1)
x[, "ok"] <- a1 + a2
x[, "alright"] <- (y-7) == sapply(gregexpr("alright", z), "[", 1)
x[, " right"] <- (y-6) == sapply(gregexpr("right", z), "[", 1)
x[, "correct"] <- (y-7) == sapply(gregexpr("correct", z), "[", 1)
x[, "huh"] <- (y-3) == sapply(gregexpr("huh", z), "[", 1)
x[, "implied_do/does"] <- sapply(gregexpr("you", z), "[", 1) == 2
x
})
L2 <- invisible(lapply(L1, function(x) {
subtext <- mgsub(key[, "x"], key[, "y"], x[, "stext.var"])
gsub("\\s+", " ", (Trim(gsub("[^XXX[:digit:]]", " ", subtext))))
}))
L2 <- invisible(lapply(L2, function(x) {
sapply(stopwords(x, stopwords = NULL, ignore.case = FALSE), "[", 1)
}))
key <- apply(key, 2, Trim)
L2 <- lapply(L2, lookup, key.match = key[, 2:1], missing = "unknown")
L2 <- lapply(seq_along(L2), function(i) {
unels <- L2[[i]] == "unknown"
L2[[i]][unels & L1[[i]][, "ok"]] <- "ok"
L2[[i]][unels & L1[[i]][, "alright"]] <- "alright"
L2[[i]][unels & L1[[i]][, " right"]] <- "right"
L2[[i]][unels & L1[[i]][, "correct"]] <- "correct"
L2[[i]][unels & L1[[i]][, "huh"]] <- "huh"
L2[[i]]
})
DF3a <- data.frame(ords = unlist(lapply(L1, "[", "orig.row.num")),
q.type = unlist(L2), stringsAsFactors = FALSE)
DF3a[unlist(lapply(L1, "[", "implied_do/does")), 2] <- "implied_do/does"
DF3 <- data.frame(DF, q.type = DF3a[order(DF3a[, "ords"]), 2])
names(DF3) <- c(G, "raw.text", "n.row", "endmark", "strip.text", "q.type")
unL2 <- unlist(L2)
unL2[unlist(lapply(L1, "[", "implied_do/does"))] <- "idd"
WFM <- t(wfm(unL2, rep(names(L1), sapply(L2, length))))
cols <- c(key[, "x"], "ok", "alright", "right", "correct", "huh", "idd", "unknown")
cols2 <- cols[cols %in% colnames(WFM)]
WFM <- WFM[, cols2, drop = FALSE]
if (all(grouping %in% "all")) {
DF <- as.data.frame(matrix(WFM, nrow = 1))
colnames(DF) <- colnames(WFM)
rownames(DF) <- "all"
} else {
grvar <- levels(DF[, "grouping"])
grvarNA <- grvar[!grvar %in% rownames(WFM)]
mat <- matrix(rep(0, length(grvarNA)*ncol(WFM)), ncol = ncol(WFM))
dimnames(mat) <- list(grvarNA, colnames(WFM))
DF <- data.frame(rbind(WFM, mat))
}
tq <- rowSums(DF)
comdcol <- list(
were = c("weren't", "were"),
was = c("wasn't", "was"),
does = c("doesn't", "does"),
did = c("didn't", "did"),
do = c("don't", "do"),
is = c("isn't","is"),
are = c("aren't", "are"),
will = c("won't", "will"),
should = c("shouldn't", "should"),
could = c("couldn't", "could"),
would = c("wouldn't", "would"),
can = c("can't", "can"),
has = c("hasn't", "has"),
have = c("haven't", "have"),
had = c("hadn't", "had")
)
if(!neg.cont & ncol(DF) > 1) {
ord <- c("whose", "whom", "who", "where", "what", "which",
"why", "when", "were", "was", "does", "did", "do",
"is", "are", "will", "how", "should", "could", "would",
"shall", "may", "might", "must", "can", "has", "have", "had")
comdcol <- lapply(comdcol, function(x) gsub("'", "", x))
DF <- qcombine(DF, comdcol)
ord <- c(ord[ord %in% colnames(DF)], "ok", "alright", "right",
"correct", "huh", "idd", "unknown")
DF <- DF[, ord[ord %in% colnames(DF)]]
}
colnames(DF)[colnames(DF) == "idd"] <- "implied_do/does"
DF <- data.frame(group=rownames(DF), tot.quest = tq, DF, row.names = NULL,
check.names = FALSE)
if(ncol(DF) == 3) {
warning(paste0("Text does not contain enough questions to give",
"an output of the class \"question_type\":\n",
" ...only counts are returned"))
return(DF)
}
DF <- DF[sort(DF[, "group"]), ]
colnames(DF)[1] <- G
yesap <- sapply(comdcol, "[", 1)
noap <- gsub("'", "", sapply(comdcol, "[", 1))
colnames(DF) <- mgsub(noap, yesap, colnames(DF))
DF2 <- as.matrix(DF[, -c(1:2)]/DF[, 2])
DF2[is.nan(DF2)] <- 0
if (percent) {
DF2 <- DF2*100
}
DF2 <- data.frame(DF[, c(1:2)], as.data.frame(DF2), check.names = FALSE,
row.names = NULL)
rownames(DF) <- NULL
rnp <- raw_pro_comb(DF[, -c(1:2)], DF2[, -c(1:2)], digits = digits,
percent = percent, zero.replace = zero.replace)
rnp <- data.frame(DF2[, 1:2], rnp, check.names = FALSE)
o <- list(raw = DF3, count = DF, prop = DF2, rnp = rnp,
missing = rows.removed, percent = percent, zero.replace = zero.replace)
class(o) <- "question_type"
o
}
#' Prints a question_type object
#'
#' Prints a question_type object
#'
#' @param x The question_type object
#' @param \ldots ignored
#' @S3method print question_type
#' @method print question_type
print.question_type <-
function(x, ...) {
WD <- options()[["width"]]
options(width=3000)
print(x$rnp)
options(width=WD)
}
#' Plots a question_type Object
#'
#' Plots a question_type object.
#'
#' @param x The question_type object.
#' @param label logical. If TRUE the cells of the heat map plot will be labeled
#' with count and proportional values.
#' @param lab.digits Integer values specifying the number of digits to be
#' printed if \code{label} is TRUE.
#' @param percent logical. If TRUE output given as percent. If FALSE the
#' output is proportion. If NULL uses the value from
#' \code{\link[qdap]{question_type}}. Only used if \code{label} is TRUE.
#' @param zero.replace Value to replace 0 values with. If NULL uses the value
#' from \code{\link[qdap]{question_type}}. Only used if \code{label} is TRUE.
#' @param \ldots Other arguments passed to qheat.
#' @method plot question_type
#' @export
plot.question_type <- function(x, label = FALSE, lab.digits = 1, percent = NULL,
zero.replace = NULL, ...) {
if (label) {
if (!is.null(percent)) {
if (percent != x$percent) {
DF <- as.matrix(x$prop[, -c(1:2)])
if (percent) {
DF <- DF*100
} else {
DF <- DF/100
}
x$prop <- data.frame(x$prop[, 1:2], DF, check.names = FALSE)
}
} else {
percent <- x$percent
}
if (is.null(zero.replace)) {
zero.replace <- x$zero.replace
}
rnp <- raw_pro_comb(x$count[, -c(1:2), drop = FALSE],
x$prop[, -c(1:2), drop = FALSE], digits = lab.digits,
percent = percent, zero.replace = zero.replace)
rnp <- data.frame(x$count[, 1:2], rnp, check.names = FALSE)
qheat(x$prop, values=TRUE, mat2 = rnp, ...)
} else {
qheat(x$prop, ...)
}
}