/
language_model.R
196 lines (176 loc) · 7.13 KB
/
language_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#' k-gram Language Models
#'
#' @description
#'
#' Build a k-gram language model.
#'
#' ### Principal methods supported by objects of class \code{language_model}
#'
#' - \code{probability()}: compute word continuation and sentence probabilities.
#' See \link[kgrams]{probability}.
#'
#' - \code{sample_sentences()}: generate random text by sampling from the
#' language model probability distribution at arbitary temperature. See
#' \link[kgrams]{sample_sentences}.
#'
#' - \code{perplexity()}: Compute the language model perplexity on a test
#' corpus. See \link[kgrams]{perplexity}.
#'
#'
#' @author Valerio Gherardi
#' @md
#'
#'
#' @param object an object which stores the information required to build the
#' k-gram model. At present, necessarily a \code{kgram_freqs} object, or a
#' \code{language_model} object of which a copy is desired (see Details).
#' @param N a length one integer. Maximum order of k-grams to use in the language
#' model. This muss be less than or equal to the order of the underlying
#' \code{kgram_freqs} object.
#' @param smoother a length one character vector. Indicates the smoothing
#' technique to be applied to compute k-gram continuation probabilities. A list
#' of available smoothers can be obtained with \code{smoothers()}, and
#' further information on a particular smoother through
#' \code{info()}.
#' @param ... possible additional parameters required by the smoother.
#'
#' @return A \code{language_model} object.
#' @details
#' These generics are used to construct objects of class \code{language_model}.
#' The \code{language_model} method is only needed to create copies of
#' \code{language_model} objects (that is to say, new copies which are not
#' altered by methods which modify the original object in place,
#' see e.g. \link[kgrams]{parameters}). The discussion below focuses on
#' language models and the \code{kgram_freqs} method.
#'
#' \link[kgrams]{kgrams} supports several k-gram language models, including
#' Interpolated Kneser-Ney, Stupid Backoff and others
#' (see \link[kgrams]{smoothers}). The objects created by
#' \code{language_models()} have methods for computing word continuation and
#' sentence probabilities (see \link[kgrams]{probability}),
#' random text generation (see \link[kgrams]{sample_sentences})
#' and other type of language modeling tasks such as computing perplexities and
#' word prediction accuracies.
#'
#' Smoothers have often tuning parameters, which need to be specified by
#' (exact) name through the \code{...} arguments; otherwise,
#' \code{language_model()} will use default values and, once per session, throw
#' a warning. \code{info(smoother)} lists all parameters needed by a
#' specific smoother, together with their allowed parameter space.
#'
#' The run-time of \code{language_model()} may vary substantially for different
#' smoothing methods, depending on whether or not a method requires the
#' computation of additional quantities (that is to say, beyond k-gram counts)
#' for its operativity (this is, for instance, the case for the Kneser-Ney
#' smoother).
#' @examples
#' # Create an interpolated Kneser-Ney 2-gram language model
#'
#' freqs <- kgram_freqs("a a b a a b a b a b a b", 2)
#' model <- language_model(freqs, "kn", D = 0.5)
#' model
#' summary(model)
#' probability("a" %|% "b", model)
#'
#' # For more examples, see ?probability, ?sample_sentences and ?perplexity.
#'
#' @name language_model
#' @rdname language_model
#' @export
language_model <- function(object, ...)
UseMethod("language_model", object)
#' @rdname language_model
#' @export
language_model.language_model <- function(object, ...) {
cpp_freqs <- attr(object, "cpp_freqs")
smoother <- attr(object, "smoother")
args <- parameters(object)
N <- args[["N"]]
cpp_obj <- cpp_smoother_constructor(smoother, cpp_freqs, N, args)
new_language_model(
cpp_obj,
cpp_freqs,
attr(object, ".preprocess"),
attr(object, ".tknz_sent"),
smoother
)
}
#' @rdname language_model
#' @export
language_model.kgram_freqs <-
function(object, smoother = "ml", N = param(object, "N"), ...)
{
assert_positive_integer(N)
if (N > param(object, "N")) {
h <- "Invalid input"
x <- "'N' cannot be greater than 'param(object, \"N\")'."
rlang::abort(c(h, x = x), class = "kgrams_lm_max_order_error")
}
validate_smoother(smoother, ...)
args <- list(...)
for (parameter in list_parameters(smoother))
if (is.null(args[[parameter$name]]))
args[[parameter$name]] <- parameter$default
cpp_freqs <- attr(object, "cpp_obj")
cpp_obj <- cpp_smoother_constructor(smoother, cpp_freqs, N, args)
new_language_model(
cpp_obj,
cpp_freqs,
attr(object, ".preprocess"),
attr(object, ".tknz_sent"),
smoother
)
}
#----------------------------- printing methods -------------------------------#
#' @export
print.language_model <- function(x, ...) {
cat("A k-gram language model.\n")
return(invisible(x))
}
#' @export
summary.language_model <- function(object, ...) {
cat("A k-gram language model.\n\n")
cat("Smoother:\n")
cat("* '", attr(object, "smoother"), "'.\n", sep = "")
cat("\n")
cat("Parameters:\n")
for (name in names(parameters(object)))
cat("* ", name, ": ", param(object, name), "\n", sep = "")
cat("\n")
cat("Number of words in training corpus:\n")
cat("* W: ", attr(object, "cpp_freqs")$tot_words(), "\n", sep = "")
cat("\n")
cat("Number of distinct k-grams with positive counts:\n")
for (k in 1:param(object, "N"))
cat("* ", k, "-grams:", attr(object, "cpp_freqs")$unique(k),
"\n", sep = "")
return(invisible(object))
}
#' @export
str.language_model <- function(object, ...) summary(object)
#---------------------------------- internal ----------------------------------#
new_language_model <- function(
cpp_obj, cpp_freqs, .preprocess, .tknz_sent, smoother
)
{
structure(list(),
cpp_obj = cpp_obj,
cpp_freqs = cpp_freqs,
.preprocess = .preprocess,
.tknz_sent = .tknz_sent,
smoother = smoother,
class = c("language_model")
)
}
cpp_smoother_constructor <- function(smoother, cpp_freqs, N, args) {
switch(smoother,
sbo = new(SBOSmoother, cpp_freqs, N, args[["lambda"]]),
add_k = new(AddkSmoother, cpp_freqs, N, args[["k"]]),
ml = new(MLSmoother, cpp_freqs, N),
kn = new(KNSmoother, cpp_freqs, N, args[["D"]]),
mkn = new(mKNSmoother, cpp_freqs, N,
args[["D1"]], args[["D2"]], args[["D3"]]),
abs = new(AbsSmoother, cpp_freqs, N, args[["D"]]),
wb = new(WBSmoother, cpp_freqs, N)
)
}