-
Notifications
You must be signed in to change notification settings - Fork 16
/
highlight.R
229 lines (210 loc) · 9.14 KB
/
highlight.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# some special symbols (keywords a)
.keywords = c('FUNCTION', 'IF', 'ELSE', 'WHILE', 'FOR', 'IN', 'BREAK', 'REPEAT', 'NEXT', 'NULL_CONST')
.operators = c(
sprintf("'%s'", c('+', '-', '*', '/', '^', '$', '@', ':', '?', '~', '!')),
'SPECIAL', 'GT', 'GE', 'LT', 'LE', 'EQ', 'NE', 'AND', 'AND2', 'OR', 'OR2',
'NS_GET', 'NS_GET_INT'
)
.cmd.list = sort(c(
NUM_CONST = 'num', # numbers
SYMBOL_FUNCTION_CALL = 'kwd', # function calls
STR_CONST = 'sng', # character strings
COMMENT = 'com', # comment
SYMBOL_FORMALS = 'kwc', # function(formals)
SYMBOL_SUB = 'kwc', # FUN(args)
SLOT = 'kwc', # S4 slot
LEFT_ASSIGN = 'kwb', # assignment
EQ_ASSIGN = 'kwb',
RIGHT_ASSIGN = 'kwb',
setNames(rep('opt', length(.operators)), .operators),
setNames(rep('kwa', length(.keywords)), .keywords),
DEFAULT = 'def' # everything else
))
cmd_latex = data.frame(
cmd1 = paste('\\hl', .cmd.list, '{', sep = ''),
cmd2 = '}',
stringsAsFactors = FALSE,
row.names = names(.cmd.list)
)
cmd_latex = data.frame(
cmd1 = paste('\\hl', .cmd.list, '{', sep = ''),
cmd2 = '}',
stringsAsFactors = FALSE,
row.names = names(.cmd.list)
)
.cmd.pandoc = c(
num = 'DecValTok', kwd = 'KeywordTok', sng = 'StringTok', com = 'CommentTok',
kwc = 'DataTypeTok', kwb = 'NormalTok', opt = 'NormalTok', kwa = 'NormalTok',
def = 'NormalTok'
)
cmd_pandoc_latex = data.frame(
cmd1 = paste('\\', .cmd.pandoc[.cmd.list], '{', sep = ''),
cmd2 = '}',
stringsAsFactors = FALSE,
row.names = names(.cmd.list)
)
cmd_html = data.frame(
cmd1 = paste('<span class="hl ', .cmd.list, '">', sep = ''),
cmd2 = '</span>',
stringsAsFactors = FALSE,
row.names = names(.cmd.list)
)
# merge code and markups; use standard markups on unknown tokens
merge_cmd = function(pdata, cmd) {
res = cmd[pdata$token, ]
idx = is.na(res[, 1])
res[idx, 1] = cmd['DEFAULT', 1]
res[idx, 2] = cmd['DEFAULT', 2]
res[is.na(res)] = '' # if DEFAULT is undefined in the markup data frame
res
}
#' Syntax highlight an R code fragment
#'
#' This function \code{\link{parse}}s the R code, fetches the tokens in it
#' (\code{\link{getParseData}}), and attach syntax highlighting commands onto
#' them. With proper style definitions for these commands (such as colors or
#' font styles), the R code will be syntax highlighted in the LaTeX/HTML output.
#' The two functions \code{hi_latex} and \code{hi_html} are wrappers of
#' \code{hilight} for LaTeX and HTML output, respectively.
#'
#' For the \code{markup} data frame, the first column is put before the R
#' tokens, and the second column is behind; the row names of the data frame must
#' be the R token names; a special row is named \code{DEFAULT}, which contains
#' the markup for the standard tokens (i.e. those that do not need to be
#' highlighted); if missing, the built-in data frames \code{highr:::cmd_latex}
#' and \code{highr:::cmd_html} will be used.
#'
#' This function only binds markups onto R tokens, and the real syntax
#' highlighting must be done with style definitions, which is out of the scope
#' of this package. It was designed to be used as the syntax highlighting
#' infrastructure of other packages such as \pkg{knitr}, where the colors and
#' font styles are properly defined in the LaTeX preamble and HTML header.
#' @param code a character string (the R source code)
#' @param format the output format
#' @param markup a data frame of two columns containing the markup commands
#' @param prompt whether to add prompts to the code
#' @param fallback whether to use the fallback method, i.e. the regular
#' expression based method; this method is not precise and only highlights a
#' few types of symbols such as comments, strings and functions;
#' \code{fallback} will be set to \code{TRUE} when the input \code{code} fails
#' to be \code{\link{parse}d}
#' @param ... arguments to be passed to \code{hilight()}
#' @author Yihui Xie and Yixuan Qiu
#' @seealso See the package vignettes \code{browseVignettes('highr')} for how
#' this function works internally.
#' @return A character vector for the syntax highlighted code.
#' @examples library(highr)
#' hilight("x=1 # assignment")
#'
#' txt = c("a <- 1 # something", 'c(y="world", z="hello")', 'b=function(x=5) {',
#' 'for(i in 1:10) {
#' if (i < x) print(i) else break}}',
#' "z@@child # S4 slot", "'special chars <>#$%&_{}'")
#' cat(hi_latex(txt), sep = '\n')
#' cat(hi_html(txt), sep = '\n')
#'
#' # the markup data frames
#' highr:::cmd_latex; highr:::cmd_html
#' @import utils
#' @export
hilight = function(code, format = c('latex', 'html'), markup, prompt = FALSE, fallback = FALSE) {
if (length(code) == 0) return(code)
format = match.arg(format)
if (missing(markup) || is.null(markup))
markup = if (format == 'latex') cmd_latex else cmd_html
escape_fun = if (format == 'latex') escape_latex else escape_html
if (!fallback && !xfun::valid_syntax(code, silent = FALSE)) {
# the code is not valid, so you must use the fallback mode
warning('the syntax of the source code is invalid; the fallback mode is used')
fallback = TRUE
}
if (!prompt) return(
(if (fallback) hi_naive else hilight_one)(code, format, markup, escape_fun)
)
p1 = escape_fun(getOption('prompt')); p2 = escape_fun(getOption('continue'))
std = unlist(markup['DEFAULT', ])
if (!any(is.na(std))) {
p1 = paste0(std[1], p1, std[2]); p2 = paste0(std[1], p2, std[2])
}
code = xfun::split_source(code)
sapply(mapply(hilight_one, code, MoreArgs = list(format, markup, escape_fun),
SIMPLIFY = FALSE, USE.NAMES = FALSE),
function(x) paste0(rep(c(p1, p2), c(1L, length(x) - 1L)), x, collapse = '\n'))
}
# highlight one expression
hilight_one = function(code, format, markup, escape_fun) {
# the data frames do not need factors in this function; need to keep source
op = options(stringsAsFactors = FALSE, keep.source = TRUE); on.exit(options(op))
p = parse_source(code)
z = utils::getParseData(p)
if (NROW(z) == 0L || !any(z$terminal)) return(code)
z = z[z$terminal, ]
# record how empty lines before/after the code
one = paste(code, collapse = '\n')
r1 = '^(\\s*)\n.*'; r2 = '^.*?\n(\\s*)$'
s1 = if (grepl(r1, one)) gsub(r1, '\\1', one)
s2 = if (grepl(r2, one)) gsub(r2, '\\1', one)
res = cbind(z[, c('line1', 'col1', 'line2', 'col2', 'text')], merge_cmd(z, markup))
# escape special LaTeX/HTML chars
res$text = escape_fun(res$text)
# record how many blank lines after each token
blanks = c(pmax(res$line1[-1] - res$line2[-nrow(res)] - 1, 0), 0)
# add line breaks to the 8th column
res = cbind(res, strrep('\n', blanks))
# e.g. a string spans across multiple lines; now need to replace line1 with
# line2 so that we know the starting and ending positions of spaces; e.g. turn
# line/col numbers 1 5 2 6 into 2 5 2 6
for (i in which(res$line1 != res$line2)) {
res$line1[res$line1 == res$line1[i]] = res$line2[i]
}
out = lapply(split(res, res$line1), function(d) {
# merge adjacent tokens of the same type so that the output is cleaner
empty = matrix(FALSE, nrow = nrow(d), ncol = 2)
for (i in seq_len(nrow(d) - 1)) {
if (all(d[i, 6:7] == d[i + 1, 6:7])) empty[i + 1, 1] = empty[i, 2] = TRUE
}
d[, 6:7][empty] = ''
col = as.matrix(d[, c('col1', 'col2')])
# add 0 and remove col[n, 2] to get start/end positions of spaces
col = matrix(head(c(0, t(col)), -1), ncol = 2, byrow = TRUE)
paste(strrep(' ', col[, 2] - col[, 1] - 1), d[, 6], d[, 'text'], d[, 7],
d[, 8], sep = '', collapse = '')
})
c(s1, unlist(out, use.names = FALSE), s2)
}
#' @export
#' @rdname hilight
hi_latex = function(code, ...) hilight(code, 'latex', ...)
#' @export
#' @rdname hilight
hi_html = function(code, ...) hilight(code, 'html', ...)
#' A wrapper to Andre Simon's Highlight
#'
#' This function calls Highlight to syntax highlight a code fragment.
#' @param code a character string of the source code
#' @param language the input language (c, cpp, python, r, ...); see
#' \code{system('highlight -p')}
#' @param format the output format (html, latex, ...)
#' @references Andre Simon's Highlight package
#' \url{https://gitlab.com/saalen/highlight}.
#' @return A character string for the syntax highlighted code.
#' @export
#' @examples \dontrun{hi_andre('1+1', language='R')
#' hi_andre('void main() {\nreturn(0)\n}', language='c', format='latex')}
hi_andre = function(code, language, format = 'html') {
h = Sys.which('highlight')
os = Sys.info()[['sysname']]
# highlight on Linux Mint can be something else
# on OS10 with highlight installed using Homebrew it's often in /usr/local/bin
if (!nzchar(h) || (h == '/usr/local/bin/highlight' && os != 'Darwin' &&
!file.exists(h <- '/usr/bin/highlight')))
stop('please first install highlight from https://gitlab.com/saalen/highlight')
f = basename(tempfile('code', '.'))
writeLines(code, f); on.exit(unlink(f))
cmd = sprintf('%s -f -S %s -O %s %s', shQuote(h), correct_lang(language), format, f)
system(cmd, intern = TRUE)
}
# to help knitr engines decide the highlight language
correct_lang = function(x) {
switch(x, Rcpp = 'cpp', tikz = 'latex', Rscript = 'R', fortran = 'f', stan = 'R', x)
}