-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgmi-html-atom.scm
More file actions
269 lines (235 loc) · 10.8 KB
/
gmi-html-atom.scm
File metadata and controls
269 lines (235 loc) · 10.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
;;;; gmi-html-atom.scm - Convert gemtext .gmi file to an .atom file for HTML version.
;;;
;;; This needs to run in the same directory as the blog indexes it is reading.
;;;
;;; References:
;;; Subscribing to Gemini pages -- https://gemini.circumlunar.space/docs/companion/subscription.gmi
;;; RFC 4287 The Atom Syndication Format -- https://www.rfc-editor.org/info/rfc4287
;;; RFC 8288 Web Linking -- https://www.rfc-editor.org/info/rfc8288
;;; RFC 3339 Date and Time on the Internet: Timestamps -- https://www.rfc-editor.org/info/rfc3339
;;; RFC 3987 Internationalized Resource Identifiers (IRIs) -- https://www.rfc-editor.org/info/rfc3987
;;; RFC 8141 Uniform Resource Names (URNs) -- https://www.rfc-editor.org/info/rfc8141
;;; W3C Feed Validation Service -- https://validator.w3.org/feed/
(module gmi-html-atom ()
(import (scheme))
(import (utf8))
(import (chicken base))
(import (chicken file posix))
(import (chicken io))
(import (chicken irregex))
(import (chicken pathname))
(import (chicken port))
(import (chicken process-context))
(import (args))
(import (loop))
(import (schemepunk show))
(import (srfi 1))
(import (srfi 19))
(import (srfi 152))
(import (html-parser))
(import (sxpath))
(import (uri-generic))
(import (tkurtbond))
(define-syntax dbg
(syntax-rules ()
((_ e1 e2 ...)
(when *debugging*
e1 e2 ...
(flush-output (current-error-port))))))
(define (dfmt . args)
(apply show (cons (current-error-port) args)))
(define link-irx (string->irregex "^=> *(.+) +([0-9]{4}-[0-9]{2}-[0-9]{2}) ([0-9]{2}:[0-9]{2}:[0-9]{2})([-+][0-9]{2}:[0-9]{2}) +- *(.+)"))
(define (string-max s1 s2)
(if (string>=? s1 s2)
s1
s2))
(define (string-empty? s)
(= 0 (string-length s)))
(define (make-date-time date time tz)
(string-append date "T" time tz))
(define (drop-directories number-to-drop pathname)
(let*-values (((directory filename extension)
(decompose-pathname pathname)))
(if directory
(let*-values (((base-origin base-directory directory-elements)
(decompose-directory directory)))
(let* ((directories (drop directory-elements number-to-drop)))
(make-pathname directories filename extension)))
pathname)))
(define url-scheme-irx
(sre->irregex '(seq bos alphabetic (* (or alphabetic numeric #\+ #\- #\.)))))
(define (make-absolute directory href)
(let* ((url (cadr href))
(blog-uri (uri-reference (string-append *base-url* directory)))
(uri (uri-reference url)))
(when (relative-ref? uri)
(set-cdr! href (uri->string (uri-relative-to uri blog-uri))))))
(define (process-entry link)
(let* ((m (irregex-search link-irx link))
(gmi-relative-href (irregex-match-substring m 1))
(html-relative-href (pathname-replace-extension gmi-relative-href
".html"))
(href (string-append *base-url*
html-relative-href))
(blog-relative (string-append (pathname-directory
html-relative-href)
"/"))
(date (irregex-match-substring m 2))
(time (irregex-match-substring m 3))
(tz (irregex-match-substring m 4))
(title (irregex-match-substring m 5))
(updated (make-date-time date time tz))
(sxml (with-input-from-file html-relative-href
html->sxml))
(main ((sxpath '(html body main)) sxml))
(_ (loop for href in ((sxpath '(// main // a @ href))
sxml)
do (make-absolute blog-relative href)))
(_ (loop for src in ((sxpath '(// main // img @ src))
sxml)
do (make-absolute blog-relative src)))
;; This leaves BR tags as "<br></br>". I could do
;; (string-join (string-split html "<br></br>") "<br/>")
;; but I'm not sure it is worth the effort. I wonder if that's what
;; causes the extra blank lines... Nope.
(html (sxml->html main))
(html (string-join (string-split html "<br></br>") "<br/>"))
(content (html-escape html)))
(show #t " <entry>" nl)
(show #t " <title>" title "</title>" nl)
(show #t " <link href=\"" href "\"/>" nl)
(show #t " <id>" href "</id>" nl)
(show #t " <updated>" updated "</updated>" nl)
(show #t " <content type=\"html\">" nl)
(show #t content nl)
(show #t " </content>" nl)
(show #t " </entry>" nl nl)))
(define (process-feed update-date gmi-pathname title links)
(let* ((relative-gmi-pathname (drop-directories *directories-to-drop*
gmi-pathname))
(relative-html-filename (pathname-replace-extension
relative-gmi-pathname ".html"))
(relative-html-atom-filename
(pathname-replace-extension
(pathname-replace-file
gmi-pathname
(string-append (pathname-file gmi-pathname) "-html"))
"atom"))
(html-url (string-append *base-url* relative-html-filename))
(atom-url (string-append *base-url* relative-html-atom-filename))
)
(show #t "<?xml version=\"1.0\" encoding=\"utf-8\"?>" nl)
(show #t "<feed xml:lang=\"en\" xmlns=\"http://www.w3.org/2005/Atom\">" nl)
(show #t " <title>" title "</title>" nl)
(show #t " <link rel=\"alternate\" type=\"text/html\" href=\"" html-url "\"/>" nl)
(show #t " <link rel=\"self\" type=\"application/atom+xml\" href=\""
atom-url "\"/>" nl)
(show #t " <updated>" update-date "</updated>" nl)
(show #t " <author>" nl)
(show #t " <name>" *author-name* "</name>" nl)
(show #t " </author>" nl)
(show #t " <id>" atom-url "</id>" nl)
(show #t " <rights>Copyright © " (date->string (current-date) "~Y") " "
*author-name* "</rights>" nl)
(show #t " <generator uri=\"https://github.com/tkurtbond/microblog/blob/main/Tools/gmi-html-atom.scm\">gmi-html-atom</generator>" nl nl)
(loop for link in links do (process-entry link))
(show #t "</feed>" nl)))
(define (generate-atom gmi-pathname)
(let* ((input-port (open-input-file gmi-pathname))
;; We need to generate atom feeds for both the blog version
;; on the web and the glog version on geminispace. (Right
;; now we only do the atom feed for the HTML blog version.)
;; So we add "-html" to the gmi pathname's filename for the
;; atom output, to indicate that all the links in it point to
;; the web version with the HTML files. When we get output
;; the glog version done we'll add "-gmi" to that version,
;; indicating that all the links in it point to the
;; geminispace version with gemtext files.
(atom-pathname (pathname-replace-file
gmi-pathname
(string-append (pathname-file gmi-pathname) "-html")))
(atom-pathname (pathname-replace-extension atom-pathname "atom"))
(_ (info 1 "atom-pathname: ~s~%" atom-pathname))
(output-port (open-output-file atom-pathname))
(title #f)
(links '())
(update-date #f))
(define (process-line line)
(unless title
(when (string-prefix? "# " line)
(set! title (string-copy line 2))))
(let ((m (irregex-search link-irx line)))
(cond (m
(let* ((date (irregex-match-substring m 2))
(time (irregex-match-substring m 3))
(tz (irregex-match-substring m 4))
(title (irregex-match-substring m 5))
(date-time (make-date-time date time tz)))
(if update-date
(set! update-date (string-max update-date date-time))
(set! update-date date-time))
(list line)))
(else '()))))
(parameterize ((current-input-port input-port)
(current-output-port output-port)
)
(let ((links (loop for line = (read-line) then (read-line)
until (eof-object? line)
nconc (process-line line))))
(assert title 'generate-atom "Missing title")
(assert update-date 'generate-atom "Never found the most recent update")
(process-feed update-date gmi-pathname title links)
(close-input-port input-port)
(close-output-port output-port)
))))
;; Variables for command line options.
(define *author-name* #f)
(define *base-url* #f)
(define *directories-to-drop* 0)
(define (usage)
(with-output-to-port (current-error-port)
(lambda ()
(print "Usage: " (program-name) " [options...] [files...]")
(newline)
(print (args:usage +command-line-options+))
(show #t "Current argv: " (written (argv)) nl)))
(exit 1))
(define +command-line-options+
(list (args:make-option
(a author) #:required "Author name"
(set! *author-name* arg))
(args:make-option
(b base) #:required "Base URL for blog"
(set! *base-url* arg))
(args:make-option
(D drop) #:required
"Drop ARG directory names from the start of relative links."
(set! *directories-to-drop* (string->number arg)))
(args:make-option
(h help) #:none "Print usage message and exit."
(usage))
(args:make-option
(P prefix) #:required "Prefix to make links absolute."
(set! *prefix* arg))
(args:make-option
(v verbose) #:none "Increase the level of verbosity of information messages"
(increase-verbosity))
))
(define (main)
(receive (options operands) (args:parse (command-line-arguments)
+command-line-options+)
(unless *author-name* (die 127 "Author name was not specified~%"))
(unless *base-url* (die 127 "Base url was not specified~%"))
(loop for filename in operands
do (let-values (((directory filename extension)
(decompose-pathname filename)))
(let* ((directory (if (string=? directory ".") #f directory))
(filename (make-pathname directory filename extension)))
(generate-atom filename))))))
;; Only invoke main if this has been compiled. That way we can load the
;; module into csi and debug it.
(cond-expand
(compiling
(main))
(else))
)