// text-frontend.inc.vcl.erb
// Varnish VCL include file for text frontends
include "text-common.inc.vcl";
include "zero.inc.vcl";
include "geoip.inc.vcl";
// Note that analytics.inc.vcl will set an X-Analytics value of proxy=IORG
// without inspecting whether there is an existing proxy=<proxy> key-value
// pair inside X-Analytics. We do this because if the traffic had come from
// a known proxy (e.g., Opera or Nokia), that would imply that Internet.org
// was not the rightmost endpoint. In time we will need to add the notion of
// proxy chaining, to record whether a request both came through a known
// proxy and carried Via: Internet.org with a corresponding unknown rightmost
// endpoint in X-Forwarded-For (the rightmost Internet.org endpoint has an
// unpredictable Internet-facing IP address), even if that is the only value
// there, as in the example of traffic sourced directly by satellite or
// something similar.
sub mobile_redirect {
    // Redirects desktop-hostname page views from mobile-looking clients to
    // the matching mobile hostname. The redirect itself is raised as the
    // internal pseudo-status 666, with the target stashed in
    // req.http.Location for the error handler to turn into a real response.

    // Stage 1: only read requests that carry no X-Subdomain marker
    // (cluster_fe_recv_pre_purge sets that marker for m./zero. hostnames).
    if (!req.http.X-Subdomain && (req.request == "GET" || req.request == "HEAD")) {

        // Stage 2: client looks mobile (User-Agent keywords, vendor
        // prefixes, or a WML Accept header), has not opted out through a
        // cookie, and is not a Samsung smart TV.
        if ((req.http.User-Agent ~ "(?i)(mobi|240x240|240x320|320x320|alcatel|android|audiovox|bada|benq|blackberry|cdm-|compal-|docomo|ericsson|hiptop|htc[-_]|huawei|ipod|kddi-|kindle|meego|midp|mitsu|mmp\/|mot-|motor|ngm_|nintendo|opera.m|palm|panasonic|philips|phone|playstation|portalmmm|sagem-|samsung|sanyo|sec-|semc-browser|sendo|sharp|silk|softbank|symbian|teleca|up.browser|vodafone|webos)"
                || req.http.User-Agent ~ "^(?i)(lge?|sie|nec|sgh|pg)-"
                || req.http.Accept ~ "vnd.wap.wml")
            && req.http.Cookie !~ "(stopMobileRedirect=true|mf_useformat=desktop)"
            && req.http.User-Agent !~ "(SMART-TV.*SamsungBrowser)") {

            // Stage 3: the URL is a page view, i.e. /wiki/..., one of the
            // listed language-variant prefixes, or a bare ?title= query.
            if (req.url ~ "^/(wiki|(gan|ike|iu|kk|ku|shi|sr|tg|uz|zh)(-[a-z]+)?)[/\?]"
                || req.url ~ "^/(w/index\.php)?\?title=[^&]*$") {

                // Derive the mobile hostname. One regsub per hostname family
                // is cheaper than "if host ~" plus regsub; the patterns must
                // not overlap or feed into one another.
                set req.http.MobileHost = req.http.Host;
                // (www.)mediawiki|wikisource|wikidata.* -> m.<project>.*
                set req.http.MobileHost = regsub(req.http.MobileHost, "^(www\.)?(mediawiki|wikisource|wikidata)\.", "m.\2.");
                // <name>.wikimedia.* -> <name>.m.wikimedia.*
                set req.http.MobileHost = regsub(req.http.MobileHost, "^(commons|incubator|legalteam|meta|office|outreach|pl|species|strategy|wikimania201[2-5])\.wikimedia\.", "\1.m.wikimedia.");
                // <lang>.<project>.* -> <lang>.m.<project>.*, skipping the
                // listed special subdomains and hosts already starting "m."
                set req.http.MobileHost = regsub(req.http.MobileHost, "^((?!commons|meta|nostalgia|quote|quality|sep11|sources|species|textbook|m\b)\w+)\.(wikipedia|wiktionary|wikinews|wikisource|wikiquote|wikibooks|wikiversity|wikivoyage)\.", "\1.m.\2.");

                if (req.http.Host == req.http.MobileHost) {
                    // No rewrite applied: drop the scratch header and carry on.
                    unset req.http.MobileHost;
                } else {
                    // Plain http unless X-Forwarded-Proto names a scheme.
                    set req.http.Location = "http://" + req.http.MobileHost + req.url;
                    if (req.http.X-Forwarded-Proto) {
                        set req.http.Location = req.http.X-Forwarded-Proto + "://" + req.http.MobileHost + req.url;
                    }
                    error 666 "Found";
                }
            }
        }
    }
}
sub cluster_fe_recv_pre_purge {
    // Early request handling: reject known junk with a 403, drop Range,
    // classify mobile/zero hostnames, and normalise RESTBase redirect=
    // parameters, before the path is normalised for purging.

    // Forged crawler UAs on zerodot. Incidentally this also catches most of
    // the lazywebtools traffic matched by the referer rule below.
    if (req.http.host ~ "zero\.wikipedia\.org") {
        if (req.http.User-Agent && req.http.User-Agent ~ "Facebookbot|Googlebot") {
            error 403 "Noise";
        }
    }

    // Auto-refresh sites identified by referer.
    if (req.http.referer) {
        if (req.http.referer ~ "^http://(www\.(keeprefreshing|refreshthis|refresh-page|urlreload)\.com|tuneshub\.blogspot\.com|itunes24x7\.blogspot\.com|autoreload\.net|www\.lazywebtools\.co\.uk)/") {
            error 403 "Noise";
        }
    }

    // POSTs probing for a com_jce imgmanager endpoint.
    if (req.request == "POST") {
        if (req.url ~ "index\.php\?option=com_jce&task=plugin&plugin=imgmanager&file=imgmanager&method=form&cid=") {
            error 403 "Noise";
        }
    }

    // FIXME: Range requests interact badly with gzip/gunzip, so they are
    // disabled for now.
    unset req.http.Range;

    // Everything below runs once per client request, not again on restarts.
    if (req.restarts == 0) {
        // Start from a clean slate so these markers are always either
        // freshly set here or absent.
        unset req.http.x-dt-host; // desktop host, if mobile hostname on request
        unset req.http.X-Subdomain;
        unset req.http.X-Orig-Cookie;

        // Classify the hostname: ZERO for *.zero.* (other than the
        // zero.wikimedia.<top_domain> host itself), otherwise M for *.m.*
        if (req.http.host ~ "^([a-z0-9-]+\.)?zero\." && req.http.host != "zero.wikimedia.<%= @vcl_config.fetch('top_domain') %>") {
            set req.http.X-Subdomain = "ZERO";
        } else if (req.http.host ~ "^([a-z0-9-]+\.)?m\.") {
            set req.http.X-Subdomain = "M";
        }

        // Carrier tagging and desktop-host mapping apply to mobile
        // subdomains only.
        if (req.http.X-Subdomain) {
            // tag_carrier runs on the first start only, and only for
            // (m|zero).wikipedia hosts.
            if (req.http.host ~ "^([a-z0-9-]+\.)?(m|zero)\.wikipedia\.") {
                call tag_carrier;
            }

            // x-dt-host carries the desktop equivalent of the mobile host.
            // General rule: <language>.(m|zero).<project>.org becomes
            // <language>.<project>.org ...
            set req.http.x-dt-host = regsub(req.http.host, "^([a-z0-9-]+)\.(m|zero)\.", "\1.");
            // ... and the language-less mobile hosts are mapped explicitly.
            if (req.http.host == "m.mediawiki.org") {
                set req.http.x-dt-host = "www.mediawiki.org";
            } else if (req.http.host == "m.wikimediafoundation.org") {
                set req.http.x-dt-host = "wikimediafoundation.org";
            } else if (req.http.host == "m.wikisource.org") {
                set req.http.x-dt-host = "wikisource.org";
            } else if (req.http.host == "m.wikidata.org") {
                set req.http.x-dt-host = "www.wikidata.org";
            }

            // RESTBase content is identical on desktop and mobile hostnames,
            // so collapse to the desktop host for a single cache object.
            // This changes the Host: header, and with it the restbase url
            // rewrite elsewhere that reads req.http.host.
            if (req.url ~ "^/api/rest_v1/") {
                set req.http.host = req.http.x-dt-host;
            }
        }

        // X-RB-NOREDIR: redirect=false optimization (T134464).
        // RB returns the same body with or without ?redirect=false; the
        // parameter only switches wikitext redirect responses from
        // 302 + Location (default) to 200 without Location. Stripping the
        // parameter here and applying the transform at deliver time lets
        // both variants share one cache object.
        unset req.http.X-RB-NOREDIR; // do not let clients interfere!
        if (req.url ~ "^/api/rest_v1/.*[?&]redirect=") {
            // Pull out the raw redirect= value, then reduce it to a flag:
            // "1" for false/no/0 (any case), absent for anything else.
            set req.http.X-RB-NOREDIR = regsub(req.url, "^.+[?&]redirect=([^&]+).*$", "\1");
            if (req.http.X-RB-NOREDIR !~ "(?i)^(false|no|0)$") {
                unset req.http.X-RB-NOREDIR;
            } else {
                set req.http.X-RB-NOREDIR = "1";
            }

            // Strip redirect=X from req.url to avoid cache fragmentation.
            // The two substitutions cover distinct cases and must run in
            // this order:
            // (1) redirect= is the last query argument: drop it outright.
            set req.url = regsub(req.url, "[?&]redirect=[^&]+$", "");
            // (2) another argument follows: keep the leading [?&] so the
            //     following argument stays correctly delimited.
            set req.url = regsub(req.url, "([?&])redirect=[^&]+&", "\1");
        }
    }

    // Normalize paths before purging
    call text_normalize_path;
}
sub cluster_fe_recv {
    // Main receive-side logic for the text frontends. In order: answers a
    // known-buggy UA, short-circuits the legacy bits domain, applies the
    // mobile redirect, normalises static-asset hostnames, validates
    // short URLs, drops If-Modified-Since for freshly logged-out users,
    // samples weak-TLS clients for a security redirect, and finally hands
    // over to text_common_recv.

    // Experiment on dealing with a buggy UA that's spamming requests in T141786
    if (req.http.User-Agent ~ "Windows NT .*Chrome/41\.0\.2272\.76" && req.url == "/") {
        <%= error_synth(741, "Buggy request, please report at https://phabricator.wikimedia.org/T141786") -%>
    }

    // BITS: legacy bits.wm.o domain support. /event.gif is answered with an
    // empty 204; everything else on that host goes straight to lookup and
    // skips the rest of this sub.
    if (req.http.host == "<%= @vcl_config.fetch('bits_domain') %>") {
        if (req.url ~ "^/event\.gif") {
            error 204;
        }
        return (lookup);
    }

    call mobile_redirect;

    // normalize all /static to the same hostname for caching
    if (req.url ~ "^/static/") { set req.http.host = "<%= @vcl_config.fetch("static_host") %>"; }

    // normalize all /w/static.php to the same wiki host for caching
    // ignore urls without hash query as those are affected by multiversion
    if (req.url ~ "^/w/(skins|resources|extensions)/.+\?[a-fA-F0-9]+$" ) {
        set req.http.host = "<%= @vcl_config.fetch("static_host") %>";
    }

    // shortener URLs can be validated and restricted here: anything on the
    // shortener domain whose path is not made up solely of the allowed
    // characters is answered with a 404.
    if (req.http.host == "<%= @vcl_config.fetch('shortener_domain') %>") {
        if (req.url !~ "^/[23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz$_]*$") {
            <%= error_synth(404, "Short URL Not Found") -%>
        }
    }

    // Users that just logged out, should not get a 304 for their
    // (locally cached) logged in pages.
    if (req.http.If-Modified-Since && req.http.Cookie ~ "LoggedOut") {
        unset req.http.If-Modified-Since;
    }

    // The idea here is to occasionally redirect some of our worst clients
    // (in TLS terms: those using 3DES, the bulk of which are not
    // forward-secret, and the bulk of which are also IE[78]-XP) to our
    // wikitech page about browser security, hoping it will prod them
    // to upgrade.
    // The X-Connection-Properties cipher filter matches roughly 0.2% of
    // traffic. By restricting to a UA string matching 'Windows NT' (for
    // now), the percentage drops a bit more. We then further restrict to
    // GETs of wiki pages, avoid the mobile domains JIC, and ignore clients
    // that already have a cookie meaning we've done this to them once
    // before during this browser session. After all the basic conditions
    // are met, we then only apply the redirect to 2% of matching traffic
    // randomly. We may increase that a bit in the future and/or widen the
    // UA filter, but this is a good starting point.
    //
    // The mobile-host test accepts "m." either at the start of the hostname
    // or after a dot, so that the language-less mobile hosts
    // (m.mediawiki.org, m.wikidata.org, m.wikisource.org,
    // m.wikimediafoundation.org) are skipped as well as <lang>.m.<project>.
    if (req.http.X-Connection-Properties ~ "DES-CBC3"
        && req.http.User-Agent ~ "Windows NT" && req.request == "GET"
        && req.url ~ "^/wiki/" && req.http.host !~ "(^|\.)m\."
        && req.http.Cookie !~ "Browser-Security=Awful"
        && std.random(0,100) < 2.0) {
        <%= error_synth(787, "Browser Security Warning Redirect") -%>
    }

    call text_common_recv;
}
// Hash hook for the text frontends. Adds nothing of its own; it only
// delegates to text_common_hash, which is not defined in the visible source
// (presumably provided by text-common.inc.vcl -- confirm).
sub cluster_fe_hash {
call text_common_hash;
}
// Cache-hit hook for the text frontends: empty, no frontend-specific work
// is done on a hit. Presumably it exists so the shared caller can invoke it
// unconditionally -- confirm against the caller.
sub cluster_fe_hit { }
// Cache-miss hook for the text frontends. Delegates to
// text_common_misspass_restore_cookie (not defined in the visible source);
// cluster_fe_pass calls the same routine.
sub cluster_fe_miss {
call text_common_misspass_restore_cookie;
}
// Pass hook for the text frontends. Delegates to
// text_common_misspass_restore_cookie (not defined in the visible source);
// cluster_fe_miss calls the same routine.
sub cluster_fe_pass {
call text_common_misspass_restore_cookie;
}
<% if @varnish_version4 -%>
// Backend-fetch hook for the text frontends: empty, no frontend-specific
// work is done here. The template emits this sub only when
// @varnish_version4 is set.
sub cluster_fe_backend_fetch { }
<% end -%>
sub cluster_fe_backend_response {
    // Backend-response hook for the text frontends: runs the shared text
    // logic, then applies the frontend admission policy below.
    call text_common_backend_response;

    // Four-hit-wonder experiment. A 200 response for a request marked as a
    // miss (X-CDIS) is still delivered, but no cache entry of any kind is
    // created for it unless X-Cache-Int from the immediate backend ends in
    // hit/4 or higher. In effect the frontend starts caching an object on
    // its 5th access across all frontends in this DC.
    if (beresp.status == 200 && bereq.http.X-CDIS == "miss") {
        if (beresp.http.X-Cache-Int !~ " hit/([4-9]|[0-9]{2,})$") {
            set beresp.ttl = 0s;
<%- if @varnish_version4 -%>
            set beresp.uncacheable = true;
            return (deliver);
<%- else -%>
            return (hit_for_pass);
<%- end -%>
        }
    }

    return (deliver);
}
sub cluster_fe_deliver {
    // Deliver hook for the text frontends: zero handling, the deliver-time
    // half of the RESTBase redirect=false optimisation, Cache-Control
    // rewriting for wiki pages, and the GeoIP cookie.
    call zero_deliver;

    // Other half of X-RB-NOREDIR (see cluster_fe_recv_pre_purge). The flag
    // is present when the client asked for redirect=false and the parameter
    // was stripped from the cached URL.
    if (req.http.X-RB-NOREDIR) {
        if (resp.status == 301) {
            // Title redirect: keep the client's original preference by
            // putting redirect=false back on the target, using "?" or "&"
            // as appropriate, unless a redirect= argument is already there.
            if (resp.http.Location !~ "[?]") {
                set resp.http.Location = resp.http.Location + "?redirect=false";
            } else if (resp.http.Location !~ "[?&]redirect=") {
                set resp.http.Location = resp.http.Location + "&redirect=false";
            }
        } else if (resp.status == 302) {
            // Wikitext redirect: hand back the body as a plain 200 with no
            // Location header.
            set resp.status = 200;
            set resp.response = "OK";
            unset resp.http.Location;
        }
    }

    // Strip s-maxage Cache-Control of wiki pages. The s-maxage still applies
    // to Varnish (sent by MediaWiki $wgUseSquid, sends purges internally),
    // but pages mustn't be cached elsewhere.
    // NOTE: Language variant URLs are not currently covered by these
    // regexps. Instead of writing regexps for every edge-case, we should
    // impose some order and coherence on our URL routing schemes.
    // NOTE: Only apply to pages. Don't steal cachability of api.php,
    // load.php, etc. (T102898, T113007). CentralNotice banner special pages
    // are exempt.
    if ((req.url ~ "^/wiki/" || req.url ~ "^/w/index\.php" || req.url ~ "^/\?title=")
        && req.url !~ "^/(wiki/|(w/index\.php)?\?title=)Special:Banner") {
        set resp.http.Cache-Control = "private, s-maxage=0, max-age=0, must-revalidate";
    }

    // Perform the GeoIP look-up and send the result as a session cookie when
    // either:
    //  - no GeoIP cookie with a value is present in X-Orig-Cookie or
    //    Cookie, or
    //  - one of them carries the old IPv6 no-data value, which needs fixing.
    if ((req.http.X-Orig-Cookie !~ "(^|;\s*)GeoIP=[^;]"
            && req.http.Cookie !~ "(^|;\s*)GeoIP=[^;]")
        || req.http.X-Orig-Cookie ~ "(^|;\s*)GeoIP=:::::v6"
        || req.http.Cookie ~ "(^|;\s*)GeoIP=:::::v6") {
        call geoip_cookie;
    }
}
sub cluster_fe_err_synth {
    // Turns the internal pseudo-status codes raised elsewhere in this file
    // into real client responses. Every handled case ends in
    // return (deliver); any other status falls out of this sub untouched.

    if (req.http.host == "<%= @vcl_config.fetch('bits_domain') %>" && obj.status == 204) {
        // BITS: legacy bits.wm.o domain support (204 for /event.gif)
        set obj.http.Connection = "keep-alive";
        return (deliver);
    } else if (obj.status == 666) {
        // Mobile redirect: the target was stashed in req.http.Location
        set obj.status = 302;
        set obj.http.Location = req.http.Location;
        set obj.http.Content-Length = "0"; // BZ #62245
        set obj.http.Connection = "keep-alive";
        return (deliver);
    } else if (obj.status == 741) {
        // Chrome/41-on-Windows: T141786. Answer with a Basic-auth challenge
        // whose realm text asks the user to report the problem.
        set obj.http.WWW-Authenticate = {"Basic realm="Buggy request, please report at https://phabricator.wikimedia.org/T141786""};
        set obj.status = 401;
        return (deliver);
    } else if (obj.status == 787) {
        // Browser security redirect. The cookie prevents repeats within the
        // same session; for simplicity it is per-site.
        set obj.status = 302;
        set obj.http.Location = "https://wikitech.wikimedia.org/wiki/HTTPS:_Browser_Recommendations";
        set obj.http.Set-Cookie = "Browser-Security=Awful; Path=/; secure";
        set obj.http.Content-Length = "0"; // BZ #62245
        set obj.http.Connection = "keep-alive";
        return (deliver);
    }
}