-
Notifications
You must be signed in to change notification settings - Fork 88
/
analytics.inc.vcl.erb
428 lines (373 loc) · 17.3 KB
/
analytics.inc.vcl.erb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
<% if @vcl_config.fetch('differential_privacy', false) -%>
include "analytics-dp-helper.vcl";
<% end -%>
/*****************************************************************************
* Varnish VCL for WMF-Last-Access Cookie
* Please see what this cookie is trying to accomplish:
* https://wikitech.wikimedia.org/wiki/Analytics/Unique_clients/Last_visit_solution
*
* General notes on timestamp format strings used here:
* "now" stringifies as "Wed, 01 Jan 2000 01:01:01 GMT", which is the same
* format used by Set-Cookie "Expires" data. The format for the last access
* value, and thus X-NowDay and X-WMF-LastStamp as well, is "01-Jan-2000"
* (because the other info is redundant or too-specific, and cookie values
* shouldn't have whitespace or commas).
*
* This file depends upon blocked-nets.inc.vcl. Include that first.
****************************************************************************/
/*****************************************************************************
* This must be called *before* any vcl_recv cookie munging. It more-properly
* belongs in _deliver, but putting it here avoids all of the issues
* surrounding consistent access to Cookie vs X-Orig-Cookie in vcl_deliver
* It does so at the cost of sending a pointless and unintended
* "X-WMF-LastStamp: 01-Jan-2000" header to the application layer as well on
* cache miss/bypass.
* Note we don't validate that the cookie's 3-letter month abbreviation is
* legal, or that the numeric values for the date/year are legal, just that
* they have the right count of the right kinds of characters.
****************************************************************************/
sub analytics_last_access_recv_ {
    // Copies a well-formed WMF-Last-Access cookie value ("01-Jan-2000" shape)
    // into req.http.X-WMF-LastStamp for later use in vcl_deliver.
    // The header stays unset when no well-formed cookie is present.
    unset req.http.X-WMF-LastStamp; // clear any sent by the user
    if (req.http.Cookie ~ "(^|;\s*)WMF-Last-Access=[0-9]{2}-[A-Za-z]{3}-[0-9]{4}(;|$)") {
        // Extract with the same strict date pattern used for validation.
        // A looser "[^;]+" capture behind a greedy prefix would pick up the
        // *last* WMF-Last-Access cookie in the header, which is not
        // necessarily the one that passed the check above (duplicate cookie
        // names), letting an unvalidated value reach X-Analytics.
        set req.http.X-WMF-LastStamp = regsub(
            req.http.Cookie,
            "^(?:.*;\s*)?WMF-Last-Access=([0-9]{2}-[A-Za-z]{3}-[0-9]{4})(?:;.*)?$",
            "\1"
        );
    }
}
sub analytics_last_access_global_recv_ {
    // Copies a well-formed WMF-Last-Access-Global cookie value
    // ("01-Jan-2000" shape) into req.http.X-WMF-LastGlobalStamp for later use
    // in vcl_deliver. The header stays unset when no well-formed cookie is
    // present.
    unset req.http.X-WMF-LastGlobalStamp; // clear any sent by the user
    if (req.http.Cookie ~ "(^|;\s*)WMF-Last-Access-Global=[0-9]{2}-[A-Za-z]{3}-[0-9]{4}(;|$)") {
        // Extract with the same strict date pattern used for validation, so
        // that a duplicate, malformed WMF-Last-Access-Global cookie later in
        // the header cannot be captured instead of the validated one.
        set req.http.X-WMF-LastGlobalStamp = regsub(
            req.http.Cookie,
            "^(?:.*;\s*)?WMF-Last-Access-Global=([0-9]{2}-[A-Za-z]{3}-[0-9]{4})(?:;.*)?$",
            "\1"
        );
    }
}
/*****************************************************************************
* !!! private to analytics_last_access_deliver !!!!
* Use a 12h-resolution expiry so people cannot infer an exact request time.
****************************************************************************/
sub set_last_access_cookie__ {
    // Appends a Set-Cookie header carrying WMF-Last-Access=<X-NowDay>.
    // The expiry is (now + 32 days) truncated down to a multiple of
    // 43200 seconds (12 hours); "now" is passed to std.time as the fallback.
    header.append(
        resp.http.Set-Cookie,
        "WMF-Last-Access=" + req.http.X-NowDay
            + ";Path=/;HttpOnly;secure;Expires="
            + std.time(std.time2integer(now + 32d, 0) / 43200 * 43200, now)
    );
}
/******************************************************************************
* Same as set_last_access_cookie__ but with Domain attribute
* (eg: Domain=.wikipedia.org)
******************************************************************************/
sub set_last_access_global_cookie__ {
    // Appends a Set-Cookie header carrying WMF-Last-Access-Global=<X-NowDay>.
    // The Domain attribute is derived from the Host header by keeping only
    // the last label before the configured top-level domain plus that TLD;
    // if the Host does not match the pattern, regsub leaves it unchanged.
    // Expiry: (now + 32 days) truncated down to a multiple of 43200 seconds.
    header.append(
        resp.http.Set-Cookie,
        "WMF-Last-Access-Global=" + req.http.X-NowDay
            + ";Path=/;Domain=."
            + regsub(req.http.Host, "^([a-z0-9-]+\.)*([a-z0-9-]+\.<%= @vcl_config.fetch('top_domain', 'org') %>)$", "\2")
            + ";HttpOnly;secure;Expires="
            + std.time(std.time2integer(now + 32d, 0) / 43200 * 43200, now)
    );
}
// Call from vcl_deliver near other X-Analytics code
sub analytics_last_access_deliver_ {
    // Reports the last-access stamps captured at recv time in X-Analytics
    // and (re)issues the corresponding cookies when they are missing or
    // were not stamped with today's date.

    // Today's date in "01-Jan-2000" form, derived from the string form of "now"
    set req.http.X-NowDay = regsub(now, "^..., (..) (...) (....) .*$", "\1-\2-\3");

    // --- WMF-Last-Access (skipped for api.wikimedia.org) ---
    if (req.http.Host != "api.wikimedia.org") {
        if (!req.http.X-WMF-LastStamp) {
            // no valid cookie came in: issue the initial one
            call set_last_access_cookie__;
        } else {
            set resp.http.X-Analytics = resp.http.X-Analytics
                + ";WMF-Last-Access=" + req.http.X-WMF-LastStamp;
            // refresh the cookie unless it already carries today's date
            if (req.http.X-NowDay != req.http.X-WMF-LastStamp) {
                call set_last_access_cookie__;
            }
        }
    }

    // --- WMF-Last-Access-Global (skipped for *.wikimedia.org hosts) ---
    if (req.http.Host !~ "\.wikimedia\.org$") {
        if (!req.http.X-WMF-LastGlobalStamp) {
            // no valid cookie came in: issue the initial one
            call set_last_access_global_cookie__;
        } else {
            set resp.http.X-Analytics = resp.http.X-Analytics
                + ";WMF-Last-Access-Global=" + req.http.X-WMF-LastGlobalStamp;
            // refresh the cookie unless it already carries today's date
            if (req.http.X-NowDay != req.http.X-WMF-LastGlobalStamp) {
                call set_last_access_global_cookie__;
            }
        }
    }

    // req.http.X-WMF-LastStamp and req.http.X-NowDay are deliberately left
    // in place: nothing else consumes the request headers at this point.
}
/*****************************************************************************
* Analytics for "wprov" Provenance data
* See https://www.mediawiki.org/wiki/Provenance for reserved values.
****************************************************************************/
sub analytics_provenance_recv_ {
    // Captures a non-empty "wprov" query parameter into
    // req.http.X-WMF-WPROV and strips it from req.url so that it does not
    // fragment the cache. Background discussion:
    // https://lists.wikimedia.org/pipermail/analytics/2015-February/003426.html
    if (req.url ~ "(?i)[?&]wprov=[^&]+") {
        // Keep only the parameter's value for X-Analytics tagging in vcl_deliver
        set req.http.X-WMF-WPROV = regsub(req.url, "(?i).+[?&]wprov=([^&]+).*", "\1");

        // Case 1: wprov is the final query argument -> drop it together
        // with its leading separator.
        set req.url = regsub(req.url, "(?i)[?&]wprov=[^&]+$", "");

        // Case 2: wprov is followed by another argument -> keep the leading
        // [?&] so the following parameter stays correctly attached.
        set req.url = regsub(req.url, "(?i)([?&])wprov=[^&]+&", "\1");
    }
}
sub analytics_provenance_deliver_ {
    // Tags X-Analytics with the provenance value, when
    // req.http.X-WMF-WPROV is present on the request.
    if (req.http.X-WMF-WPROV) {
        set resp.http.X-Analytics = resp.http.X-Analytics
            + ";wprov=" + req.http.X-WMF-WPROV;
    }
}
<% if @vcl_config.fetch('differential_privacy', false) -%>
/*****************************************************************************
* Differential Privacy Cookie
* See https://meta.wikimedia.org/wiki/Differential_privacy for more details.
****************************************************************************/
/**
* Receive and deliver client-side filtering for differential privacy — basically,
* we hash the page ID, truncate it to just three characters, check to see if
* it is already in the list, and append it to the list if not.
* After the 10th pageview is seen, we exclude all following pageviews.
* This process resets at midnight UTC.
* Ensure that DP data never leaves Varnish
*/
sub analytics_differential_privacy_recv_ {
    // Parses the differential-privacy cookies into request headers for
    // analytics_differential_privacy_deliver_ and strips WMF-DP from the
    // Cookie header so it is not forwarded beyond Varnish.
    unset req.http.X-Include-PV; // clear any sent by the user
    unset req.http.X-WMF-DP; // clear any sent by the user

    // Only touch the Cookie header when the client actually sent one.
    // Assigning the result of a regsub on an absent header would create an
    // empty "Cookie:" header that was not there before, which makes later
    // "no cookies at all" checks (!req.http.Cookie) fail.
    if (req.http.Cookie) {
        // "include_pv=0" cookie: auto-exclude, no further processing needed
        if (req.http.Cookie ~ "(^|;\s*)include_pv=0(;|$)") {
            // Set to exclude for use later in _deliver
            set req.http.X-Include-PV = "0";
        }

        // "WMF-DP=21f,90a,...123," cookie (between 1-10 three-char hex hashes)
        if (req.http.Cookie ~ "(^|;\s*)WMF-DP=([a-f0-9]{3},?){1,10}(;|$)") {
            // Extract with the same strict pattern used for validation so a
            // duplicate, malformed WMF-DP cookie later in the header cannot
            // be captured instead of the validated one.
            set req.http.X-WMF-DP = regsub(
                req.http.Cookie,
                "^(?:.*;\s*)?WMF-DP=((?:[a-f0-9]{3},?){1,10})(?:;.*)?$",
                "\1"
            );
        }

        // Remove every WMF-DP cookie (regsuball, not just the first match)
        // because we don't want this leaving varnish
        set req.http.Cookie = regsuball(
            req.http.Cookie,
            "(^|;\s*)WMF-DP=[^;]*",
            ""
        );
    }
}
/*****************************************************************************
* !!! private to analytics_differential_privacy_deliver_ !!!!
* Always expire at the next midnight UTC: add one day to "now" and truncate
* down to a whole day (86400 seconds). Adding only 12 hours before truncating
* yields *today's* 00:00 UTC for any request made before 12:00 UTC, i.e. an
* expiry in the past, which makes the browser discard the cookie immediately.
****************************************************************************/
sub set_differential_privacy_cookie__ {
    // Appends a Set-Cookie header storing the current hash list
    // (req.http.X-WMF-DP) in the WMF-DP cookie.
    header.append(resp.http.Set-Cookie,
        "WMF-DP="
        + req.http.X-WMF-DP
        + ";Path=/;HttpOnly;secure;Expires="
        + std.time(std.time2integer(now + 1d, 0) / 86400 * 86400, now)
    );
}
/*****************************************************************************
* Excludes all following pageview via include_pv=0 cookie
* Remove WMF-DP cookie
****************************************************************************/
sub override_differential_privacy_cookie__ {
    // Deletes the WMF-DP cookie and sets include_pv=0 until the next
    // midnight UTC.

    // RFC 6265 requires that the server sets the expire date in the past
    // to remove a cookie (https://www.rfc-editor.org/rfc/rfc6265.html#section-3.1)
    header.append(resp.http.Set-Cookie,
        "WMF-DP=;Path=/;HttpOnly;secure;Expires=Sun, 06 Nov 1994 08:49:37 GMT");

    // Expire at the next midnight UTC: (now + 1 day) truncated down to a
    // whole day. Using "now + 12h" here would truncate to today's 00:00 UTC
    // for any request before 12:00 UTC, i.e. a date in the past, and the
    // include_pv=0 cookie would be discarded by the browser right away.
    header.append(resp.http.Set-Cookie,
        "include_pv=0;Path=/;HttpOnly;secure;Expires="
        + std.time(std.time2integer(now + 1d, 0) / 86400 * 86400, now)
    );
}
sub analytics_differential_privacy_deliver_ {
    // Decides whether this response counts as an included pageview for
    // differential privacy, records the decision as include_pv=<0|1> in
    // X-Analytics and updates the WMF-DP / include_pv cookies accordingly.

    // If known exclude from _recv, pass along but no other action required
    if (req.http.X-Include-PV == "0") {
        set resp.http.X-Analytics = resp.http.X-Analytics + ";include_pv=0";
    } else {
        // Clear any X-Page-ID sent by the user *before* testing the response.
        // Doing this only inside the match branch below would let a
        // client-supplied X-Page-ID header survive on non-article responses
        // and drive the DP processing further down.
        unset req.http.X-Page-ID;

        // Article pageview (x-analytics has namespace=0 and page ID) so we should process for DP
        if (resp.http.X-Analytics ~ "(^|;\s*)ns=0(;|$)" && resp.http.X-Analytics ~ "(^|;\s*)page_id=(\d+).*(;|$)") {
            set req.http.X-Page-ID = regsub(
                resp.http.X-Analytics,
                ".*page_id=(\d+).*",
                "\1"
            );
        }

        if (req.http.X-Page-ID) { // pageview to article
            // If list doesn't exist, create it as an empty string
            if (!req.http.X-WMF-DP) {
                set req.http.X-WMF-DP = "";
            }

            // Default to not including the page view
            set req.http.X-Include-PV = "0";

            // If X-WMF-DP cookie has between 0 and 35 chars
            // Note: 35 chars because we're looking at 9 pages so far with 4 chars / page (3 char hex + comma)
            // Example: '1b6,2b6,3b6,4b6,5b6,6b6,7b6,8b6,9b6'
            if (req.http.X-WMF-DP ~ "^.{0,35}$") {
                // call helper function to hash the page ID
                // (defined outside this file; the code below reads its
                // result from req.http.X-Hash)
                call dp_hash_vcl;

                // If the hash is already in the list, X-Include-PV already set to 0 and cookie doesn't change
                // If the hash is not in the list, add it to the list, set X-Include-PV to 1, and update cookies
                if (!std.strstr(req.http.X-WMF-DP, req.http.X-Hash)) {
                    if (req.http.X-WMF-DP != "") { // avoid dangling commas
                        set req.http.X-WMF-DP = req.http.X-WMF-DP + "," + req.http.X-Hash;
                    } else {
                        set req.http.X-WMF-DP = req.http.X-Hash;
                    }
                    set req.http.X-Include-PV = "1";

                    // if this is 10th pageview, clear cookie and exclude this and all following pageviews
                    if (req.http.X-WMF-DP ~ "^.{39,}$") {
                        call override_differential_privacy_cookie__;
                    } else { // update WMF-DP cookie with new page ID
                        call set_differential_privacy_cookie__;
                    }
                }
            }

            // Set X-Analytics to include X-Include-PV
            set resp.http.X-Analytics = resp.http.X-Analytics
                + ";include_pv="
                + req.http.X-Include-PV;
        }
    }
}
<% end -%>
/*****************************************************************************
* Combined analytics recv and deliver hooks, to be included directly in
* vcl_recv and vcl_deliver in common wikimedia.vcl - these are the only
* "public" interfaces in this file!
****************************************************************************/
sub analytics_recv {
    // Flags cookie-related facts about the request for later reporting in
    // X-Analytics, then runs the provenance (wprov) recv handling.
    if (!req.http.Cookie) {
        // the request carried no Cookie header at all
        set req.http.X-WMF-NOCOOKIES = 1;
    } else if (req.http.Cookie ~ "([sS]ession|Token)=") {
        // a cookie whose name ends in "session", "Session" or "Token" is present
        set req.http.X-WMF-SESSIONCOOKIE = 1;
    }

    call analytics_provenance_recv_;
}
// Not all hosts using analytics_recv need to use the "WMF-Last-Access" cookie,
// so split this into its own subroutine.
sub analytics_last_access_recv {
    // Public recv hook for the last-access cookies (and, when enabled in
    // the template config, the differential-privacy cookies).
    call analytics_last_access_recv_;

    // The global last-access cookie is not used on *.wikimedia.org hosts
    if (req.http.Host !~ "\.wikimedia\.org$") {
        call analytics_last_access_global_recv_;
    }
<% if @vcl_config.fetch('differential_privacy', false) -%>
    call analytics_differential_privacy_recv_;
<% end -%>
}
sub analytics_deliver_pre {
    // Guarantees that resp.http.X-Analytics exists (as an empty string when
    // the response carried none) so later code can append ";key=value"
    // pairs without checking for presence each time. The header is tidied
    // up again at the end of the deliver processing.
    // If one k=v pair were mandatory, it could be set first instead and
    // this placeholder would be unnecessary.
    if (!resp.http.X-Analytics) {
        set resp.http.X-Analytics = "";
    }
}
sub analytics_deliver_last_access {
    // Public deliver hook: delegates to the private last-access handling.
    call analytics_last_access_deliver_;
}
sub analytics_deliver_post {
    // Appends the remaining ";key=value" tags to resp.http.X-Analytics and
    // finally normalises the header (drops it when empty, strips a leading
    // ";" otherwise). The order of the appends below determines the order
    // of the keys in the emitted header.
    call analytics_provenance_deliver_;
<% if @vcl_config.fetch('differential_privacy', false) -%>
    call analytics_differential_privacy_deliver_;
<% end -%>

    // Allowed values from the inbound (client-sent) X-Analytics header.
    // Only two are accepted at this time, pageview=1 and preview=1, and
    // they are mutually exclusive:
    // https://wikitech.wikimedia.org/wiki/X-Analytics#Keys
    // Any value clients may send in the future must be whitelisted here.
    if (req.http.X-Analytics ~ "(^|;)pageview=1(;|$)") {
        set resp.http.X-Analytics = resp.http.X-Analytics + ";pageview=1";
    } else if (req.http.X-Analytics ~ "(^|;)preview=1(;|$)") {
        set resp.http.X-Analytics = resp.http.X-Analytics + ";preview=1";
    }

    // Value of X-Trusted-Proxy, when that request header is present
    if (req.http.X-Trusted-Proxy) {
        set resp.http.X-Analytics = resp.http.X-Analytics
            + ";proxy=" + req.http.X-Trusted-Proxy;
    }

    // Presence of X-Forwarded-Proto is reported as https=1
    if (req.http.X-Forwarded-Proto) {
        set resp.http.X-Analytics = resp.http.X-Analytics + ";https=1";
    }

    // Value of X-WMF-UUID, when that request header is present
    if (req.http.X-WMF-UUID) {
        set resp.http.X-Analytics = resp.http.X-Analytics
            + ";wmfuuid=" + req.http.X-WMF-UUID;
    }

    // debug=1 lets analytics avoid counting these requests as real pageviews
    if (req.http.X-Wikimedia-Debug) {
        set resp.http.X-Analytics = resp.http.X-Analytics + ";debug=1";
    }

    // Client TCP source port for webrequest (T271953, T181368)
    if (req.http.X-Client-Port) {
        set resp.http.X-Analytics = resp.http.X-Analytics
            + ";client_port=" + req.http.X-Client-Port;
    }

    // proxy=IORG tag: "Via: Internet.org" usually, but not necessarily,
    // arrives through proxying. Since the request is tagged with Via and the
    // equipment is under Internet.org, it is tagged as a proxy regardless.
    if (req.http.Via ~ "(?i)Internet\.org") {
        set resp.http.X-Analytics = resp.http.X-Analytics + ";proxy=IORG";
    }

    // translationengine tag (Google Translate impact measurement), taken
    // from the X-WMF-Translation-Engine request header. Other request
    // characteristics, including the Referer, can be combined with it for
    // greater confidence.
    if (req.http.X-WMF-Translation-Engine) {
        set resp.http.X-Analytics = resp.http.X-Analytics
            + ";translationengine=" + req.http.X-WMF-Translation-Engine;
    }

    // Cookie facts flagged on the request by analytics_recv
    if (req.http.X-WMF-NOCOOKIES) {
        set resp.http.X-Analytics = resp.http.X-Analytics + ";nocookies=1";
    }
    if (req.http.X-WMF-SESSIONCOOKIE) {
        set resp.http.X-Analytics = resp.http.X-Analytics + ";sessioncookie=1";
    }

    if (req.http.X-Public-Cloud) {
        // Open question carried over from the original author: is this tag
        // still needed given that req.http.X-Public-Cloud already exists?
        set resp.http.X-Analytics = resp.http.X-Analytics + ";public_cloud=1";
    }

    // Matching requestctl patterns, comma separated (T305582)
    if (req.http.X-Requestctl && req.http.X-Requestctl != "") {
        // Drop a comma at the very start of the list, if present
        set req.http.X-Requestctl = regsub(req.http.X-Requestctl, "^,", "");
        set resp.http.X-Analytics = resp.http.X-Analytics
            + ";requestctl=" + req.http.X-Requestctl;
    }

    // Final tidy-up: remove the header if nothing was appended to the empty
    // placeholder, otherwise strip the leading ";" left by the first append.
    if (resp.http.X-Analytics == "") {
        unset resp.http.X-Analytics;
    } else {
        set resp.http.X-Analytics = regsub(resp.http.X-Analytics, "^;", "");
    }
}