-
Notifications
You must be signed in to change notification settings - Fork 7
/
bytes_added.py
157 lines (139 loc) · 6.15 KB
/
bytes_added.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from sqlalchemy import func, case, cast, Integer
from sqlalchemy.sql.expression import label
from wtforms.validators import Required
from wikimetrics.utils import thirty_days_ago, today
from wikimetrics.models import Revision, Page
from wikimetrics.forms.fields import (
BetterDateTimeField,
BetterBooleanField,
CommaSeparatedIntegerListField,
)
from timeseries_metric import TimeseriesMetric
class BytesAdded(TimeseriesMetric):
"""
This class implements bytes added logic.
An instance of the class is callable and will compute four different aggregations of
the bytes contributed or removed from a mediawiki instance:
* net_sum : bytes added minus bytes removed
* absolute_sum : bytes added plus bytes removed
* positive_only_sum : bytes added
* negative_only_sum : bytes removed
This is the sql query that sqlalchemy generates from our Bytes Added logic
SELECT anon_1.rev_user AS anon_1_rev_user,
sum(anon_1.byte_change) AS net_sum,
sum(abs(anon_1.byte_change)) AS absolute_sum,
sum(CASE
WHEN (anon_1.byte_change > 0)
THEN anon_1.byte_change
ELSE 0 END
) AS positive_only_sum,
sum(CASE
WHEN (anon_1.byte_change < 0)
THEN anon_1.byte_change
ELSE 0 END
) AS negative_only_sum
FROM (SELECT revision.rev_user AS rev_user,
( cast(revision.rev_len as signed)
- cast(coalesce(anon_2.rev_len, 0) as signed)
) AS byte_change
FROM revision
INNER JOIN
page ON page.page_id = revision.rev_page
LEFT OUTER JOIN
(SELECT revision.rev_id AS rev_id,
revision.rev_len AS rev_len
FROM revision
) AS anon_2 ON revision.rev_parent_id = anon_2.rev_id
WHERE page.page_namespace IN ('0')
AND revision.rev_user IN (3174352)
AND revision.rev_timestamp BETWEEN '2013-06-18' AND '2013-07-18'
) AS anon_1
GROUP BY anon_1.rev_user
"""
show_in_ui = True
id = 'bytes-added'
label = 'Bytes Added'
category = 'Content'
default_submetric = 'net_sum'
description = 'Compute different aggregations of the bytes\
contributed or removed from a mediawiki project'
# filled in below as the default depends on options
default_result = {}
namespaces = CommaSeparatedIntegerListField(
None,
[Required()],
default='0',
description='0, 2, 4, etc.',
)
positive_only_sum = BetterBooleanField(default=True)
negative_only_sum = BetterBooleanField(default=True)
absolute_sum = BetterBooleanField(default=True)
net_sum = BetterBooleanField(default=True)
def __call__(self, user_ids, session):
"""
Parameters:
user_ids : list of mediawiki user ids to find bytes added for
session : sqlalchemy session open on a mediawiki database
Returns:
dictionary from user ids to another dictionary with the following keys:
* net_sum : bytes added minus bytes removed
* absolute_sum : bytes added plus bytes removed
* positive_only_sum : bytes added
* negative_only_sum : bytes removed
"""
start_date = self.start_date.data
end_date = self.end_date.data
PreviousRevision = session.query(Revision.rev_len, Revision.rev_id).subquery()
query = session.query(
Revision.rev_user,
Revision.rev_timestamp,
label(
'byte_change',
cast(Revision.rev_len, Integer) -
cast(func.coalesce(PreviousRevision.c.rev_len, 0), Integer)
),
)\
.join(Page)\
.outerjoin(
PreviousRevision,
Revision.rev_parent_id == PreviousRevision.c.rev_id)\
.filter(Page.page_namespace.in_(self.namespaces.data))\
.filter(Revision.rev_timestamp > start_date)\
.filter(Revision.rev_timestamp <= end_date)
query = self.filter(query, user_ids)
BC = query.subquery()
bytes_added_by_user = session.query(BC.c.rev_user).group_by(BC.c.rev_user)
# add submetrics as columns to the output
submetrics = []
index = 1
if self.net_sum.data:
submetrics.append(('net_sum', index, 0))
bytes_added_by_user = bytes_added_by_user.add_column(
func.sum(BC.c.byte_change).label('net_sum')
)
index += 1
if self.absolute_sum.data:
submetrics.append(('absolute_sum', index, 0))
bytes_added_by_user = bytes_added_by_user.add_column(
func.sum(func.abs(BC.c.byte_change)).label('absolute_sum'),
)
index += 1
if self.positive_only_sum.data:
submetrics.append(('positive_only_sum', index, 0))
bytes_added_by_user = bytes_added_by_user.add_column(
func.sum(case(
[(BC.c.byte_change > 0, BC.c.byte_change)], else_=0
)).label('positive_only_sum'),
)
index += 1
if self.negative_only_sum.data:
submetrics.append(('negative_only_sum', index, 0))
bytes_added_by_user = bytes_added_by_user.add_column(
func.sum(case(
[(BC.c.byte_change < 0, BC.c.byte_change)], else_=0
)).label('negative_only_sum'),
)
index += 1
self.default_result = {s[0]: s[2] for s in submetrics}
query = self.apply_timeseries(bytes_added_by_user, column=BC.c.rev_timestamp)
return self.results_by_user(user_ids, query, submetrics, date_index=index)