Skip to content

Commit

Permalink
Merge "add. 'pages created' metric."
Browse files Browse the repository at this point in the history
  • Loading branch information
rfaulk authored and Gerrit Code Review committed Apr 19, 2013
2 parents 9b1282d + e23dfc3 commit 85562e6
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 0 deletions.
3 changes: 3 additions & 0 deletions user_metrics/api/engine/request_meta.py
Expand Up @@ -280,6 +280,7 @@ def __new__(cls):
varMapping('look_ahead',
'look_ahead')],
'survival': common_params,
'pages_created': common_params,
'threshold': common_params + [varMapping('n', 'n')],
'time_to_threshold': common_params +
[varMapping('threshold_type', 'threshold_type_class')],
Expand Down Expand Up @@ -316,6 +317,7 @@ def map(request_meta):
from user_metrics.metrics.namespace_of_edits import NamespaceEdits, \
namespace_edits_sum
from user_metrics.metrics.live_account import LiveAccount, live_accounts_agg
from user_metrics.metrics.pages_created import PagesCreated


# Registered metrics types
Expand All @@ -330,6 +332,7 @@ def map(request_meta):
'edit_rate': EditRate,
'namespace_edits': NamespaceEdits,
'live_account': LiveAccount,
'pages_created': PagesCreated,
}

# @TODO: let metric types handle this mapping themselves and obsolete this
Expand Down
122 changes: 122 additions & 0 deletions user_metrics/metrics/pages_created.py
@@ -0,0 +1,122 @@

__author__ = {'Evan Rosen': 'erosen@wikimedia.org'}
__date__ = "April 14th, 2013"
__license__ = "GPL (version 2 or later)"

from user_metrics.config import logging

import os
import user_metrics.utils.multiprocessing_wrapper as mpw
import user_metric as um
from user_metrics.etl.aggregator import decorator_builder, boolean_rate
from user_metrics.metrics import query_mod
from user_metrics.metrics.users import UMP_MAP


class PagesCreated(um.UserMetric):
    """
        Skeleton class for "PagesCreated" metric:

            `https://meta.wikimedia.org/wiki/Research:Metrics/pages_created`

        This metric computes the number of pages created by each user.

        As a UserMetric type this class utilizes the process() function
        attribute to produce an internal list of metrics by
        user handle (typically ID but user names may also be specified).
        The execution of process() produces a nested list that
        stores in each element:

            * User ID
            * Number of pages created by user
    """

    # Structure that defines parameters for PagesCreated class.  Both maps
    # are empty: this metric declares no parameters of its own.
    _param_types = {
        'init': {},
        'process': {}
    }

    # Define the metrics data model meta.  Values are column indices into a
    # result row as laid out by header(): 0 -> user_id, 1 -> pages_created.
    _data_model_meta = {
        'id_fields': [0],
        'date_fields': [],
        'float_fields': [],
        'integer_fields': [1],
        'boolean_fields': [],
    }

    # Columns that are summed when results are aggregated (the single
    # integer column, i.e. the page count).
    _agg_indices = {
        'list_sum_indices': _data_model_meta['integer_fields'],
    }

    @um.pre_metrics_init
    def __init__(self, **kwargs):
        """ Forwards all keyword arguments to the UserMetric constructor. """
        super(PagesCreated, self).__init__(**kwargs)

    @staticmethod
    def header():
        """ Returns the column names of a result row. """
        return ['user_id', 'pages_created']

    @um.UserMetric.pre_process_metric_call
    def process(self, users, **kwargs):
        """
            Computes the metric for ``users`` and stores the rows produced
            by ``_process_help`` in ``self._results``.

            Parameters:
                - **users**: user handles handed to the worker pool.
                - **kwargs**: not read in this method body (presumably
                  consumed by the ``pre_process_metric_call`` decorator --
                  TODO confirm).

            Returns this metric object.
        """

        # Process results
        args = self._pack_params()
        self._results = mpw.build_thread_pool(users, _process_help,
                                              self.k_, args)
        return self


def _process_help(args):
    """
        Used by PagesCreated::process() for forking.
        Should not be called externally.

        Parameters:
            - **args**: indexable pair; ``args[0]`` holds the users handled
              by this worker and ``args[1]`` the packed metric state that
              ``UserMetric._unpack_params`` turns back into parameters.

        Returns a list of ``(user_id_as_str, pages_created)`` tuples.
        Users whose query fails or yields no usable count are left out of
        the list and counted as dropped.
    """

    # Unpack args
    users = args[0]
    state = args[1]

    metric_params = um.UserMetric._unpack_params(state)

    if metric_params.log_:
        logging.info(__name__ + ' :: Processing pages created data ' +
                     '(%s users) by user... (PID = %s)' % (
                         len(users), os.getpid()))
        logging.info(__name__ + ' :: ' + str(metric_params))

    # only proceed if there is user data
    if not len(users):
        return []

    results = list()
    dropped_users = 0
    umpd_obj = UMP_MAP[metric_params.group](users, metric_params)
    for t in umpd_obj:
        uid = long(t.user)
        try:
            count = query_mod.pages_created_query(uid,
                                                  metric_params.project,
                                                  metric_params)
        except query_mod.UMQueryCallError:
            dropped_users += 1
            continue

        # The count is read from the first column of the first row.  A
        # result that is not indexable (TypeError) or has no row at all
        # (IndexError) is treated as a dropped user rather than aborting
        # the whole worker.
        try:
            results.append((str(uid), count[0][0]))
        except (TypeError, IndexError):
            dropped_users += 1

    if metric_params.log_:
        logging.info(__name__ + '::Processed PID = %s. '
                                'Dropped users = %s.' % (
                                    os.getpid(), str(dropped_users)))

    return results


# ==========================
# DEFINE METRIC AGGREGATORS
# ==========================

# TODO - add sum, median, mean, min, and max aggregators
32 changes: 32 additions & 0 deletions user_metrics/query/query_calls_sql.py
Expand Up @@ -702,6 +702,26 @@ def get_latest_user_activity(users, project, args):
get_latest_user_activity.__query_name__ = 'get_latest_user_activity'


@query_method_deco
def pages_created_query(uid, project, args):
    """
        Prepares the "pages created" query for the user with ID "uid".

        Parameters:
            - **uid**: indexable user handle; only ``uid[0]`` is used.
            - **project**: not read in this body (presumably consumed by
              ``query_method_deco`` -- TODO confirm).
            - **args**: object exposing ``namespace``, ``datetime_start``
              and ``datetime_end``.

        Returns the pair ``(query, params)``: the query text with its
        ``where`` token replaced by the namespace condition, and the
        values bound to ``user``, ``start`` and ``end``.
    """

    template = query_store[pages_created_query.__query_name__]

    # Work on a copy of the namespace argument so the caller's object is
    # left untouched by the formatting step.
    namespace_clause = format_namespace(deepcopy(args.namespace))
    sql = sub_tokens(template, where=namespace_clause)

    bindings = dict(
        user=int(uid[0]),
        start=str(args.datetime_start),
        end=str(args.datetime_end),
    )
    return sql, bindings
pages_created_query.__query_name__ = 'pages_created_query'


# QUERY DEFINITIONS
# #################

Expand Down Expand Up @@ -923,4 +943,16 @@ def get_latest_user_activity(users, project, args):
WHERE rev_user in (<users>)
GROUP BY 1
""",
pages_created_query.__query_name__:
"""
SELECT count(*)
FROM <database>.revision
JOIN <database>.page
ON rev_page = page_id
WHERE rev_parent_id = 0
AND <where>
AND rev_user = %(user)s
AND rev_timestamp > %(start)s
AND rev_timestamp <= %(end)s
""",
}

0 comments on commit 85562e6

Please sign in to comment.