__author__ = {'Evan Rosen': 'erosen@wikimedia.org'}
__date__ = "April 14th, 2013"
__license__ = "GPL (version 2 or later)"

from user_metrics.config import logging

import os
import user_metrics.utils.multiprocessing_wrapper as mpw
import user_metric as um
from user_metrics.etl.aggregator import decorator_builder, boolean_rate
from user_metrics.metrics import query_mod
from user_metrics.metrics.users import UMP_MAP


class PagesCreated(um.UserMetric):
    """
        Skeleton class for "PagesCreated" metric:

            `https://meta.wikimedia.org/wiki/Research:Metrics/pages_created`

        This metric computes the number of pages created by a user, as
        reported by ``query_mod.pages_created_query``.

        As a UserMetric type this class utilizes the process() function
        attribute to produce an internal list of metrics by
        user handle (typically ID but user names may also be specified).
        The execution of process() produces a nested list that
        stores in each element:

            * User ID
            * Number of pages created by user
    """

    # Structure that defines parameters for PagesCreated class
    _param_types = {
        'init': {},
        'process': {}
    }

    # Define the metrics data model meta
    _data_model_meta = {
        'id_fields': [0],
        'date_fields': [],
        'float_fields': [],
        'integer_fields': [1],
        'boolean_fields': [],
    }

    # Columns that list-sum aggregation operates on (the integer count)
    _agg_indices = {
        'list_sum_indices': _data_model_meta['integer_fields'],
    }

    @um.pre_metrics_init
    def __init__(self, **kwargs):
        super(PagesCreated, self).__init__(**kwargs)

    @staticmethod
    def header():
        """ Column names matching the tuples produced by process(). """
        return ['user_id', 'pages_created']

    @um.UserMetric.pre_process_metric_call
    def process(self, users, **kwargs):
        """
            Compute the metric for ``users`` and store it in
            ``self._results``.

            The user list is handed to ``mpw.build_thread_pool`` with
            ``_process_help`` as the worker, together with ``self.k_``
            (presumably the worker count -- confirm against
            multiprocessing_wrapper) and the packed metric parameters.

            Returns ``self`` so that calls may be chained.
        """

        # Process results
        args = self._pack_params()
        self._results = mpw.build_thread_pool(users, _process_help,
                                              self.k_, args)
        return self


def _process_help(args):
    """
        Used by PagesCreated::process() for forking.
        Should not be called externally.

        Parameters
            - **args** - sequence whose first element is the list of
              users handled by this worker and whose second element is
              the packed metric parameter state.

        Returns a list of ``(user id as str, pages created)`` tuples.
        Users whose query fails, or whose result cannot be read, are
        skipped and counted as dropped.
    """

    # Unpack args
    users = args[0]
    state = args[1]

    metric_params = um.UserMetric._unpack_params(state)

    if metric_params.log_:
        logging.info(__name__ + ' :: Processing pages created data ' +
                     '(%s users) by user... (PID = %s)' % (
                         len(users), os.getpid()))
        logging.info(__name__ + ' :: ' + str(metric_params))

    # only proceed if there is user data
    if not len(users):
        return []

    results = list()
    dropped_users = 0
    umpd_obj = UMP_MAP[metric_params.group](users, metric_params)
    for t in umpd_obj:
        uid = long(t.user)
        try:
            count = query_mod.pages_created_query(uid,
                                                  metric_params.project,
                                                  metric_params)
        except query_mod.UMQueryCallError:
            dropped_users += 1
            continue

        try:
            results.append((str(uid), count[0][0]))
        except (TypeError, IndexError):
            # TypeError: the result is not subscriptable (e.g. None).
            # IndexError: the result set came back empty.
            # Either way drop this user rather than abort the batch.
            dropped_users += 1

    if metric_params.log_:
        logging.info(__name__ + '::Processed PID = %s. '
                                'Dropped users = %s.' % (
                                    os.getpid(), str(dropped_users)))

    return results


# ==========================
# DEFINE METRIC AGGREGATORS
# ==========================

# TODO - add sum, median, mean, min, and max aggregators
= page_id + WHERE rev_parent_id = 0 + AND + AND rev_user = %(user)s + AND rev_timestamp > %(start)s + AND rev_timestamp <= %(end)s + """, }