# Import

In [8]:
from functools import reduce
from pathlib import Path
import time
import datetime

import pandas as pd
import requests
from wmfdata import hive, spark
from wmfdata.utils import print_err, pd_display_all

import warnings
warnings.filterwarnings('ignore')


# Parameters

In [9]:
# Path of the TSV file that holds (or will hold) the saved metrics
FILENAME = "metrics/readers_metrics.tsv"

# The metrics month is the most recent complete calendar month: stepping back
# one day from the first of the current month lands on its last day.
# The mediawiki_history snapshot must be from the metrics month or later.
first_of_current_month = datetime.date.today().replace(day=1)
last_month = first_of_current_month - datetime.timedelta(days=1)

METRICS_MONTH_TEXT = f"{last_month:%Y-%m}"
MEDIAWIKI_HISTORY_SNAPSHOT = f"{last_month:%Y-%m}"



In [10]:
# Sanity check: display the last day of the previous month (the same expression that defines `last_month`)
datetime.date.today().replace(day=1) - datetime.timedelta(days=1)

datetime.date(2023, 8, 31)

# Preparation

In [11]:
# Convert our metrics month to all the formats we need and provide them in a dict
# so we can easily use them to format strings
metrics_month = METRICS_MONTH_TEXT
date_params = {
    "mediawiki_history_snapshot": MEDIAWIKI_HISTORY_SNAPSHOT,
    "metrics_month": str(metrics_month),
    # Derive the first day from `last_month` itself. Computing it as
    # "today minus 31 days" lands two months back when the notebook is run in
    # the first days of a month that follows a month shorter than 31 days
    # (e.g. on 1 March it yields 1 January instead of 1 February).
    "metrics_month_first_day": str(last_month.replace(day=1)),
    "metrics_month_last_day": str(last_month),
    "metrics_year": last_month.year,
    "metrics_cur_month": last_month.month,
}

# Load any previously saved results, indexed by month.
# A missing file is not an error: there is simply nothing to combine with yet.
try:
    previous_results = pd.read_csv(FILENAME, sep="\t", parse_dates=["month"])
except FileNotFoundError:
    old_metrics = None
else:
    old_metrics = previous_results.set_index("month")
    
def prepare_query(filename):
    """Return the text of the query file `filename` with its `{placeholder}`
    fields filled in from the module-level `date_params` dict.

    Raises whatever `Path.read_text` raises if the file cannot be read, and
    `KeyError` if the file uses a placeholder that `date_params` lacks.
    """
    query_template = Path(filename).read_text()
    return query_template.format(**date_params)

In [12]:
date_params

{'mediawiki_history_snapshot': '2023-08',
 'metrics_month': '2023-08',
 'metrics_month_first_day': '2023-08-01',
 'metrics_month_last_day': '2023-08-31',
 'metrics_year': 2023,
 'metrics_cur_month': 8}

# MariaDB and Hive query metrics


In [13]:
# Code to suppress PySpark warning messages.
# NOTE(review): this repeats the `warnings.filterwarnings('ignore')` call already
# made in the import cell. It only affects Python-level warnings; the Spark
# `WARN` log lines visible in the query cell's output are still printed.
from warnings import filterwarnings
filterwarnings("ignore")

In [14]:
# Each entry maps a query name to the HiveQL file that defines it and the
# engine it should run on. Every current query lives at queries/<name>.hql and
# runs on Hive; add an explicit entry afterwards for anything that differs.
#
# Deliberately not included:
#   - global_south_pageviews / global_south_previews: global south metrics are
#     calculated in notebook 02b-diversity-calculation.ipynb
#   - mobile-heavy_pageviews / mobile-heavy_previews: mobile-heavy metrics are
#     no longer tracked/reported on in the key product metrics presentations
hive_query_names = (
    "pageviews",
    "automated_pageviews",
    "page_previews",
    "unique_devices",
)

queries = {
    name: {"file": f"queries/{name}.hql", "engine": "hive"}
    for name in hive_query_names
}


# Run every configured query and store its result on the matching `queries` entry
for key, val in queries.items():
    query = prepare_query(val["file"])
    engine = val["engine"]
    print_err("Running {} on {}...".format(key, engine))

    if engine == "mariadb":
        # `mariadb` is not among the notebook's imports, so import it here;
        # without this the branch fails with a NameError as soon as a
        # MariaDB query is configured.
        from wmfdata import mariadb
        result = mariadb.run(query)
    elif engine == "hive":
        # Queries marked "hive" are executed through Spark
        result = spark.run(query)
    else:
        raise ValueError("Unknown engine specified.")

    # Parse the `month` column into datetimes before storing the result
    result = result.assign(month=lambda df: pd.to_datetime(df["month"]))
    val["result"] = result

Running pageviews on hive...



SPARK_HOME: /usr/lib/spark3
Using Hadoop client lib jars at 3.2.0, provided by Spark.
PYSPARK_PYTHON=/opt/conda-analytics/bin/python3


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/21 23:03:21 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/09/21 23:03:34 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
23/09/21 23:03:39 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
Running automated_pageviews on hive...                                          





Running page_previews on hive...                                                





Running unique_devices on hive...                                               





# Combining and saving metrics

In [15]:
# Assemble list of result dataframes
results = [val["result"] for val in queries.values()]

# Merge them all, assuming that the month is the only common column, then
# set the month as a sorted index so combine_first works properly.
# Interactions = previews seen + total pageviews, computed as a vectorized
# column sum instead of a row-by-row `apply`.
new_metrics = (
    reduce(lambda left, right: pd.merge(left, right, how="outer"), results)
    .set_index("month")
    .sort_index()
    .assign(interactions=lambda df: df["previews_seen"] + df["total_pageview"])
)


In [16]:
# With no previous file, the new results are the full metrics table; otherwise
# new values take precedence and previously saved metrics fill in whatever the
# new results do not cover.
metrics = new_metrics if old_metrics is None else new_metrics.combine_first(old_metrics)

pd_display_all(metrics.tail(10))

Unnamed: 0_level_0,automated_pageviews,desktop,interactions,mobileweb,previews_seen,total_pageview,unique_devices
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-11-01,3125762000.0,6639885018,21036940000.0,12391464131,1725951000.0,19310984429,1602483000.0
2022-12-01,1514616000.0,6245363227,18534310000.0,10336993739,1645032000.0,16889273891,1580337000.0
2023-01-01,2058892000.0,6655879078,20529100000.0,11696154738,1853753000.0,18675346647,1623196000.0
2023-02-01,2484894000.0,5809895389,18961410000.0,11232689898,1643540000.0,17317864773,1556624000.0
2023-03-01,2851128000.0,6554145894,21108270000.0,12446354588,1808851000.0,19299418022,1666366000.0
2023-04-01,1835848000.0,5903055420,18578360000.0,10744497885,1635800000.0,16942562523,1653133000.0
2023-05-01,2788857000.0,6136981565,20868260000.0,12693678053,1738922000.0,19129336129,1596438000.0
2023-06-01,2535726000.0,5642808989,19569770000.0,12093286214,1548887000.0,18020883276,1522910000.0
2023-07-01,2765634000.0,5557997700,20108820000.0,12691626287,1547944000.0,18560871345,1515395000.0
2023-08-01,2979971000.0,5671123170,20544790000.0,12970986524,1590671000.0,18954115181,1528966000.0


In [17]:
# Make sure the output directory exists before writing: on a first run there is
# no previous metrics file, and the directory may not have been created yet.
Path(FILENAME).parent.mkdir(parents=True, exist_ok=True)
metrics.to_csv(FILENAME, sep="\t")