# Import

In [1]:
import datetime
import time
from functools import reduce
from pathlib import Path

import pandas as pd
import requests
from wmfdata import hive, mariadb, spark
from wmfdata.utils import print_err, pd_display_all

# Parameters

In [2]:
# Path of the TSV file that holds (or will hold) the saved metrics
FILENAME = "metrics/metrics.tsv"

# The metrics month is the last completed calendar month: one day before the
# first day of the current month always falls on that month's final day.
# The mediawiki_history snapshot must be from the metrics month or later.
last_month = datetime.date.today().replace(day=1) + datetime.timedelta(days=-1)

# Both values use the same "YYYY-MM" rendering of the metrics month
METRICS_MONTH_TEXT = "{:%Y-%m}".format(last_month)
MEDIAWIKI_HISTORY_SNAPSHOT = "{:%Y-%m}".format(last_month)



# Preparation

In [3]:
# Convert our metrics month to all the formats we need and provide them in a dict
# so we can easily use them to format strings
metrics_month = METRICS_MONTH_TEXT
date_params = {
    "mediawiki_history_snapshot": MEDIAWIKI_HISTORY_SNAPSHOT,
    "metrics_month": str(metrics_month),
    # Derive the first day from last_month itself. Counting back 31 days from
    # today lands in January when run on 1-3 March (1-2 March in a leap year),
    # because February is shorter than 31 days, which would make the first and
    # last day refer to different months.
    "metrics_month_first_day": str(last_month.replace(day=1)),
    "metrics_month_last_day": str(last_month),
    "metrics_year": last_month.year,
    "metrics_cur_month": last_month.month,
}

# Load results saved by a previous run, indexed by month.
# old_metrics is None when the metrics file does not exist yet.
try:
    old_metrics = pd.read_csv(FILENAME, sep="\t", parse_dates=["month"])
except FileNotFoundError:
    old_metrics = None
else:
    old_metrics = old_metrics.set_index("month")
    
def prepare_query(filename):
    """Read the query template stored at `filename` and return its text with
    every `{placeholder}` filled in from the module-level `date_params` dict.
    """
    template = Path(filename).read_text()
    return template.format(**date_params)

# MariaDB and Hive query metrics


In [7]:
# Code to Suppress PySpark Warning messages
import warnings
warnings.filterwarnings('ignore') 

In [8]:
# Hide stderr output (e.g. PySpark warning messages) in the rendered notebook.
# The HTML below embeds a script that, when the page's DOMContentLoaded event fires,
# hides every output node whose MIME type is application/vnd.jupyter.stderr, and it
# renders a link that toggles those nodes between shown and hidden on each click.

from IPython.display import HTML
HTML('''<script>
var code_show_err = false; 
var code_toggle_err = function() {
 var stderrNodes = document.querySelectorAll('[data-mime-type="application/vnd.jupyter.stderr"]')
 var stderr = Array.from(stderrNodes)
 if (code_show_err){
     stderr.forEach(ele => ele.style.display = 'block');
 } else {
     stderr.forEach(ele => ele.style.display = 'none');
 }
 code_show_err = !code_show_err
} 
document.addEventListener('DOMContentLoaded', code_toggle_err);
</script>
To toggle on/off output_stderr, click <a onclick="javascript:code_toggle_err()">here</a>.''')

In [9]:
# Every metric is produced by one query file, queries/<name>.hql, run on the "hive" engine.
# Mobile-heavy pageviews/previews are deliberately absent: they are no longer
# tracked/reported on in the key product metrics presentations.
queries = {
    name: {"file": "queries/{}.hql".format(name), "engine": "hive"}
    for name in (
        "pageviews",
        "automated_pageviews",
        "page_previews",
        "unique_devices",
        "global_south_pageviews",
        "global_south_previews",
    )
}


# Map each supported engine name to the function that executes a query on it.
# Queries tagged "hive" are executed through Spark. The mariadb entry relies on
# `mariadb` being imported from wmfdata alongside `spark`; previously that name
# was unbound, so any "mariadb" query would have failed with a NameError.
ENGINE_RUNNERS = {
    "mariadb": mariadb.run,
    "hive": spark.run,
}

# Run every query and store its result dataframe back on the query's own entry
# under the "result" key, where the combining step picks it up.
for key, val in queries.items():
    query = prepare_query(val["file"])
    engine = val["engine"]
    print_err("Running {} on {}...".format(key, engine))

    if engine not in ENGINE_RUNNERS:
        raise ValueError("Unknown engine specified.")
    result = ENGINE_RUNNERS[engine](query)

    # Parse the month column into datetimes so results from different queries
    # can be merged and indexed on it consistently
    result = result.assign(month=lambda df: pd.to_datetime(df["month"]))
    val["result"] = result

Running pageviews on hive...
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
Running automated_pageviews on hive...                                          
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
Running page_previews on hive...                                                
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
Running unique_devices on hive...                                               
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
Running global_south_pageviews on hive...
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
Running global_south_previews on hive...
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

# Combining and saving metrics

In [10]:
# Assemble list of result dataframes
results = [val["result"] for val in queries.values()]

# Outer-merge them pairwise. No join key is passed, so pandas joins on whatever
# columns the frames share; this assumes that the month is the only common column.
new_metrics = reduce(lambda l, r: pd.merge(l, r, how="outer"), results)

# Set the month as an index so combine_first works properly
new_metrics = new_metrics.set_index("month").sort_index()

# Add metrics for interactions (previews + pageviews).
# Plain column addition replaces the row-by-row apply(axis=1): it gives the same
# sums without a Python-level loop, keeps the columns' own dtypes instead of
# coercing each row to a common one, and also works when the frame has no rows.
new_metrics["interactions"] = new_metrics["previews_seen"] + new_metrics["total_pageview"]
new_metrics["gs_interactions"] = new_metrics["gs_previews"] + new_metrics["gs_pageviews"]

# Mobile-heavy interactions (mh_previews + mh_pageviews) are not computed since mobile-heavy
# metrics are no longer tracked/reported on in the key product metrics presentations


In [11]:
# With no saved metrics the new results are used as they are. Otherwise they are
# combined with the saved ones via combine_first, called on new_metrics so that
# freshly computed values are preferred and old_metrics fills in the rest.
metrics = new_metrics if old_metrics is None else new_metrics.combine_first(old_metrics)

# Show the ten most recent months
pd_display_all(metrics.tail(10))

Unnamed: 0_level_0,automated_pageviews,desktop,gs_interactions,gs_pageviews,gs_previews,interactions,mh_interactions,mh_pageviews,mh_previews,mobileweb,previews_seen,total_pageview,unique_devices
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-09-01,911629600.0,6293941000.0,4342242000.0,3949492000.0,392749815.0,17823780000.0,821283216.0,788543740.0,32739476.0,9454794000.0,1812547000.0,16011240000.0,1690497000.0
2021-10-01,901158800.0,6513036000.0,4356292000.0,3973495000.0,382796754.0,18360030000.0,,,,9701242000.0,1872864000.0,16487170000.0,1837564000.0
2021-11-01,881050500.0,6420327000.0,4247671000.0,3868070000.0,379601677.0,17768510000.0,,,,9209694000.0,1880481000.0,15888030000.0,1730051000.0
2021-12-01,955616300.0,5997305000.0,4053082000.0,3699154000.0,353928149.0,17629130000.0,,,,9545437000.0,1802476000.0,15826650000.0,1728499000.0
2022-01-01,1003627000.0,6657032000.0,4237581000.0,3860724000.0,376857667.0,19401300000.0,,,,10371240000.0,2058830000.0,17342470000.0,1775062000.0
2022-02-01,979407000.0,6381499000.0,4110316000.0,3767028000.0,343288151.0,17874090000.0,,,,9383266000.0,1833566000.0,16040520000.0,1827453000.0
2022-03-01,1000023000.0,6820776000.0,4407957000.0,4051236000.0,356720322.0,18950020000.0,,,,9870969000.0,1967056000.0,16982960000.0,1957130000.0
2022-04-01,1257069000.0,6517583000.0,4023481000.0,3700706000.0,322775480.0,17930780000.0,,,,9357140000.0,1773847000.0,16156930000.0,1878851000.0
2022-05-01,1491689000.0,6701587000.0,4102891000.0,3771585000.0,331305888.0,18174940000.0,,,,9402734000.0,1788005000.0,16386940000.0,1948874000.0
2022-06-01,1522579000.0,6287650000.0,3923353000.0,3613206000.0,310147238.0,17104960000.0,,,,8918872000.0,1626138000.0,15478820000.0,2056714000.0


In [12]:
# Create the output directory if it is missing (e.g. on a first run, when no
# metrics file exists yet) so the computed results are not lost to a failed write
Path(FILENAME).parent.mkdir(parents=True, exist_ok=True)
metrics.to_csv(FILENAME, sep="\t")