# Imports

In [1]:
import datetime
import time
from functools import reduce
from pathlib import Path

import pandas as pd
import requests
import wmfdata
from wmfdata import hive, mariadb, spark
from wmfdata.utils import print_err, pd_display_all
from wmfdata.utils import check_kerberos_auth, ensure_list

# Parameters

In [2]:
# Path of the TSV file that previous metrics are read from and updated metrics are written to
FILENAME = "metrics/metrics.tsv"

# By default the notebook reports on the previous calendar month: stepping back one
# day from the first day of the current month always lands in the month before.
last_month = datetime.date.today().replace(day=1) - datetime.timedelta(days=1)

# Metric month, and the mediawiki_history snapshot to query. The snapshot must be
# from the metrics month or later. To re-run a specific month, replace the value
# with a literal such as '2019-07'.
METRICS_MONTH_TEXT = MEDIAWIKI_HISTORY_SNAPSHOT = last_month.strftime("%Y-%m")

# Preparation

In [3]:

# Express the metrics month in every format the queries and the API need, and
# collect them in a dict that can be splatted into str.format() calls.
metrics_month = pd.Period(METRICS_MONTH_TEXT)

# Private helpers: day-resolution boundaries of the month, and the following month
_first_day = metrics_month.asfreq("D", how="start")
_last_day = metrics_month.asfreq("D", how="end")
_next_month = metrics_month + 1

date_params = dict(
    mediawiki_history_snapshot=MEDIAWIKI_HISTORY_SNAPSHOT,
    metrics_month=str(metrics_month),
    metrics_month_start=str(metrics_month.start_time),
    metrics_month_first_day=str(_first_day),
    # The "end" is exclusive: it is the first instant of the following month
    metrics_month_end=str(_next_month.start_time),
    metrics_month_last_day=str(_last_day),
    api_metrics_month_first_day=_first_day.strftime("%Y%m%d"),
    api_metrics_month_day_after=_next_month.asfreq("D", how="start").strftime("%Y%m%d"),
    metrics_prev_month=str(metrics_month - 1),
    # The retention cohort is the month two months before the metrics month
    retention_cohort=str(metrics_month - 2),
)

# Read previously saved metrics, indexed by month; None when no file exists yet
try:
    old_metrics = pd.read_csv(FILENAME, sep="\t", parse_dates=["month"])
except FileNotFoundError:
    old_metrics = None
else:
    old_metrics = old_metrics.set_index("month")
    
def prepare_query(filename):
    """Read a query file and fill in its date placeholders.

    The file's text is treated as a ``str.format`` template and formatted
    with the entries of the module-level ``date_params`` dict.

    Parameters
    ----------
    filename : str or path-like
        Path of the query file to read.

    Returns
    -------
    str
        The query text with every ``{placeholder}`` substituted.
    """
    template = Path(filename).read_text()
    return template.format(**date_params)

In [4]:
# Display the computed date parameters so they can be checked before any query runs
date_params

{'mediawiki_history_snapshot': '2023-03',
 'metrics_month': '2023-03',
 'metrics_month_start': '2023-03-01 00:00:00',
 'metrics_month_first_day': '2023-03-01',
 'metrics_month_end': '2023-04-01 00:00:00',
 'metrics_month_last_day': '2023-03-31',
 'api_metrics_month_first_day': '20230301',
 'api_metrics_month_day_after': '20230401',
 'metrics_prev_month': '2023-02',
 'retention_cohort': '2023-01'}

# MariaDB and Hive query metrics

In [5]:
# Query registry: each entry names the file holding the query text and the
# engine it should be run on. Results are attached to these entries later.
queries = {
    "active_editors": {"file": "queries/active_editors.sql", "engine": "hive"},
    "edits": {"file": "queries/edits.hql", "engine": "hive"},
    "new_editor_retention": {"file": "queries/new_editor_retention.hql", "engine": "hive"},
    "global_south_edits_editors": {
        "file": "queries/global_south_edits_editors.hql",
        "engine": "hive",
    },
    "mobile-heavy_edits_editors": {
        "file": "queries/mobile-heavy_edits_editors.hql",
        "engine": "hive",
    },
    "mobile-heavy_new_editor_retention": {
        "file": "queries/mobile-heavy_new_editor_retention.hql",
        "engine": "hive",
    },
    "global_south_new_editor_retention": {
        "file": "queries/global_south_new_editor_retention.hql",
        "engine": "hive",
    },
}


# Run every registered query on its engine and store the resulting data frame
# under the "result" key of its registry entry.
for key, val in queries.items():
    query = prepare_query(val["file"])
    engine = val["engine"]
    print_err("Running {} on {}...".format(key, engine))

    if engine == "mariadb":
        # `mariadb` comes from `from wmfdata import ... mariadb ...` at the top of
        # the notebook; without that import this branch raised a NameError.
        result = mariadb.run(query)
    elif engine == "hive":
        # Queries marked "hive" are executed through wmfdata's Spark runner
        result = spark.run(query)
    else:
        raise ValueError("Unknown engine specified.")

    # Parse the month column so every result merges on a datetime "month"
    result = result.assign(month=lambda df: pd.to_datetime(df["month"]))
    val["result"] = result

Running active_editors on hive...



SPARK_HOME: /usr/lib/spark3
Using Hadoop client lib jars at 3.2.0, provided by Spark.
PYSPARK_PYTHON=/opt/conda-analytics/bin/python3


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/17 21:22:46 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/04/17 21:22:58 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
23/04/17 21:23:11 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
Running edits on hive...                                                        





23/04/17 21:23:49 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
Running new_editor_retention on hive...                                         





Running global_south_edits_editors on hive...                                   





Running mobile-heavy_edits_editors on hive...                                   





Running mobile-heavy_new_editor_retention on hive...                            





Running global_south_new_editor_retention on hive...                            





                                                                                

# Content metrics via API

In [34]:
# URL template for the "new pages" endpoint of the public Wikimedia metrics REST API.
# An internal host (http://aqs1004.eqiad.wmnet:7232/analytics.wikimedia.org/v1/) was
# noted here as a replacement because of https://phabricator.wikimedia.org/P8605, but
# the public endpoint is the one in use.
NEW_PAGES_API = (
    "https://wikimedia.org/api/rest_v1/metrics/edited-pages/new/"
    "{project}/all-editor-types/{page_type}/monthly/{start}/{end}"
)

# Sent with every API request so the requests identify this project
headers = {"User-Agent": "https://github.com/wikimedia-research/Editing-movement-metrics (bot)"}

# Collects one data frame per API-derived metric
api_results = list()

def get_new_pages(
    project="all-projects",
    page_type="content",
    start= date_params["api_metrics_month_first_day"],
    end= date_params["api_metrics_month_day_after"],
    timeout=60
):
    """Fetch monthly new-page counts for a project from the metrics API.

    Parameters
    ----------
    project : str
        Project to query, e.g. "all-projects" or a domain such as "wikidata.org".
    page_type : str
        Page type segment of the API URL.
    start, end : str
        Date range in YYYYMMDD form. The defaults are read from ``date_params``
        once, when this function is defined.
    timeout : float
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The records under ``items[0]["results"]`` of the response, with the
        "timestamp" column parsed, made timezone-naive and renamed to "month".

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    url = NEW_PAGES_API.format(
        project=project,
        page_type=page_type,
        start=start,
        end=end
    )

    r = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP errors directly instead of failing later with an opaque
    # KeyError when the error payload has no "items" key.
    r.raise_for_status()

    data = r.json()["items"][0]["results"]
    frame = pd.DataFrame(data)
    # Drop the timezone so the month column merges cleanly with the query results
    frame["timestamp"] = pd.to_datetime(frame["timestamp"]).dt.tz_localize(None)
    frame = frame.rename(columns={"timestamp": "month"})
    return frame

In [35]:
# Request parameters matching get_new_pages' defaults, for building a URL by hand.
# Note: no trailing comma after the project value; with one, `project` becomes
# a 1-tuple and the formatted URL contains "('all-projects',)".
project = "all-projects"
page_type = "content"
start = date_params["api_metrics_month_first_day"]
end = date_params["api_metrics_month_day_after"]

In [36]:
 # Build the request URL from the project, page_type, start and end variables
 url = NEW_PAGES_API.format(project=project, page_type=page_type, start=start, end=end)

## Total

In [39]:
# New content pages across all projects (get_new_pages' defaults)
total_new = get_new_pages().rename(
    columns={"new_pages": "net_new_content_pages"}
)
api_results.append(total_new)

response: <Response [200]>
data: [{'timestamp': '2023-03-01T00:00:00.000Z', 'new_pages': 1736767}]
frame:                   timestamp  new_pages
0  2023-03-01T00:00:00.000Z    1736767
frame cleaned up:        month  new_pages
0 2023-03-01    1736767
total_new:        month  net_new_content_pages
0 2023-03-01                1736767


## Wikidata

In [48]:
# Same request, restricted to wikidata.org
new_wd = get_new_pages(project="wikidata.org").rename(
    columns={"new_pages": "net_new_Wikidata_entities"}
)
api_results.append(new_wd)

response: <Response [200]>
data: [{'timestamp': '2023-03-01T00:00:00.000Z', 'new_pages': 416195}]
frame:                   timestamp  new_pages
0  2023-03-01T00:00:00.000Z     416195
frame cleaned up:        month  new_pages
0 2023-03-01     416195


## Commons

In [49]:
# Same request, restricted to commons.wikimedia.org
new_commons = get_new_pages(project="commons.wikimedia.org").rename(
    columns={"new_pages": "net_new_Commons_content_pages"}
)
api_results.append(new_commons)

response: <Response [200]>
data: [{'timestamp': '2023-03-01T00:00:00.000Z', 'new_pages': 802669}]
frame:                   timestamp  new_pages
0  2023-03-01T00:00:00.000Z     802669
frame cleaned up:        month  new_pages
0 2023-03-01     802669


## Wikipedias

In [50]:
# Fetch the domain name of every wiki whose database group is "wikipedia".
# The result is a data frame with a single domain_name column.
wp_domains_query_results = spark.run("""
select domain_name
from canonical_data.wikis
where database_group = "wikipedia"
""")

                                                                                

In [51]:
# One tuple per row of the query result; element 0 of each is the domain name
wp_domains = list(map(tuple, wp_domains_query_results.values))

# Accumulator for the per-project API records, and the project count used in
# progress messages
results, n = [], len(wp_domains)

In [None]:
# Start from an empty list on every run: the accumulator used to be created only
# in an earlier cell, so re-running this cell appended every project's records a
# second time and doubled the summed article counts.
results = []

# Query the API once per Wikipedia and collect the records
for idx, val in enumerate(wp_domains):
    domain = val[0]

    # Progress message every tenth project
    if idx % 10 == 0:
        msg = "Now on the {}th project of {} ({})"
        print_err(msg.format(idx, n, domain))

    frame = get_new_pages(project=domain).reset_index()
    frame["project"] = domain
    records = frame.to_dict("records")
    results.extend(records)

    # Sleep 20 milliseconds between requests
    time.sleep(0.02)

# Turn the big list of records into a data frame
new_per_wp = pd.DataFrame(results)

# Sum across projects to get new Wikipedia articles per month
new_wp = new_per_wp.groupby("month").agg(
    {"new_pages": "sum"}
).rename(columns={"new_pages": "net_new_Wikipedia_articles"}).reset_index()

api_results.append(new_wp)

In [53]:
# Make every API frame's month column timezone-naive (in place) so that it
# merges cleanly with the month columns of the query results
for api_frame in api_results:
    naive_month = api_frame["month"].dt.tz_localize(None)
    api_frame["month"] = naive_month

# Combining and saving metrics

In [54]:
# Rename the data_edits column of the edits result to wikidata_edits
queries["edits"]["result"] = queries["edits"]["result"].rename(
    columns={"data_edits": "wikidata_edits"}
)

In [55]:
# Gather every result frame: the query results first, then the API results
results = [entry["result"] for entry in queries.values()]
results += api_results

In [56]:
# Outer-merge all frames pairwise on their shared columns; this relies on
# "month" being the only column the frames have in common
new_metrics = reduce(
    lambda merged, frame: merged.merge(frame, how="outer"),
    results,
)

# combine_first aligns on the index, so index by month (sorted chronologically)
new_metrics = new_metrics.set_index("month").sort_index()

# Freshly computed values take precedence; previously saved metrics fill in
# whatever the new frame lacks
metrics = new_metrics if old_metrics is None else new_metrics.combine_first(old_metrics)

pd_display_all(metrics.tail())

Unnamed: 0_level_0,active_editors,anonymous_edits,global_south_active_editors,global_south_edits,global_south_new_editor_retention,global_south_nonbot_edits,mobile-heavy_wiki_active_editors,mobile-heavy_wiki_edits,mobile-heavy_wiki_new_editor_retention,mobile-heavy_wiki_nonbot_edits,mobile_edits,net_new_Commons_content_pages,net_new_Wikidata_entities,net_new_Wikipedia_articles,net_new_content_pages,new_active_editors,new_editor_retention,non_anonymous_edits,other_nonbot_edits,returning_active_editors,revert_rate,total_edits,uploads,wikidata_edits
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2022-11-01,88458.0,2037337.0,18960.0,3931215.0,,2432946.0,4881.0,936197.0,0.050783,516474.0,1986573.0,1004268.0,460797.0,205291.0,1925293.0,16189.0,0.079863,44458168.0,10186389.0,72269.0,0.049983,46495714.0,1085844.0,19515262.0
2022-12-01,88831.0,2015566.0,20063.0,4444113.0,,2491803.0,5236.0,1281700.0,0.041671,539969.0,2028754.0,785030.0,518401.0,229919.0,1733876.0,15882.0,0.070243,43788318.0,10601038.0,72949.0,0.044989,45804026.0,839212.0,17493716.0
2023-01-01,93997.0,2136383.0,20808.0,4040347.0,,2754632.0,5531.0,1473586.0,0.054593,563487.0,2183828.0,763067.0,668528.0,198062.0,1997150.0,18061.0,0.061386,49276599.0,11886151.0,75936.0,0.040043,51413312.0,821499.0,22817468.0
2023-02-01,87926.0,1848805.0,19642.0,3942811.0,,2533307.0,5091.0,1149044.0,0.059662,496434.0,1873950.0,601988.0,385735.0,218166.0,1444956.0,16498.0,0.060853,42097626.0,10394629.0,71428.0,0.052052,43946614.0,660012.0,19303690.0
2023-03-01,93506.0,2030811.0,20763.0,4015257.0,,2752975.0,5453.0,1585639.0,0.053838,568648.0,2077931.0,802669.0,416195.0,217410.0,1736767.0,18364.0,0.081447,47736130.0,11022123.0,75142.0,0.067389,49767168.0,856120.0,22102794.0


In [57]:
# Create the output directory if needed: on a first run the metrics file does not
# exist yet, and to_csv fails when its parent directory is missing as well.
Path(FILENAME).parent.mkdir(parents=True, exist_ok=True)

# Save the combined metrics as TSV, with the month index as the first column
metrics.to_csv(FILENAME, sep="\t")