## Setup

In [15]:
import datetime
from pathlib import Path
import time

import pandas as pd
import requests

import wmfdata as wmf
from wmfdata.utils import print_err, pd_display_all

import src.content as content

from src.utils import load_metric_file

In [16]:
# Every date parameter below is derived from two values:
# * metrics_month_text (e.g. "2023-08"): the month the metrics are generated for
# * mediawiki_history_snapshot (e.g. "2023-08"): the version of mediawiki_history
#   used for the editing metrics. Normally this should be the latest one available,
#   even when the metrics are generated for an earlier month.
#
# Both default to the last completed month; set them by hand if you need other values.
last_month = datetime.date.today().replace(day=1) - datetime.timedelta(days=1)

metrics_month_text = f"{last_month:%Y-%m}"
mediawiki_history_snapshot = metrics_month_text


# Express the two parameters in every format the queries and the API need, collected
# in one dict so they can be substituted into strings with str.format.
metrics_month = pd.Period(metrics_month_text)

# Intermediates shared by several entries below
_next_month = metrics_month + 1
_first_day = metrics_month.asfreq("D", how="start")
_next_month_first_day = _next_month.asfreq("D", how="start")

date_params = dict(
    api_metrics_month_first_day=_first_day.strftime("%Y%m%d"),
    api_metrics_month_day_after=_next_month_first_day.strftime("%Y%m%d"),
    mediawiki_history_snapshot=mediawiki_history_snapshot,
    metrics_cur_month=metrics_month.month,
    metrics_month=str(metrics_month),
    metrics_month_first_day=str(_first_day),
    metrics_month_end=str(_next_month.start_time),
    metrics_month_last_day=str(metrics_month.asfreq("D", how="end")),
    metrics_month_start=str(metrics_month.start_time),
    metrics_next_month_first_day=str(_next_month_first_day),
    metrics_prev_month=str(metrics_month - 1),
    metrics_year=metrics_month.year,
    retention_cohort=str(metrics_month - 2),
)

def prepare_query(filename):
    """
    Read the query template stored at `filename` and return its text with the
    `{placeholders}` filled in from `date_params` via str.format.
    """
    template = Path(filename).read_text()
    return template.format(**date_params)

In [17]:
class MetricSet:
    """
    A MetricSet is a group of monthly metrics that is saved to a single file and
    which is generated by one or more queries.
    
    Class assumptions:
    * Each query contains a "month" column which Pandas can parse into a date.
    * The column names used in the queries are unique across queries.
    
    A note on the date formats:
    The month column is saved to the file as the ISO-8601 date of the start of the
    month (e.g. 2020-06-01). However, in Python we handle them not as Datetimes but as
    Periods, since this makes it easy to write code (e.g. utils.calc_rpt) which works
    both with the normal monthly as well as quarterly aggregates.
    """
    
    def __init__(self, filename, queries):
        """
        filename: path of the file the metrics are loaded from and saved to.
        queries: dict mapping a query name to a dict whose "file" entry is the
            path of the query file.

        Any data already present in the file is loaded immediately.
        """
        self.filename = filename
        self.queries = queries
        self.load_data()

    def load_data(self):
        """
        Load the previously saved metrics from self.filename into self.data,
        starting from an empty data frame if the file does not exist yet.
        """
        try:
            self.data = load_metric_file(self.filename)
        except FileNotFoundError:
            self.data = pd.DataFrame()

    def add_data(self, new_data):
        """
        Takes a Pandas data frame with a date index giving the month and one or more
        columns of metrics, and merges it into self.data.

        Values already present in self.data are never overwritten; only missing
        cells, new months and new columns are filled in from new_data.
        """
        # Our policy is to avoid altering previously-generated data (which can happen
        # because some of our data sources regenerate history every month). If you do want
        # to regenerate some data, manually delete it from the data file.

        # Nothing has been loaded or added yet (e.g. the metrics file does not exist):
        # adopt the new data as-is so that its period index is kept intact for
        # save_data().
        if self.data.shape == (0, 0):
            self.data = new_data.copy()
            return

        # combine_first() reorders the columns, so restore the existing order and
        # put any columns that are new in this batch after them. (Selecting only the
        # pre-existing columns would silently drop every newly added metric.)
        existing_columns = self.data.columns.tolist()
        added_columns = [
            column for column in new_data.columns
            if column not in existing_columns
        ]

        self.data = self.data.combine_first(new_data)[existing_columns + added_columns]
        
    def run_queries(self, cleanup_function=None):
        """
        Run every query in self.queries and merge each result into self.data.

        cleanup_function: optional callable applied to each query result (already
            indexed by monthly period) before it is merged; it must return the
            data frame to add.
        """
        for key, val in self.queries.items():
            query = prepare_query(val["file"])
            print_err(f"Running {key} ")

            result = wmf.spark.run(query)

            result = (
                result
                .assign(month=lambda df: pd.to_datetime(df["month"]))
                .set_index("month")
                # This dataframe will usually have only a single row, so
                # Pandas will not be able to infer the frequency of the period
                .to_period("M")
            )
            
            if cleanup_function:
                result = cleanup_function(result)
            self.add_data(result)

    def save_data(self):
        """
        Write self.data to self.filename as tab-separated values, with the period
        index converted back to timestamps.
        """
        self.data.to_timestamp().to_csv(self.filename, sep="\t")

## Editing metrics

In [18]:
# Query name (shown in the progress messages) -> the query file that produces it
editing_queries = {
    name: {"file": path}
    for name, path in [
        ("active_editors", "queries/active_editors.sql"),
        ("edits", "queries/edits.sql"),
        ("new_editor_retention", "queries/new_editor_retention.sql"),
        ("mobile-heavy_edits_editors", "queries/mobile-heavy_edits_editors.sql"),
        ("mobile-heavy_new_editor_retention", "queries/mobile-heavy_new_editor_retention.sql"),
    ]
}

editing_metrics = MetricSet(
    filename="metrics/editing_metrics.tsv",
    queries=editing_queries,
)
editing_metrics.run_queries()

Running active_editors 





Running edits                                                                   





Running new_editor_retention                                                    





Running mobile-heavy_edits_editors                                              





Running mobile-heavy_new_editor_retention                                       





                                                                                

### Content metrics via API

In [19]:
# URL template for the "new pages" endpoint of the Wikimedia metrics REST API.
# The {placeholders} are filled in with str.format.
NEW_PAGES_API = "/".join([
    "https://wikimedia.org/api/rest_v1/metrics",
    "edited-pages/new",
    "{project}",
    "all-editor-types",
    "{page_type}",
    "monthly",
    "{start}",
    "{end}",
])

# Sent with every API request so the traffic can be attributed to this project
headers = {}
headers["User-Agent"] = "https://github.com/wikimedia-research/movement-metrics (bot)"

api_results = list()

def get_new_pages(
    project="all-projects",
    page_type="content",
    start=date_params["api_metrics_month_first_day"],
    end=date_params["api_metrics_month_day_after"]
):
    """
    Fetch the monthly count of new pages for `project` from the NEW_PAGES_API endpoint.

    Parameters:
    * project: project identifier substituted into the URL (e.g. "all-projects"
      or a domain such as "wikidata.org").
    * page_type: page type substituted into the URL.
    * start, end: date strings substituted into the URL. The defaults are taken
      from `date_params` when this function is defined, so re-run this cell after
      changing the date parameters.

    Returns a data frame of the API results indexed by monthly period ("month"),
    or None if the API responds with 404.
    NOTE(review): a 404 presumably means the API has no data for this project and
    date range; confirm against the API documentation.

    Raises requests.HTTPError for any other error status, and
    requests.Timeout if the server does not respond in time.
    """
    url = NEW_PAGES_API.format(
        project=project,
        page_type=page_type,
        start=start,
        end=end
    )

    # Without a timeout, a stalled connection would hang the notebook indefinitely
    r = requests.get(url, headers=headers, timeout=60)

    if r.status_code == 404:
        return None

    # Fail with the real HTTP error (rate limiting, server errors, ...) rather than
    # with a confusing KeyError or JSON error when parsing the error body below
    r.raise_for_status()

    data = r.json()["items"][0]["results"]
    frame = pd.DataFrame(data)
    # Drop the timezone so the timestamps can be converted to periods
    frame["timestamp"] = pd.to_datetime(frame["timestamp"]).dt.tz_localize(None)
    frame = (
        frame
        .rename(columns={"timestamp": "month"})
        .set_index("month")
        .to_period("M")
    )

    return frame

In [20]:
def _add_new_pages_metric(project, metric_name):
    """
    Fetch the new-page counts for `project`, rename the "new_pages" column to
    `metric_name` and add the result to editing_metrics.

    Returns the renamed data frame, or None (after printing a warning) when
    get_new_pages() returned no data for the project.
    """
    frame = get_new_pages(project=project)

    # get_new_pages() returns None on a 404 response; skip the metric with a clear
    # message instead of failing with an AttributeError on None
    if frame is None:
        print_err(f"No new pages data returned for {project}; skipping {metric_name}")
        return None

    frame = frame.rename(columns={"new_pages": metric_name})
    editing_metrics.add_data(frame)
    return frame


total_new = _add_new_pages_metric("all-projects", "net_new_content_pages")

wikidata_new = _add_new_pages_metric("wikidata.org", "net_new_Wikidata_entities")

commons_new = _add_new_pages_metric("commons.wikimedia.org", "net_new_Commons_content_pages")

In [21]:
# Domain names of all wikis in the "wikipedia" database group
wp_domains = wmf.spark.run("""
    SELECT domain_name
    FROM canonical_data.wikis
    WHERE database_group = "wikipedia"
""")["domain_name"]

results = []
n = len(wp_domains)

# Fetch the new-page counts one project at a time
for i, domain in enumerate(wp_domains):
    p = i + 1
    if p % 50 == 0:
        print(f"Now on project {p} of {n} ({domain})")

    frame = get_new_pages(project=domain)
    
    # get_new_pages() returns None when the API responds with 404; skip those projects
    if frame is not None:
        frame["project"] = domain
        results.append(frame)
    
    # Be polite to the API
    time.sleep(0.02)

if results:
    new_per_wp = pd.concat(results)

    # Sum across projects to get new Wikipedia articles per month
    wikipedia_new = (
        new_per_wp
        .groupby("month")
        .agg({"new_pages": "sum"})
        .rename(columns={"new_pages": "net_new_Wikipedia_articles"})
    )

    # This is the only place the aggregate is added: wikipedia_new does not exist
    # when no project returned data, so it must not be referenced outside this branch.
    editing_metrics.add_data(wikipedia_new)
else:
    print("No data to aggregate.")

Now on project 50 of 336 (ceb.wikipedia.org)
Now on project 100 of 336 (gd.wikipedia.org)
Now on project 150 of 336 (ki.wikipedia.org)
Now on project 200 of 336 (mt.wikipedia.org)
Now on project 250 of 336 (roa-rup.wikipedia.org)
Now on project 300 of 336 (to.wikipedia.org)


In [22]:
# Write the editing metrics (query results plus the API-derived new page counts
# added above) back to the metrics file.
editing_metrics.save_data()

## Readers metrics

In [23]:
# Query name (shown in the progress messages) -> the query file that produces it
readers_queries = {
    name: {"file": path}
    for name, path in [
        ("pageviews", "queries/pageviews.sql"),
        ("automated_pageviews", "queries/automated_pageviews.sql"),
        ("page_previews", "queries/page_previews.sql"),
        ("unique_devices", "queries/unique_devices.sql"),
    ]
}

readers_metrics = MetricSet(
    filename="metrics/readers_metrics.tsv",
    queries=readers_queries,
)
readers_metrics.run_queries()

# Interactions are the sum of the previews seen and the total pageviews
readers_metrics.data = readers_metrics.data.assign(
    interactions=(
        readers_metrics.data["previews_seen"]
        + readers_metrics.data["total_pageview"]
    )
)
readers_metrics.save_data()

Running pageviews 





Running automated_pageviews                                                     





Running page_previews                                                           





Running unique_devices                                                          





In [24]:
# A single query feeds this metric set
regional_unique_devices_queries = dict(
    regional_unique_devices={"file": "queries/regional_unique_devices.sql"},
)

regional_unique_devices = MetricSet(
    filename="metrics/regional_unique_devices.tsv",
    queries=regional_unique_devices_queries,
)
regional_unique_devices.run_queries()
regional_unique_devices.save_data()

Running regional_unique_devices 





                                                                                

## Editing diversity metrics

In [25]:
# Query name (shown in the progress messages) -> the query file that produces it.
# Note that the *_wikidata_entities queries live in *_net_new_wikidata.sql files.
editing_diversity_queries = {
    name: {"file": path}
    for name, path in [
        ("global_south_edits_editors", "queries/global_south_edits_editors.sql"),
        ("global_south_new_editor_retention", "queries/global_south_new_editor_retention.sql"),
        ("global_south_net_new_content", "queries/global_south_net_new_content.sql"),
        ("global_south_wikidata_entities", "queries/global_south_net_new_wikidata.sql"),
        ("global_north_edits_editors", "queries/global_north_edits_editors.sql"),
        ("global_north_new_editor_retention", "queries/global_north_new_editor_retention.sql"),
        ("global_north_net_new_content", "queries/global_north_net_new_content.sql"),
        ("global_north_wikidata_entities", "queries/global_north_net_new_wikidata.sql"),
    ]
}

editing_diversity_metrics = MetricSet(
    filename="metrics/editing_diversity_metrics.tsv",
    queries=editing_diversity_queries,
)
editing_diversity_metrics.run_queries()
editing_diversity_metrics.save_data()

Running global_south_edits_editors 





Running global_south_new_editor_retention                                       





Running global_south_net_new_content 





Running global_south_wikidata_entities                                          





Running global_north_edits_editors                                              





Running global_north_new_editor_retention                                       





Running global_north_net_new_content 





Running global_north_wikidata_entities                                          





                                                                                

In [26]:
# Query name (shown in the progress messages) -> the query file that produces it
readers_diversity_queries = {
    name: {"file": path}
    for name, path in [
        ("global_south_pageviews", "queries/global_south_pageviews.sql"),
        ("global_south_previews", "queries/global_south_previews.sql"),
        ("global_north_previews", "queries/global_north_previews.sql"),
        ("global_north_pageviews", "queries/global_north_pageviews.sql"),
    ]
}

readers_diversity_metrics = MetricSet(
    filename="metrics/readers_diversity_metrics.tsv",
    queries=readers_diversity_queries,
)
readers_diversity_metrics.run_queries()

# Per-region interactions are the sum of that region's previews and pageviews
readers_diversity_metrics.data = readers_diversity_metrics.data.assign(
    gs_interactions=(
        readers_diversity_metrics.data["gs_previews"]
        + readers_diversity_metrics.data["gs_pageviews"]
    ),
    gn_interactions=(
        readers_diversity_metrics.data["gn_previews"]
        + readers_diversity_metrics.data["gn_pageviews"]
    ),
)

readers_diversity_metrics.save_data()

Running global_south_pageviews 





Running global_south_previews                                                   





Running global_north_previews                                                   





Running global_north_pageviews                                                  





                                                                                

## Content gap metrics

In [27]:
# Pick up any edits made to src/content.py since it was first imported
import importlib
importlib.reload(content)

# Background on the content gap metrics:
# * The data is fetched from the CSV dumps at the links listed in data_dict
#   (not defined in this notebook; presumably in src.content - TODO confirm).
# * The dumps are updated around the 23rd of every month. Since historical data must
#   remain unchanged, newly fetched data is appended to the TSVs that hold the
#   historical snapshot.
# * Every new snapshot contains only partial data for the month in which it is
#   published, so the clean-up functions drop the last month in the dataset.

content_gap_queries = dict(
    content_gap={"file": "queries/content_gap.sql"},
)

content_gap_metrics = MetricSet(
    filename="metrics/content_gap_data_metrics.tsv",
    queries=content_gap_queries,
)
content_gap_metrics.run_queries(cleanup_function=content.process_quality_data)

content_gap_metrics.data = content.calculate_mom(content_gap_metrics.data)
content_gap_metrics.save_data()

Running content_gap 





                                                                                