In [2]:
import wmfdata as wmf
import pandas as pd

# Load existing list of grouped projects

This file is the canonical source for which project keys belong to which groups (any keys not found in the file belong to the "other" group). That information needs to be preserved. On the other hand, the human-readable project names are just for the convenience of anyone reading the file, so they don't need to be preserved and will just get in the way.

In [3]:
grouped_projects = (
  pd.read_table("definitions/project-group-assignments.tsv")
  .set_index("project_key")
  .drop("project_name", axis=1)
)

grouped_projects.head()

Unnamed: 0_level_0,project_group
project_key,Unnamed: 1_level_1
abwiki,meaf_wps
acewiki,asia_wps
adywiki,cee_wps
afwiki,meaf_wps
akwiki,meaf_wps


# Update list

Let's pull an up-to-date list of all open content projects so we can update our existing list with new projects, closed projects, and updated names.

In [20]:
# It's likely necessary to update the `canonical_data.wikis` table first.
# See github.com/wikimedia-research/canonical-data/ for more info.

all_projects = wmf.spark.run("""
  SELECT
    database_code as project_key,
    english_name as project_name,
    language_name as language
  FROM
    canonical_data.wikis
  WHERE
    database_group in (
      "commons", "incubator", "foundation", "mediawiki", "meta", "sources",
      "species","wikibooks", "wikidata", "wikinews", "wikipedia", "wikiquote",
      "wikisource", "wikiversity", "wikivoyage", "wiktionary"
    ) AND
    status = "open" AND
    visibility = "public" AND
    editability = "public"
""").set_index("project_key")

print(all_projects.shape)
all_projects.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


(763, 2)


Unnamed: 0_level_0,project_name,language
project_key,Unnamed: 1_level_1,Unnamed: 2_level_1
abwiki,Abkhazian Wikipedia,Abkhazian
acewiki,Achinese Wikipedia,Achinese
adywiki,Adyghe Wikipedia,Adyghe
afwiki,Afrikaans Wikipedia,Afrikaans
afwikibooks,Afrikaans Wikibooks,Afrikaans


In [21]:
projects = pd.merge(
  all_projects, 
  grouped_projects,
  on="project_key",
  # This ensures that keys present only in the old list, which 
  # represent closed or deleted projects, are dropped.
  how="left"
)

projects.head()

Unnamed: 0_level_0,project_name,language,project_group
project_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abwiki,Abkhazian Wikipedia,Abkhazian,meaf_wps
acewiki,Achinese Wikipedia,Achinese,asia_wps
adywiki,Adyghe Wikipedia,Adyghe,cee_wps
afwiki,Afrikaans Wikipedia,Afrikaans,meaf_wps
afwikibooks,Afrikaans Wikibooks,Afrikaans,


In [22]:
# check on number of null values i.e. projects with no assigned group
projects.isna().sum()

project_name       0
language           0
project_group    459
dtype: int64

Even though the projects (wikis) are open and public, there are might some wikis where there is no activity during the last three months. We will filter wikis that do not have at least one active editor (5 content edits in 2 of 3 last months: 10 edits) during the last three months.

In [10]:
active_projects_source = wmf.spark.run("""
WITH user_edits_by_wiki AS (
SELECT 
    wiki_db,
    event_user_text,
    COUNT(*)
FROM wmf.mediawiki_history
WHERE 
    snapshot = "2022-03" AND
    YEAR(event_timestamp) = 2022 AND
    MONTH(event_timestamp) IN (1, 2, 3) AND
    NOT event_user_is_anonymous AND
    SIZE(event_user_is_bot_by) = 0 AND
    page_namespace_is_content_historical AND
    event_entity = "revision" AND
    event_type = "create"
GROUP BY wiki_db, event_user_text
HAVING COUNT(*) > 10)

SELECT wiki_db, COUNT(DISTINCT event_user_text) AS active_editors_count
FROM user_edits_by_wiki
GROUP BY wiki_db
ORDER BY active_editors_count DESC
"""
)

print(active_projects_source.shape)
active_projects_source.head()

(672, 2)


Unnamed: 0,wiki_db,active_editors_count
0,enwiki,38475
1,commonswiki,16835
2,wikidatawiki,16260
3,jawiki,7083
4,dewiki,6340


In [94]:
# projects that are open, live and public from canonical_data.wikis
# and have at least one active editor during the last three months
active_projects = (pd.merge(active_projects_source.wiki_db,
                            projects.reset_index(), how='inner', 
                            left_on='wiki_db', right_on='project_key')
                   .drop(['wiki_db'], axis=1))

print(active_projects.shape)
active_projects.head()

(646, 4)


Unnamed: 0,project_key,project_name,language,project_group
0,enwiki,English Wikipedia,English,enwiki
1,commonswiki,Wikimedia Commons,English,commons
2,wikidatawiki,Wikidata,English,wikidata
3,jawiki,Japanese Wikipedia,Japanese,jawiki
4,dewiki,German Wikipedia,German,dewiki


In [95]:
#reviewing projects that have active editor(s) but not present in canonical_data.wikis
active_projects_source[active_projects_source['wiki_db']
                       .apply(lambda x:True if x not in projects
                              .reset_index().project_key.values else False)]

Unnamed: 0,wiki_db,active_editors_count
123,testwiki,29
126,ruwikimedia,28
160,testwikidatawiki,19
176,outreachwiki,16
192,pawikisource,13
216,plwikimedia,11
241,sewikimedia,10
247,idwikimedia,9
249,betawikiversity,9
294,brwikimedia,7


While most of the above wikis belong to Wikimedia affiliates or test wikis, a couple of wikis are open, public, and editable wikis - they are pawikisource and dtywiki. The issue is documented at https://github.com/wikimedia-research/canonical-data/issues/1. But for now, we will be adding these two wikis manually to our active projects frame.

In [96]:
#adding missing projects to the active projects frame
missing_projects_df = pd.DataFrame([
    ['pawikisource', 'Punjabi Wikisource', 'Punjabi', float('nan')],
    ['dtywiki', 'Doteli Wikipedia', 'Doteli', float('nan')]
], columns=active_projects.columns)

active_projects = active_projects.append(missing_projects_df, ignore_index=True)
print(active_projects.shape)
active_projects.tail()

(648, 4)


Unnamed: 0,project_key,project_name,language,project_group
643,elwikibooks,Greek Wikibooks,Greek,
644,klwiktionary,Kalaallisut Wiktionary,Kalaallisut,
645,cawikiquote,Catalan Wikiquote,Catalan,
646,pawikisource,Punjabi Wikisource,Punjabi,
647,dtywiki,Doteli Wikipedia,Doteli,


### Reclassifying sister projects
In previous iterations of Commmunity Insights sampling (prior to 2022), 14 large Wikipedias, Wikimedia Commons, and Wikidata, have their own groups, and rest of the Wikipedias are classified into various regional groups. In addition, all the sister projects (Wikisource, Wikivoyage etc.) are being classified to "Other" category. This is to be changed to reclassify sister projects from "Other" into respective regional groups as their language Wikipedias are. For example, if Hindi Wikisource is currently in "Other", it would be categorized into South Asia group, as Hindi Wikipedia belongs to South Asia.

In [97]:
# we want to preserve the groups of 16 large wikis 
# as these single-member groups are specific to respective Wikipedias, 
# no sister projects will be added to those group
large_wikis = ['English', 'commons', 'wikidata', 'Japanese',
               'German', 'French', 'metawiki', 'Spanish', 
               'Russian', 'Chinese', 'Italian', 'Portuguese',
               'Dutch', 'Arabic', 'Korean', 'Vietnamese']

languages_map = dict(active_projects[active_projects['project_group']
                                     .notna()][['language', 'project_group']].values)

for i in active_projects[active_projects.project_group.isna()].index:
    if active_projects.loc[i,:].language not in large_wikis:
        try:
            active_projects.loc[i, 'project_group'] = languages_map[active_projects.loc[i,:].language]
        except:
            pass

In [98]:
# review pending null values
active_projects.isna().sum()

project_key        0
project_name       0
language           0
project_group    105
dtype: int64

In [99]:
# languages haven't yet been classified to any group
active_projects[active_projects['project_group'].isna()].language.unique()

array(['English', 'Chinese', 'French', 'Russian', 'German', 'Italian',
       'Korean', 'Japanese', 'Spanish', 'Dutch', 'Dagbani', 'Portuguese',
       'Vietnamese', 'Taroko', 'Southern Altai', 'Inari Sami',
       'Tachelhit', 'Amis', 'Saraiki', 'Arabic', 'Manipuri', 'Nias',
       'Paiwan', 'Atayal', 'Madurese', 'Doteli'], dtype=object)

In [100]:
# assign groups to remaining language projects based on information provided by GDI team
misc_languages_map = {'English': 'other', 'Chinese': 'asia_wps', 'French': 'other', 'Russian': 'cee_wps',
                      'German': 'weur_wps', 'Italian': 'weur_wps', 'Korean': 'asia_wps', 'Japanese': 'asia_wps', 
                      'Spanish': 'other', 'Dutch': 'weur_wps', 'Dagbani': 'meaf_wps', 'Portuguese': 'other', 
                      'Vietnamese': 'asia_wps', 'Taroko': 'asia_wps', 'Southern Altai': 'cee_wps', 
                      'Inari Sami': 'weur_wps', 'Tachelhit': 'meaf_wps', 'Amis': 'asia_wps', 
                      'Saraiki': 'sasia_wps', 'Arabic': 'meaf_wps', 'Manipuri': 'sasia_wps',
                      'Nias': 'malay_wps', 'Paiwan': 'asia_wps', 'Atayal': 'asia_wps',
                      'Madurese': 'malay_wps', 'Doteli': 'sasia_wps'}

for i in active_projects[active_projects.project_group.isna()].index:
    try: active_projects.loc[i, 'project_group'] = misc_languages_map[active_projects.loc[i,:].language]
    except Exception as e: print(e)

In [101]:
# review pending null values
active_projects.isna().sum()

project_key      0
project_name     0
language         0
project_group    0
dtype: int64

In [103]:
# Wikimedia Foundation's internal wiki should be removed
foundationwiki_index = active_projects[active_projects.project_key == 'foundationwiki'].index
active_projects = active_projects.drop(index=foundationwiki_index)
active_projects.set_index('project_key', inplace=True)

In [104]:
print(active_projects.shape)
active_projects.head()

(647, 3)


Unnamed: 0_level_0,project_name,language,project_group
project_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
enwiki,English Wikipedia,English,enwiki
commonswiki,Wikimedia Commons,English,commons
wikidatawiki,Wikidata,English,wikidata
jawiki,Japanese Wikipedia,Japanese,jawiki
dewiki,German Wikipedia,German,dewiki


# Save the updated list

In [109]:
(active_projects
.sort_values("project_name")
.reset_index()
[["project_name", "project_key", "project_group"]]
# For some reason, this doesn't overwrite the old file sometimes
.to_csv("definitions/project-group-assignments.tsv", sep = "\t", index=False)
)