In [1]:
import json
import re

import pandas as pd
import requests
from wmfdata import mariadb

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
def get_dblist(list_name):
    list_url = ("https://noc.wikimedia.org/conf/dblists/" + list_name + ".dblist")
    list_content = requests.get(list_url).text.split("\n")
    return pd.Series(list_content)

# Basic data from MediaWiki sites table

In [4]:
wikis = mariadb.run("""
select
    site_global_key as database_code,
    concat(trim(leading "." from reverse(site_domain))) as domain_name,
    site_group as database_group,
    site_language as language_code
from sites
""", "enwiki").sort_values("database_code").set_index("database_code")

wikis.head(10)

Unnamed: 0_level_0,domain_name,database_group,language_code
database_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aawiki,aa.wikipedia.org,wikipedia,aa
aawikibooks,aa.wikibooks.org,wikibooks,aa
aawiktionary,aa.wiktionary.org,wiktionary,aa
abwiki,ab.wikipedia.org,wikipedia,ab
abwiktionary,ab.wiktionary.org,wiktionary,ab
acewiki,ace.wikipedia.org,wikipedia,ace
advisorswiki,advisors.wikimedia.org,advisors,en
advisorywiki,advisory.wikimedia.org,advisory,en
adywiki,ady.wikipedia.org,wikipedia,ady
afwiki,af.wikipedia.org,wikipedia,af


# Language names

In [6]:
lang_urls = [
    "https://raw.githubusercontent.com/wikimedia/mediawiki-extensions-cldr/master/CldrNames/CldrNamesEn.php",
    "https://raw.githubusercontent.com/wikimedia/mediawiki-extensions-cldr/master/LocalNames/LocalNamesEn.php"
]

def get_lang_names(url):
    r = requests.get(url)
    m = re.search(r"languageNames = (\[[\s\S]+?\])", r.text)
    php_ln = m.group(1)
    
    json_ln = php_ln
    repl = [
        # Convert from PHP array format to JSON
        (" =>", ":"),
        ("\[", "{"),
        ("\]", "}"),
        # Trailing commas will cause problems
        (",\n}", "\n}"),
        # ...so will single quotes
        ("'", '"'),
        # ...and comments
        (r"/\*[\s\S]*?\*/", ""),
        (r"#(.*?)\n", ""),
        # One hack to deal with a single quote in a language name
        ('O"odham', "O'odham")
    ]
    for old, new in repl:
        json_ln = re.sub(old, new, json_ln)
    
    py_ln = json.loads(json_ln)
    return py_ln

langs = {}
for url in lang_urls:
    langs.update(get_lang_names(url))

# Add languages not included in the CLDR files
langs.update({
    "als": "Alsatian",
    "atj": "Atikamekw",
    "diq": "Zazaki",
    "fiu-vro": "Võro",
    "map-bms": "Banyumasan",
    "nah": "Nahuatl",
    "pih": "Norfuk-Pitkern",
    "rmy": "Vlax Romani",
    "simple": "Simple English"
})

wikis["language_name"] = wikis["language_code"].apply(langs.get)

wikis.head(10)

Unnamed: 0_level_0,domain_name,database_group,language_code,language_name
database_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aawiki,aa.wikipedia.org,wikipedia,aa,Afar
aawikibooks,aa.wikibooks.org,wikibooks,aa,Afar
aawiktionary,aa.wiktionary.org,wiktionary,aa,Afar
abwiki,ab.wikipedia.org,wikipedia,ab,Abkhazian
abwiktionary,ab.wiktionary.org,wiktionary,ab,Abkhazian
acewiki,ace.wikipedia.org,wikipedia,ace,Achinese
advisorswiki,advisors.wikimedia.org,advisors,en,English
advisorywiki,advisory.wikimedia.org,advisory,en,English
adywiki,ady.wikipedia.org,wikipedia,ady,Adyghe
afwiki,af.wikipedia.org,wikipedia,af,Afrikaans


# Closed and private wikis

In [7]:
closed = get_dblist("closed")
private = get_dblist("private")

def apply_to_index(df, true_list, true_label, false_label):
    idx_ser = df.index.to_series()
    return idx_ser.isin(true_list).apply(lambda x: true_label if x else false_label)

wikis = (
    wikis
    .assign(
        status=lambda df: apply_to_index(df, closed, "closed", "open"),
        visbility=lambda df: apply_to_index(df, private, "private", "public")
    )
)

wikis.head(10)

Unnamed: 0_level_0,domain_name,database_group,language_code,language_name,status,visbility
database_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aawiki,aa.wikipedia.org,wikipedia,aa,Afar,closed,public
aawikibooks,aa.wikibooks.org,wikibooks,aa,Afar,closed,public
aawiktionary,aa.wiktionary.org,wiktionary,aa,Afar,closed,public
abwiki,ab.wikipedia.org,wikipedia,ab,Abkhazian,open,public
abwiktionary,ab.wiktionary.org,wiktionary,ab,Abkhazian,closed,public
acewiki,ace.wikipedia.org,wikipedia,ace,Achinese,open,public
advisorswiki,advisors.wikimedia.org,advisors,en,English,open,private
advisorywiki,advisory.wikimedia.org,advisory,en,English,closed,public
adywiki,ady.wikipedia.org,wikipedia,ady,Adyghe,open,public
afwiki,af.wikipedia.org,wikipedia,af,Afrikaans,open,public


# Site families—UNFINISHED!

In [27]:
wikis.query("database_code.str.contains('test')")

Unnamed: 0,database_code,site_group,language_code,domain_name
857,testwiki,test,en,https://test.wikipedia.org
858,test2wiki,test2,en,https://test2.wikipedia.org
859,testwikidatawiki,testwikidata,en,https://test.wikidata.org
895,labtestwiki,labtest,en,https://labtestwikitech.wikimedia.org
939,testcommonswiki,testcommons,en,https://test-commons.wikimedia.org


In [24]:
wikis["site_group"].unique()

array(['wikipedia', 'wiktionary', 'wikibooks', 'wikiquote', 'wikisource',
       'wikinews', 'wikiversity', 'wikivoyage', 'wikidata', 'advisory',
       'arwikimedia', 'arbcom-de', 'arbcom-en', 'arbcom-fi', 'arbcom-nl',
       'auditcom', 'bdwikimedia', 'bewikimedia', 'betawikiversity',
       'board', 'boardgovcom', 'brwikimedia', 'chair', 'chapcom',
       'checkuser', 'cowikimedia', 'collab', 'commons', 'dkwikimedia',
       'donate', 'etwikimedia', 'exec', 'fdc', 'fiwikimedia',
       'foundation', 'grants', 'iegcom', 'ilwikimedia', 'incubator',
       'internal', 'login', 'mediawiki', 'meta', 'mkwikimedia',
       'movementroles', 'mxwikimedia', 'nlwikimedia', 'nowikimedia',
       'noboard-chapterswikimedia', 'nostalgia', 'nycwikimedia',
       'nzwikimedia', 'office', 'ombudsmen', 'otrs-wiki', 'outreach',
       'pa-uswikimedia', 'plwikimedia', 'quality', 'rswikimedia',
       'ruwikimedia', 'sewikimedia', 'searchcom', 'sources', 'spcom',
       'species', 'steward', 'strategy',

In [22]:
wikis.query("site_group == 'wikimania'")

Unnamed: 0,database_code,site_group,language_code,domain_name
933,wikimaniawiki,wikimania,en,https://wikimania.wikimedia.org


In [None]:
# Site groups where the project names follow the pattern of language code + site group
standard_names = [
    "wikibooks",
    "wikinews",
    "wikipedia",
    "wikiquote",
    "wikisource",
    "wikiversity",
    "wikivoyage",
    "wiktionary",
]

# Site groups where project names don't follow the pattern of language code + site group
custom_families = {
    "Test": {
        "test": "Test Wiki"
        "test2": "Test 2 Wiki"
        "testcommons":  "Test Wikimedia Commons",
        "testwikidata": "Test Wikidata""
    },
    "Affiliates": {
        "punjabiwikimedia": "Punjabi Wikimedians User Group"
    },
    
}


affiliate = get_dblist("wikimedia")
wikimania = get_dblist("wikimania")


def proj_name(row):
    proj = row.loc["project_code"]
    unified = unified_projects.get(proj)
    if unified:
        return unified
    else:
        return proj.title()

def wiki_name(row):
    proj = row.loc["project_code"]
    unified = unified_projects.get(proj)
    if unified:
        return unified
    else:
        return " ".join([lang_name(row), proj_name(row)])
    
wikis["project_name"] = wikis.apply(proj_name, axis=1)
wikis["wiki_name"] = wikis.apply(wiki_name, axis=1)

# Write CSV

In [8]:
wikis.to_csv("../wikis.csv")