In [29]:
import requests
import pandas as pd
from pandas import DataFrame

In [24]:
def extract_titles_and_pageids(wiki_url):
    """
    Fetch one batch of category members from a Wikipedia/Wiktionary category page.

    Pagination state lives in the module-level global ``cmcontinue``. The caller must set it
    to '' before the first call (the name has to exist, otherwise a NameError is raised).
    Every call reads the current token, requests the next batch and then overwrites the
    global with the token for the following batch, or with None once the API response no
    longer carries a continuation token.

    Parameters
    ----------
    wiki_url : str
        Full API URL of the ``list=categorymembers`` query, without a ``cmcontinue``
        parameter (it is added here when needed).

    Returns
    -------
    tuple
        ``(titles, cmcontinue)`` where ``titles`` is the list of page titles in this batch
        and ``cmcontinue`` is the updated continuation token (None after the last batch).
        Despite the function's name, page ids are not returned.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    KeyError
        If the JSON payload has no ``query.categorymembers`` entry.
    """

    # Pull in the global variable that stores the continuation token between calls.
    global cmcontinue

    # An empty token means "start at the first batch". A None token (left over from an
    # exhausted run) is treated the same way instead of being sent as the text 'None'.
    # The token goes through `params` so that requests URL-encodes it; tokens contain
    # characters such as '|' that must not be pasted into the URL verbatim.
    params = {'cmcontinue': cmcontinue} if cmcontinue else None

    # Fail loudly on HTTP errors and never hang forever on a stalled connection.
    resp = requests.get(wiki_url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # The 'continue' entry is only present while further batches remain.
    cmcontinue = data.get('continue', {}).get('cmcontinue')

    # Extract the title of every category member in this batch.
    titles = [member['title'] for member in data['query']['categorymembers']]

    # Return the titles together with the updated continuation token.
    return (titles, cmcontinue)

In [26]:
# Collect every page title in Wiktionary's "German feminine nouns" category.
fem_url = 'https://en.wiktionary.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:German_feminine_nouns&cmlimit=500&format=json'
cmcontinue = ''  # empty token: extract_titles_and_pageids starts at the first batch

# Every call contributes two entries to all_results: the batch's list of titles,
# followed by the continuation token. The loop ends once the token becomes None.
all_results = []
while cmcontinue is not None:
    all_results.extend(extract_titles_and_pageids(fem_url))

# The title batches sit at the even positions; flatten them into a single list.
fem_nouns = [title for batch in all_results[::2] for title in batch]

In [31]:
# Collect every page title in Wiktionary's "German masculine nouns" category.
masc_url = 'https://en.wiktionary.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:German_masculine_nouns&cmlimit=500&format=json'
cmcontinue = ''  # empty token: extract_titles_and_pageids starts at the first batch

# Every call contributes two entries to all_results: the batch's list of titles,
# followed by the continuation token. The loop ends once the token becomes None.
all_results = []
while cmcontinue is not None:
    all_results.extend(extract_titles_and_pageids(masc_url))

# The title batches sit at the even positions; flatten them into a single list.
masc_nouns = [title for batch in all_results[::2] for title in batch]

In [33]:
# Collect every page title in Wiktionary's "German neuter nouns" category.
neut_url = 'https://en.wiktionary.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:German_neuter_nouns&cmlimit=500&format=json'
cmcontinue = ''  # empty token: extract_titles_and_pageids starts at the first batch

# Every call contributes two entries to all_results: the batch's list of titles,
# followed by the continuation token. The loop ends once the token becomes None.
all_results = []
while cmcontinue is not None:
    all_results.extend(extract_titles_and_pageids(neut_url))

# The title batches sit at the even positions; flatten them into a single list.
neut_nouns = [title for batch in all_results[::2] for title in batch]

In [38]:
# Build one single-column frame per gender and tag every row with a constant
# 'gender' label.
fem_df = DataFrame(fem_nouns, columns=['noun']).assign(gender='feminine')

masc_df = DataFrame(masc_nouns, columns=['noun']).assign(gender='masculine')

# NOTE(review): the source category is "German neuter nouns"; the label text
# 'neutral' is kept unchanged so anything filtering on it keeps working.
neut_df = DataFrame(neut_nouns, columns=['noun']).assign(gender='neutral')

In [39]:
# Stack the three per-gender frames into one table (feminine, masculine, neuter order).
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement. Like the chained append calls it keeps each frame's own row
# index, so index values repeat across the three segments.
nouns = pd.concat([fem_df, masc_df, neut_df])

In [40]:
# Show the combined noun/gender table as this cell's output.
nouns

Unnamed: 0,noun,gender
0,Aa,feminine
1,Aa-Wurst,feminine
2,Aachenerin,feminine
3,Aalbeere,feminine
4,Aalhaut,feminine
5,Aalmutter,feminine
6,Aam,feminine
7,Aare,feminine
8,Aarmühle,feminine
9,Aasfliege,feminine
