In [1]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import requests
# Optional: uncomment to lift pandas' row limit when displaying a frame.
# pd.set_option('display.max_rows', None)
# Lift the column limit so wide frames are displayed without elided columns.
pd.set_option('display.max_columns', None)

In [2]:
def extract_numeric_users(text):
    """Return the first number found in ``text`` as an int.

    Thousands separators written as commas are accepted, so ``"7,500"``
    yields ``7500``. Only the first number in the string is used.

    Args:
        text: String to search. ``None`` and the empty string are accepted.

    Returns:
        The parsed integer, or ``None`` when ``text`` is empty/``None`` or
        contains no digits.
    """
    if not text:
        return None
    # The match must start with a digit: a bare ``[\d,]+`` would also match a
    # lone comma (", 7,500"), and int('') then raises ValueError.
    match = re.search(r'\d[\d,]*', text)
    if match is None:
        return None
    # Remove the separators (including any trailing comma) before converting.
    return int(match.group(0).replace(',', ''))

def clean_text(text):
    """Return ``text`` without its trailing dot(s).

    Strings that do not end with a dot are returned unchanged. Note that
    every consecutive trailing dot is removed, not only the last one.
    """
    if not text.endswith('.'):
        return text
    return text.rstrip('.')

def fix_language_name(name):
    """Turn an inverted name such as ``"Arabic, Moroccan"`` into ``"Moroccan Arabic"``.

    Args:
        name: Language name. Non-string values (``None``, ``NaN``) are
            accepted and returned unchanged.

    Returns:
        The two comma-separated parts swapped and joined by a space when the
        name splits into exactly two parts; otherwise ``name`` unchanged.
    """
    # Missing names reach this function as None/NaN; without this guard the
    # membership test below raises TypeError.
    if not isinstance(name, str):
        return name
    if ',' in name:
        parts = [part.strip() for part in name.split(',')]
        # Only an unambiguous "Head, Modifier" pair is swapped.
        if len(parts) == 2:
            return f"{parts[1]} {parts[0]}"
    return name

In [3]:
# Open and read the HTML file
with open('../data/nl_ethno.html', 'r', encoding='utf-8') as f:
    html = f.read()

# Parse the HTML content
soup = BeautifulSoup(html, 'html.parser')

# Find the languages section using the section id "languages"
languages_section = soup.find('section', id='languages')

# find() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError on the find_all() call below.
if languages_section is None:
    raise ValueError("No <section id='languages'> found in ../data/nl_ethno.html")

# Find all language header blocks.
# NOTE: a class_ string containing a space is compared against the exact value
# of the class attribute, so both classes must appear in this order.
language_headers = languages_section.find_all('div', class_='languages__label entry__label')

In [4]:
# Sanity check: number of language header blocks matched in the page
len(language_headers)

95

In [5]:
def _parse_details(details_ul):
    """Collect the ``<i>Key:</i> value`` pairs of one details list into a dict.

    The key is the text of each ``<i>`` tag without its trailing colon; the
    value is taken from the node that immediately follows the tag. The
    "Users" value is reduced to its first number via ``extract_numeric_users``.
    """
    details = {}
    for i_tag in details_ul.find_all('i'):
        key = i_tag.get_text(strip=True).rstrip(':')

        sibling = i_tag.next_sibling
        if sibling is None:
            value = ""
        elif isinstance(sibling, str):
            # Text nodes are str subclasses, so str.strip() is safe here.
            value = sibling.strip()
        else:
            # The sibling is an element, which has no strip() method
            # (calling it fails with "'NoneType' object is not callable");
            # fall back to the element's text content.
            value = sibling.get_text(strip=True)

        # For certain keys, remove a trailing dot
        if key in {"Location", "Status", "Alternate Names", "Autonym", "Classification"}:
            value = clean_text(value)

        # For Users, keep only the numeric value
        details[key] = extract_numeric_users(value) if key == "Users" else value
    return details


languages_data = []

# Loop over each language header block
for header in language_headers:
    # The language name is the first child of the header, provided that child
    # is a text node; otherwise the name is recorded as missing.
    first_node = header.contents[0] if header.contents else None
    language_name = first_node.strip() if isinstance(first_node, str) else None

    # Extract the language code from the <a> tag inside the header
    code_tag = header.find('a', class_='chip')
    language_code = code_tag.get_text(strip=True) if code_tag else None

    # Find the corresponding details block: the next <ul> with class "languages__content entry__content"
    details_ul = header.find_next_sibling('ul', class_='languages__content entry__content')
    details = _parse_details(details_ul) if details_ul else {}

    # Combine language name, code, and details into a single record
    languages_data.append({'Language': language_name, 'Code': language_code, **details})

# Desired column order
new_order = ['Language', 'Classification', 'Status', 'Users', 'Location', 'Code', 'Alternate Names', 'Autonym']

# Build the frame in one pass:
# - reindex() selects the columns like df[new_order] but creates a missing
#   column as NaN instead of raising KeyError (e.g. when no entry has an
#   "Autonym", or when no entries were scraped at all);
# - rows are sorted by Status and then Users, both in descending order;
# - the index is renumbered after sorting.
df = (
    pd.DataFrame(languages_data)
    .reindex(columns=new_order)
    .sort_values(by=['Status', 'Users'], ascending=[False, False])
    .reset_index(drop=True)
)

# Correct inverted names; na_action='ignore' leaves missing names untouched
df['Language'] = df['Language'].map(fix_language_name, na_action='ignore')

df

Unnamed: 0,Language,Classification,Status,Users,Location,Code,Alternate Names,Autonym
0,Moroccan Arabic,"Afro-Asiatic, Semitic, Central, South, Arabic",Unestablished,428000,,ary,,
1,Sranan Tongo,"Creole, English based, Atlantic, Suriname",Unestablished,368000,,srn,,
2,Indonesian,"Austronesian, Malayo-Polynesian, Malayo-Chamic...",Unestablished,349000,,ind,,
3,Turkish,"Turkic, Southern, Turkish",Unestablished,308000,,tur,,
4,Polish,"Indo-European, Balto-Slavic, Slavic, West, Lec...",Unestablished,221000,,pol,,
...,...,...,...,...,...,...,...,...
90,Limburgish,"Indo-European, Germanic, West, Low Saxon-Low F...",5 (Developing). Statutory language of provinci...,700000,"Limburg province: Heerlen, Maastricht, Roermon...",lim,"Limberger, Limburgan, Limburgian, Limburgic, L...",Lèmburgs
91,Sign Language of the Netherlands,"Sign language, Deaf community sign language","5 (Developing). Recognized language (2021, Wet...",20500,Scattered,dse,"Dutch Sign Language, NGT, Nederlandse Gebarent...",
92,English,"Indo-European, Germanic, West, English",4 (Educational),15805000,Widespread,eng,Engels,
93,Frisian,"Indo-European, Germanic, West, Frisian",2 (Provincial). Statutory provincial language ...,718000,Friesland province; Groningen province: De Mar...,fry,"Fries, Westlauwers Fries",Frysk


In [6]:
# Write the table to CSV; index=False leaves the row index out of the file
df.to_csv('../data/nl_languages.csv', index=False)