# Extract Ethnologue data

If we ever scrape again and get more data, use this script to recreate the total speakers, L1, and L2 data files.

In [None]:
# Filepaths

import os

# Destinations for the three cleaned CSV files.
# Point these somewhere else if the original files must not be overwritten.
l1_speakers_filepath = os.path.join("..", "data", "cleaned_l1_speakers.csv")
l2_speakers_filepath = os.path.join("..", "data", "cleaned_l2_speakers.csv")
total_speakers_filepath = os.path.join("..", "data", "cleaned_total_speakers.csv")

# Show one of the resolved paths as a quick sanity check
print(total_speakers_filepath)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw tab-separated table. It has no header row, so the columns
# arrive numbered from 0 and are relabelled straight away.
lang_df = pd.read_table("../populations.tsv", sep="\t", header=None).rename(
    columns={
        0: "Language",
        1: "Country",
        # Data columns get integer labels (translated to years later via
        # edition_to_year).
        # NOTE(review): column 10 maps to 25, so 24 is skipped — confirm
        # this is intentional.
        2: 16,
        3: 17,
        4: 18,
        5: 19,
        6: 20,
        7: 21,
        8: 22,
        9: 23,
        10: 25,
        # the last column is discarded once the values have been extracted
        11: "drop",
    }
)

# Trim stray whitespace from both key columns, then use them as the index
lang_df = lang_df.assign(
    Language=lang_df["Language"].str.strip(),
    Country=lang_df["Country"].str.strip(),
).set_index(["Language", "Country"])

lang_df

## Total Speakers

In [None]:
# Grab just the total number of speakers (ie the first number in the tuple)

def extract_total_speakers(cell):
    """Return the total-speaker figure held in one raw table cell.

    A cell is handled as one of: the number 0, NaN, an already-numeric
    value, or the text of a Python literal (a plain number, or a tuple
    whose first element is the total).

    Args:
        cell: Raw cell value taken from the populations table.

    Returns:
        0 for a zero or NaN cell; the first element when the cell parses to
        a tuple; otherwise the parsed (or already numeric) value itself.

    Raises:
        ValueError, SyntaxError: If a text cell is not a valid Python literal.
    """
    # Imported here so the function does not rely on any other cell's imports.
    import ast

    if cell == 0:
        return 0

    # np.isnan only accepts numeric input; text cells raise TypeError and
    # simply fall through to the parsing step below.
    try:
        if np.isnan(cell):
            return 0
    except TypeError:
        pass

    if isinstance(cell, str):
        # The text comes from scraped data, so it is parsed with
        # literal_eval (literals only) rather than eval (arbitrary code).
        # Surrounding whitespace is stripped first because literal_eval
        # rejects leading whitespace on some Python versions.
        parsed = ast.literal_eval(cell.strip())
    else:
        # A column holding only plain numbers may already be numeric;
        # there is nothing to parse in that case.
        parsed = cell

    if isinstance(parsed, tuple):
        return parsed[0]
    return parsed
    

# Reduce every cell to its total-speaker figure, then discard the helper
# column that was labelled "drop" on import
total_speakers_df = lang_df.applymap(extract_total_speakers).drop(columns="drop")
total_speakers_df

In [None]:
# Maps each edition number used as a column label to a year
edition_to_year = {
    13: 1996,
    14: 2000,
    15: 2005,
    16: 2009,
    17: 2014,
    18: 2015,
    19: 2016,
    20: 2017,
    21: 2018,
    22: 2019,
    23: 2020,
    24: 2021,
    25: 2022,
}

# Relabel the columns as years, move the years into the row index, and
# spread the languages out as columns
total_speakers_df = (
    total_speakers_df.rename(columns=edition_to_year)
    .stack()
    .unstack("Language")
)

# the level produced by stack() has no name yet; call it "Year"
total_speakers_df.index.names = ["Country", "Year"]

total_speakers_df

## L1

In [None]:
# Grab just the L1 number of speakers (ie the second number in the tuple)

def extract_L1_speakers(cell):
    """Return the L1-speaker figure held in one raw table cell.

    Only cells whose text parses to a tuple carry an L1 figure (its second
    element); every other kind of cell yields 0.

    Args:
        cell: Raw cell value taken from the populations table.

    Returns:
        The second element when the cell parses to a tuple, otherwise 0
        (zero, NaN, an already-numeric value, or text that parses to a
        non-tuple literal).

    Raises:
        ValueError, SyntaxError: If a text cell is not a valid Python literal.
        IndexError: If the parsed tuple has fewer than two elements.
    """
    # Imported here so the function does not rely on any other cell's imports.
    import ast

    if cell == 0:
        return 0

    # np.isnan only accepts numeric input; text cells raise TypeError and
    # simply fall through to the parsing step below.
    try:
        if np.isnan(cell):
            return 0
    except TypeError:
        pass

    if not isinstance(cell, str):
        # A column holding only plain numbers may already be numeric; a
        # plain number has no L1 breakdown.
        return 0

    # The text comes from scraped data, so it is parsed with literal_eval
    # (literals only) rather than eval (arbitrary code). Surrounding
    # whitespace is stripped first because literal_eval rejects leading
    # whitespace on some Python versions.
    parsed = ast.literal_eval(cell.strip())
    if isinstance(parsed, tuple):
        return parsed[1]
    return 0
    

# Reduce every cell to its L1-speaker figure, then discard the helper
# column that was labelled "drop" on import
l1_speakers_df = lang_df.applymap(extract_L1_speakers).drop(columns="drop")
l1_speakers_df

In [None]:
# Relabel the columns as years, move the years into the row index, and
# spread the languages out as columns
l1_speakers_df = (
    l1_speakers_df.rename(columns=edition_to_year)
    .stack()
    .unstack("Language")
)

# the level produced by stack() has no name yet; call it "Year"
l1_speakers_df.index.names = ["Country", "Year"]

l1_speakers_df

## L2

In [None]:
# Grab just the L2 number of speakers (ie the third number in the tuple)

def extract_L2_speakers(cell):
    """Return the L2-speaker figure held in one raw table cell.

    Only cells whose text parses to a tuple carry an L2 figure (its third
    element); every other kind of cell yields 0.

    Args:
        cell: Raw cell value taken from the populations table.

    Returns:
        The third element when the cell parses to a tuple, otherwise 0
        (zero, NaN, an already-numeric value, or text that parses to a
        non-tuple literal).

    Raises:
        ValueError, SyntaxError: If a text cell is not a valid Python literal.
        IndexError: If the parsed tuple has fewer than three elements.
    """
    # Imported here so the function does not rely on any other cell's imports.
    import ast

    if cell == 0:
        return 0

    # np.isnan only accepts numeric input; text cells raise TypeError and
    # simply fall through to the parsing step below.
    try:
        if np.isnan(cell):
            return 0
    except TypeError:
        pass

    if not isinstance(cell, str):
        # A column holding only plain numbers may already be numeric; a
        # plain number has no L2 breakdown.
        return 0

    # The text comes from scraped data, so it is parsed with literal_eval
    # (literals only) rather than eval (arbitrary code). Surrounding
    # whitespace is stripped first because literal_eval rejects leading
    # whitespace on some Python versions.
    parsed = ast.literal_eval(cell.strip())
    if isinstance(parsed, tuple):
        return parsed[2]
    return 0
    

# Reduce every cell to its L2-speaker figure, then discard the helper
# column that was labelled "drop" on import
l2_speakers_df = lang_df.applymap(extract_L2_speakers).drop(columns="drop")
l2_speakers_df

In [None]:
# Relabel the columns as years, move the years into the row index, and
# spread the languages out as columns
l2_speakers_df = (
    l2_speakers_df.rename(columns=edition_to_year)
    .stack()
    .unstack("Language")
)

# the level produced by stack() has no name yet; call it "Year"
l2_speakers_df.index.names = ["Country", "Year"]

l2_speakers_df

In [None]:
# Spot-check: display the L2 rows whose first index level is "Czechia"
l2_speakers_df.loc["Czechia"]

# Fix Country Name Changes

In [None]:
def combine_country_with_two_names(name1, name2, df):
    """Merge the rows of a country that appears under two names.

    ``df`` must have a two-level row index whose first level is the country
    name. Rows stored under ``name2`` are added to the ``name1`` rows that
    share the same second-level key, column by column; a value missing on
    one side only counts as 0. The ``name2`` rows are then removed.

    Only second-level keys that already exist under ``name1`` are kept: a
    key that occurs under ``name2`` alone is dropped together with the
    other ``name2`` rows.

    Args:
        name1: Country name to keep.
        name2: Alternate country name to fold into ``name1`` and drop.
        df: Frame to merge; it is not modified.

    Returns:
        A copy of ``df``, in the original row order, whose ``name1`` rows
        hold the sums and which has no ``name2`` rows.

    Raises:
        KeyError: If ``name1`` or ``name2`` is not present in the index.
    """
    # Sum the two per-country slices; each is indexed by the second level.
    combined = df.loc[name1].add(df.loc[name2], fill_value=0)

    merged = df.copy()

    # Locate the name1 rows and the second-level key of each, in frame order.
    is_name1 = merged.index.get_level_values(0) == name1
    second_level_keys = merged.index.get_level_values(1)[is_name1]

    # Overwrite all name1 rows in a single assignment. Selecting by key and
    # by merged.columns lines the summed values up with the target rows
    # and columns.
    merged.loc[is_name1, :] = combined.loc[
        second_level_keys, merged.columns
    ].to_numpy()

    return merged.drop(name2)

In [None]:
# Try the merge on a single pair of names and display the resulting "Czechia" rows
combine_country_with_two_names("Czechia", "Czech Republic", total_speakers_df).loc["Czechia"]


In [None]:
dfs = [total_speakers_df, l1_speakers_df, l2_speakers_df]

# Apply the same sequence of merges to each of the three tables.
# Each pair is (name to keep, alternate name to fold in and drop).
new_dfs = []
for df in dfs:
    for kept_name, dropped_name in (
        ("Czechia", "Czech Republic"),
        ("Vietnam", "Viet Nam"),
        ("Eswatini", "Swaziland"),
        ("Taiwan", "China–Taiwan"),
        ("Hong Kong", "China–Hong Kong"),
        ("Macao", "China–Macao"),
    ):
        df = combine_country_with_two_names(kept_name, dropped_name, df)
    new_dfs.append(df)

# rebind the three tables to their merged versions (same order as dfs)
total_speakers_df, l1_speakers_df, l2_speakers_df = new_dfs

In [None]:
# verify that Czechia and Czech Republic are combined:
# display the "Czechia" rows of the merged total-speakers table
total_speakers_df.loc["Czechia"]

In [None]:
# drop duplicated 

# Export to csv files

In [None]:
# Write each table to the path configured for it at the top of the notebook
for table, destination in (
    (total_speakers_df, total_speakers_filepath),
    (l1_speakers_df, l1_speakers_filepath),
    (l2_speakers_df, l2_speakers_filepath),
):
    table.to_csv(destination)