# Fill in missing data
Where we have a reasonable guess.

Two scenarios
1. Fill in data that is missing because of a scraping error (e.g., no Mandarin speakers recorded in China)
2. When we have total-speaker data for most years but L1 and L2 data for only a few years, we can reasonably assume that the ratio of L1 speakers to population is roughly constant, estimate L1 from that ratio, and then calculate L2 from the total and the estimated L1

## Fix Scrape Error

In [None]:
import pandas as pd
import numpy as np

import math

In [None]:
# Lookup tables shared by the cells below.

# Language code -> language name
lang_code_to_name = {
    "arz": "Egyptian Arabic",
    "ben": "Bengali",
    "cmn": "Mandarin Chinese",
    "deu": "German",
    "eng": "English",
    "fra": "French",
    "hin": "Hindi",
    "ind": "Indonesian",
    "jpn": "Japanese",
    "por": "Portuguese",
    "rus": "Russian",
    "spa": "Spanish",
    "urd": "Urdu",
}

# Language name -> language code (exact inverse of the table above)
name_to_lang_code = {name: code for code, name in lang_code_to_name.items()}

# Language code -> the single country that scrape gaps for that language are attributed to
lang_code_to_country = {
    "arz": "Egypt",
    "ben": "Bangladesh",
    "cmn": "China",
    "deu": "Germany",
    "eng": "United Kingdom",
    "fra": "France",
    "hin": "India",
    "ind": "Indonesia",
    "jpn": "Japan",
    "por": "Portugal",
    "rus": "Russian Federation",
    "spa": "Spain",
    "urd": "Pakistan",
}

In [None]:
# For each speaker type, compare the per-country speaker counts against the
# manually collected global counts and attribute any shortfall for a language
# to that language's designated country (see `lang_code_to_country`).
speaker_types = ["total", "l1", "l2"]

# One gap-filled frame per speaker type, in the same order as `speaker_types`
finished_dfs = []
for speaker_type in speaker_types:
    # Per-country speaker counts, keyed by (Year, Country)
    speakers_df = pd.read_csv(f'../data/cleaned_{speaker_type}_speakers.csv')
    speakers_df.set_index(['Year', "Country"], inplace=True)

    # Manually collected global speaker counts: one row per language, one
    # column per year. Column labels are left as read from the CSV, which is
    # why the lookup below uses `str(year)`.
    manual_speakers = pd.read_csv(f'../misc_data/manual_{speaker_type}_speakers.csv')
    manual_speakers.set_index('Language', inplace=True)

    # Re-key the manual rows from language names to language codes
    manual_speakers.index = manual_speakers.index.map(name_to_lang_code)

    years = speakers_df.index.get_level_values(0).unique()

    for year in years:
        # global number of speakers per language according to the manual data
        manual_speakers_year = manual_speakers[str(year)]

        # global number of speakers per language summed over all countries
        speakers_df_year = speakers_df.loc[year]
        agg_speakers_df_year = speakers_df_year.sum(axis=0)

        # whatever the manual figure has that the per-country data lacks
        diff = manual_speakers_year - agg_speakers_df_year

        # skip index labels that are null (e.g. a manual language name with
        # no entry in name_to_lang_code maps to NaN)
        languages = diff.index[diff.index.notnull()]

        for lang in languages:
            if agg_speakers_df_year[lang] == 0:
                # No country reports this language at all for this year:
                # report it and leave the data untouched.
                print(year)
                print("agg_speakers_df_year[lang]: ", agg_speakers_df_year[lang])
                continue

            # Write the difference into the designated country's cell.
            # A single .loc call with a (row-key, column) pair is required:
            # the chained form `df.loc[year, country][lang] = value` assigns
            # into an intermediate Series that may be a copy, so the fill
            # could be silently discarded.
            speakers_df.loc[(year, lang_code_to_country[lang]), lang] = diff[lang]
    finished_dfs.append(speakers_df)



In [None]:
# finished_dfs is ordered total, l1, l2; pull the three frames out by position
total_df, l1_df, l2_df = finished_dfs[:3]

In [None]:
# Put Country ahead of Year in the row index of every frame
country_first = ["Country", "Year"]
total_df, l1_df, l2_df = (
    frame.reorder_levels(country_first) for frame in (total_df, l1_df, l2_df)
)

# Show the rows for these two countries as the cell output
total_df.loc[["Portugal",  "Brazil"]]

In [None]:
# Attach population figures to each speaker frame so we can verify that we
# are not overestimating speaker counts.

# Loading logic copied from `streamlined_ratio_model.ipynb`
unused_pop_columns = ["Indicator Name", "Indicator Code", "Country Code", "Unnamed: 67"]
pop_df = (
    pd.read_csv('../raw_data/World_Population_Data.csv', header=2)
    .drop(columns=unused_pop_columns)
    .set_index("Country Name")
)
# rename index to Country
pop_df.index.names = ["Country"]

# the remaining column labels are cast to ints
pop_df.columns = pop_df.columns.astype(int)

# Country name in the population data -> country name in the language data
pop_to_lang_country_map = {
    "Bahamas, The": "Bahamas",
    "Brunei Darussalam": "Brunei",
    "Cabo Verde": "Cape Verde Islands",
    "Hong Kong SAR, China": "Hong Kong",
    "Macao SAR, China": "Macao",
    "Congo, Rep.": "Congo",
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    "Cote d'Ivoire": "Côte d’Ivoire",
    "Timor-Leste": "East Timor",
    "Egypt, Arab Rep.": "Egypt",
    "Gambia, The": "Gambia",
    "Iran, Islamic Rep.": "Iran",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Lao PDR": "Laos",
    "West Bank and Gaza": "Palestine",
    'St. Kitts and Nevis': "Saint Kitts and Nevis",
    "St. Lucia": "Saint Lucia",
    "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
    "St. Martin (French part)": "Saint Martin",
    "Sint Maarten (Dutch part)": "Sint Maarten",
    "Slovak Republic": "Slovakia",
    "Korea, Rep.": "South Korea",
    "Syrian Arab Republic": "Syria",
    "Sao Tome and Principe": "São Tomé e Príncipe",
    "Turkiye": "Turkey",
    "Virgin Islands (U.S.)": "U.S. Virgin Islands",
    "Venezuela, RB": "Venezuela",
    "Viet Nam": "Vietnam",
    "Yemen, Rep.": "Yemen",
}

# Not included:
# Anguilla, British Indian Ocean Territory, Caribbean Netherlands, Taiwan, Christmas Island, Cocos (Keeling) Islands, Cook Islands, Falkland Islands, French Guiana, Guadeloupe, Guernsey, Martinique, Mayotte, Niue, Norfolk Island, Réunion, Saint Barthélemy, Saint Helena, Saint Helena, Ascension, and Tristan da Cunha, Saint Pierre and Miquelon, Taiwan, Tokelau, Wallis and Futuna, Western Sahara

# ISSUES - fixed in the original Ethnologue processing `data_cleaning_notebooks/extract_ethnologue_data.ipynb`
# Czechia: it changed name, which messed up the data
# Eswatini also changed name!!
# Vietnam changed from Viet Nam to Vietnam

# Rename the countries in the population df to match the language df;
# names without an entry in the map are kept unchanged
pop_df.index = pop_df.index.map(
    lambda country: pop_to_lang_country_map.get(country, country)
)

# Collapse the year columns into a second index level -> Series keyed by (Country, Year)
stacked_pop_df = pop_df.stack()
stacked_pop_df.index.names = ["Country", "Year"]

# Add the population as a new column on every speaker frame
total_df, l1_df, l2_df = (
    frame.assign(Population=stacked_pop_df) for frame in (total_df, l1_df, l2_df)
)

total_df

In [None]:
# Report every (row, language) entry whose speaker count exceeds the population

frames_by_name = {"total": total_df, "l1": l1_df, "l2": l2_df}

for name, df in frames_by_name.items():
    print(name)

    # every column except the last one, which is the Population column
    lang_columns = df.columns[:-1]

    for key, record in df.iterrows():
        population = record["Population"]
        for lang in lang_columns:
            speakers = record[lang]
            if not (speakers > population):
                continue
            print(key, lang, speakers, population)