# Ratio Model 

We're going to combine the historic language data with historic population data by taking the ratio of speakers to population and then the log odds of that ratio. Then we'll project the log odds into the future linearly, undo the log odds, and multiply by UN population projections to get the projected number of speakers.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import os

from math import nan

import ratio_model_utils as utils

## Bring in data

Most of this data was preprocessed by scripts in the folder `data_cleaning_notebooks` with raw data from the `raw_data` folder.

In [None]:
# Read the three interpolated speaker tables and index each by (Country, Year)

total_speakers_df = pd.read_csv(
    os.path.join("data", "interpolated_total_speakers.csv")
).set_index(["Country", "Year"])
l1_speakers_df = pd.read_csv(
    os.path.join("data", "interpolated_l1_speakers.csv")
).set_index(["Country", "Year"])
l2_speakers_df = pd.read_csv(
    os.path.join("data", "interpolated_l2_speakers.csv")
).set_index(["Country", "Year"])

total_speakers_df

In [None]:
# Historic population table, indexed by (Country, Year) like the speaker tables
pop_df = pd.read_csv(
    os.path.join("data", "population_historic.csv"),
    header=0,
).set_index(["Country", "Year"])
pop_df

In [None]:
# pop_df = pop_df.drop(columns = ["Indicator Name", "Indicator Code", "Country Code", "Unnamed: 67"])
# pop_df = pop_df.set_index("Country Name")
# # rename index to Country
# pop_df.index.names = ["Country"]

# pop_df.columns = pop_df.columns.astype(int)
# pop_df

### Check which country names don't align

Different data sources use different country names; next time we'll use the ISO country codes instead.

In [None]:
# Distinct country names in each source (level 0 of both indexes is Country)
lang_countries = total_speakers_df.index.unique(level=0)
pop_countries = pop_df.index.unique(level=0)

# How many countries each source covers
lang_countries.size, pop_countries.size

In [None]:
# Countries present in the language data but missing from the population data
lang_only_countries = lang_countries.difference(pop_countries, sort=None)
lang_only_countries

In [None]:
# Countries present in the population data but missing from the language data
pop_only_countries = pop_countries.difference(lang_countries, sort=None)
pop_only_countries

In [None]:
# pop_to_lang_country_map = {"Bahamas, The": "Bahamas", "Brunei Darussalam": "Brunei", "Cabo Verde": "Cape Verde Islands", "Hong Kong SAR, China": "Hong Kong", "Macao SAR, China": "Macao", "Congo, Rep.": "Congo", "Congo, Dem. Rep.": "Democratic Republic of the Congo", "Cote d'Ivoire": "Côte d’Ivoire", "Timor-Leste": "East Timor", "Egypt, Arab Rep.": "Egypt", "Gambia, The": "Gambia", "Iran, Islamic Rep.": "Iran", "Kyrgyz Republic": "Kyrgyzstan", "Lao PDR": "Laos", "West Bank and Gaza": "Palestine", 'St. Kitts and Nevis': "Saint Kitts and Nevis", "St. Lucia": "Saint Lucia", "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines", "St. Martin (French part)": "Saint Martin", "Sint Maarten (Dutch part)": "Sint Maarten", "Slovak Republic": "Slovakia", "Korea, Rep.": "South Korea", "Syrian Arab Republic": "Syria", "Sao Tome and Principe": "São Tomé e Príncipe", "Turkiye": "Turkey", "Virgin Islands (U.S.)": "U.S. Virgin Islands", "Venezuela, RB": "Venezuela", "Viet Nam": "Vietnam", "Yemen, Rep.": "Yemen"}

# # not included
# # Anguilla, British Indian Ocean Territory, Caribbean Netherlands, Taiwan, Christmas Island, Cocos (Keeling) Islands, Cook Islands, Falkland Islands, French Guiana, Guadeloupe, Guernsey, Martinique, Mayotte, Niue, Norfolk Island, Réunion, Saint Barthélemy, Saint Helena, Saint Helena, Ascension, and Tristan da Cunha, Saint Pierre and Miquelon, Tokelau, Wallis and Futuna, Western Sahara

# # ISSUES - fixed in the original Ethnologue processing `data_cleaning_notebooks/extract_ethnologue_data.ipynb`
# # Czechia: it changed name, which messed up the data
# # Eswatini also changed name!!
# # Vietnam changed from Viet Nam to Vietnam

# # Rename the countries in the population df to match the language df
# pop_df.index = pop_df.index.map(lambda x: pop_to_lang_country_map[x] if x in pop_to_lang_country_map else x)

# # verify that all the countries in the language df are in the population df
# pop_countries = pop_df.index.unique()
# lang_countries.difference(pop_countries)

### Combine Population and Language Data



In [None]:
# stacked_pop_df = pop_df.stack()
# stacked_pop_df.index.names = ["Country", "Year"]
# stacked_pop_df

In [None]:
# Attach the historic population to each speaker table as a "Population" column.
# NOTE(review): the whole pop_df frame is passed, so this presumably relies on
# pop_df holding a single population column — confirm.
total_speakers_df, l1_speakers_df, l2_speakers_df = (
    speakers.assign(Population=pop_df)
    for speakers in (total_speakers_df, l1_speakers_df, l2_speakers_df)
)

l1_speakers_df

### Country-Region Mapping

In [None]:
# Country -> Region lookup table, indexed by country name
country_region_df = pd.read_csv(os.path.join("data", "regions.csv")).set_index("Country")

# Keep only the Region column, as a Series keyed by country
country_region_series = pd.Series(
    country_region_df["Region"],
    index=country_region_df.index,
)
country_region_series

In [None]:
# Tag every row of the three speaker tables with its region by mapping the
# country name (level 0 of the index) through country_region_series.
total_speakers_df, l1_speakers_df, l2_speakers_df = (
    speakers.assign(
        Region=speakers.index.get_level_values(0).map(country_region_series)
    )
    for speakers in (total_speakers_df, l1_speakers_df, l2_speakers_df)
)

l1_speakers_df

In [None]:
# Countries whose name found no match in the region mapping
total_speakers_df.loc[total_speakers_df["Region"].isna()].index.unique(level=0)

In [None]:
# Keep only the rows whose country was mapped to a region
total_speakers_df, l1_speakers_df, l2_speakers_df = (
    speakers.dropna(subset=["Region"])
    for speakers in (total_speakers_df, l1_speakers_df, l2_speakers_df)
)

l1_speakers_df

In [None]:
# Keep only the rows that have a population figure
total_speakers_df, l1_speakers_df, l2_speakers_df = (
    speakers.dropna(subset=["Population"])
    for speakers in (total_speakers_df, l1_speakers_df, l2_speakers_df)
)


### Sum data by region

The idea is to smooth out country level inaccuracies (and infrequent data collection) by grouping countries together.

In [None]:
# Collapse the country rows into one row per (Region, Year) by summing
(
    region_total_speakers_df,
    region_l1_speakers_df,
    region_l2_speakers_df,
) = (
    speakers.groupby(["Region", "Year"]).sum()
    for speakers in (total_speakers_df, l1_speakers_df, l2_speakers_df)
)

region_l1_speakers_df

In [None]:
# Turn the regional sums into ratios via the project helper
(
    ratio_region_total_speakers_df,
    ratio_region_l1_speakers_df,
    ratio_region_l2_speakers_df,
) = (
    utils.regional_numbers_to_ratios(regional_counts)
    for regional_counts in (
        region_total_speakers_df,
        region_l1_speakers_df,
        region_l2_speakers_df,
    )
)

ratio_region_l1_speakers_df.head(20)

# Plot to verify correctness


In [None]:
# Region labels and language codes are read straight off the ratio table
regions = ratio_region_total_speakers_df.index.get_level_values(0).unique()
lang_codes = ratio_region_total_speakers_df.columns

# Display name for each language code
lang_names = {
    "arz": "Arabic",
    "ben": "Bengali",
    "cmn": "Mandarin",
    "deu": "German",
    "eng": "English",
    "fra": "French",
    "hin": "Hindi",
    "ind": "Indonesian",
    "jpn": "Japanese",
    "por": "Portuguese",
    "rus": "Russian",
    "spa": "Spanish",
    "urd": "Urdu",
}

# Colours passed to the plotting helpers (13 entries, matching the 13 names above)
color_list = [
    "tab:blue",
    "tab:orange",
    "tab:green",
    "tab:red",
    "tab:purple",
    "tab:brown",
    "tab:pink",
    "tab:gray",
    "tab:olive",
    "tab:cyan",
    "m",
    "k",
    "lime",
]

In [None]:
# One figure per region for a single speaker type.
# Uncomment one of the blocks below to plot a different type of speaker.

# Total Speakers
for region in regions:
    utils.plot_region(
        region,
        lang_codes,
        lang_names,
        ratio_region_total_speakers_df,
        "Number of speakers / Population",
        "Total Speakers as a Fraction of Population",
        color_list,
    )

# # L1 Speakers
# for region in regions:
#     utils.plot_region(region, lang_codes, lang_names, ratio_region_l1_speakers_df, "Number of speakers / Population", "L1 Speakers as a Fraction of Population", color_list)

# # L2 Speakers
# for region in regions:
#     utils.plot_region(region, lang_codes, lang_names, ratio_region_l2_speakers_df, "Number of speakers / Population", "L2 Speakers as a Fraction of Population", color_list)

# Log Odds

In [None]:
# Log odds of each regional ratio r: log(r / (1 - r)).
#
# The transform is undefined at the edges of [0, 1]:
#   r == 0 -> log(0)   = -inf
#   r == 1 -> log(inf) = +inf
# Both infinities are replaced with NaN so that neither reaches the regression
# step. (A ratio above 1 already comes out of np.log as NaN.)
(
    log_odds_region_total_speakers_df,  # total
    log_odds_region_l1_speakers_df,  # L1
    log_odds_region_l2_speakers_df,  # L2
) = (
    np.log(ratio_df / (1 - ratio_df)).replace([np.inf, -np.inf], np.nan)
    for ratio_df in (
        ratio_region_total_speakers_df,
        ratio_region_l1_speakers_df,
        ratio_region_l2_speakers_df,
    )
)

log_odds_region_l2_speakers_df.head(20)

In [None]:
# Uncomment a block to plot each regional graph for the given type of speaker

# # Total Speakers
# for region in regions:
#     utils.plot_region(region, lang_codes, lang_names, log_odds_region_total_speakers_df, "log(ratio / (1 - ratio))", "Log Odds of Total Speakers as a Fraction of Population", color_list)

# # L1 Speakers
# for region in regions:
#     utils.plot_region(region, lang_codes, lang_names, log_odds_region_l1_speakers_df, "log(ratio / (1 - ratio))", "Log Odds of L1 Speakers as a Fraction of Population", color_list)

# # L2 Speakers
# for region in regions:
#     utils.plot_region(region, lang_codes, lang_names, log_odds_region_l2_speakers_df, "log(ratio / (1 - ratio))", "Log Odds of L2 Speakers as a Fraction of Population", color_list)

## Regression for log odds

In [None]:
# Build the log-odds regression lines for each speaker type via the project helper
(
    regression_total_speakers_df,
    regression_l1_speakers_df,
    regression_l2_speakers_df,
) = (
    utils.create_log_odds_regression_line(log_odds_df, regions, lang_codes)
    for log_odds_df in (
        log_odds_region_total_speakers_df,
        log_odds_region_l1_speakers_df,
        log_odds_region_l2_speakers_df,
    )
)

regression_l2_speakers_df

## Convert back to language speakers

Goal: A table with regional projections per language for 2025, 2030, 2035, 2040, 2045, 2050.

Break it down into baby steps
1. Use the regression lines to calculate values at 2025, 2030, 2035, 2040, 2045, 2050.
2. Undo the log-odds part to get a ratio
3. Use the projections to get a raw number
4. (BONUS) Graphically display which languages have big populations where 

In [None]:
# Extend each regression line into the future. The target years (up to 2050)
# are implicit in the helper and can be changed there if desired.
(
    future_log_odds_region_total_speakers_df,
    future_log_odds_region_l1_speakers_df,
    future_log_odds_region_l2_speakers_df,
) = (
    utils.create_future_log_odds_df(regression_df, regions, lang_codes)
    for regression_df in (
        regression_total_speakers_df,
        regression_l1_speakers_df,
        regression_l2_speakers_df,
    )
)

future_log_odds_region_l2_speakers_df

### Undo Log Odds

Formula: Let $r$ be the ratio (which we want at the end of this step) and $l$ be the log odds (which we have).
$$
r = \frac{e^l}{1+e^l} = \frac{1}{e^{-l}+1}
$$

In [None]:
# Invert the log odds: r = 1 / (1 + exp(-l)), after casting each frame to float
(
    future_ratio_region_total_speakers_df,
    future_ratio_region_l1_speakers_df,
    future_ratio_region_l2_speakers_df,
) = (
    1 / (1 + np.exp(-future_log_odds.astype(float)))
    for future_log_odds in (
        future_log_odds_region_total_speakers_df,
        future_log_odds_region_l1_speakers_df,
        future_log_odds_region_l2_speakers_df,
    )
)

future_ratio_region_l2_speakers_df

### Undo ratios

Using population predictions

In [None]:
# Population projections, indexed by (Country, Year)
pop_proj_df = pd.read_csv("data/population_projections.csv").set_index(
    ["Country", "Year"]
)
pop_proj_df

In [None]:
# Countries that appear in the language data but not in the population projections
lang_countries = total_speakers_df.index.unique(level=0)
pop_proj_countries = pop_proj_df.index.unique(level=0)

lang_countries.difference(pop_proj_countries, sort=None)

In [None]:
# The reverse check: countries only present in the population projections
pop_proj_countries.difference(lang_countries, sort=None)

In [None]:
# # Rename the countries in the population projections to match the language df
# pop_proj_rename_map = {"Bolivia (Plurinational State of)": "Bolivia", "Brunei Darussalam": "Brunei", "Cabo Verde": "Cape Verde Islands", "China, Hong Kong SAR": "Hong Kong", "China, Macao SAR": "Macao", "Curaçao": "Curacao", "Côte d'Ivoire": "Côte d’Ivoire", "Timor-Leste": "East Timor", "Iran (Islamic Republic of)": "Iran", "Lao People's Democratic Republic": "Laos", "Republic of Moldova": "Moldova", "State of Palestine": "Palestine", "Saint Martin (French part)": "Saint Martin", "Sint Maarten (Dutch part)": "Sint Maarten", "Republic of Korea": "South Korea", "Syrian Arab Republic": "Syria", "Sao Tome and Principe": "São Tomé e Príncipe", "United Republic of Tanzania": "Tanzania", "Türkiye": "Turkey", "United States Virgin Islands": "U.S. Virgin Islands", "United States of America": "United States", "Venezuela (Bolivarian Republic of)": "Venezuela", "Viet Nam": "Vietnam"}

# pop_proj_df.index = pop_proj_df.index.map(lambda x: (pop_proj_rename_map[x[0]],x[1]) if x[0] in pop_proj_rename_map else x)

# # Verify that all the countries in the language df are in the population projections df
# pop_proj_countries = pop_proj_df.index.get_level_values(0).unique()
# lang_countries.difference(pop_proj_countries)

In [None]:
# Aggregate the population projections by region:
#   1. tag each row with its region (country name is level 0 of the index),
#   2. sum within each (Region, Year).
# NOTE: pop_proj_df is rebound to the regional table, so level 0 of its index
# becomes Region. Re-run the load cell before re-running this one.
pop_proj_df = (
    pop_proj_df
    .assign(Region=pop_proj_df.index.get_level_values(0).map(country_region_series))
    .groupby(["Region", "Year"])
    .sum()
)

pop_proj_df

In [None]:
# Attach the projected regional population to each future-ratio table
# (total, L1 and L2 speakers, in that order).
(
    future_ratio_region_total_speakers_df,
    future_ratio_region_l1_speakers_df,
    future_ratio_region_l2_speakers_df,
) = (
    future_ratio.assign(Population=pop_proj_df["Population"])
    for future_ratio in (
        future_ratio_region_total_speakers_df,
        future_ratio_region_l1_speakers_df,
        future_ratio_region_l2_speakers_df,
    )
)

future_ratio_region_l2_speakers_df

In [None]:
# Convert the projected ratios into speaker counts via the project helper
# (total, L1 and L2 speakers, in that order).
future_total_speakers_df, future_l1_speakers_df, future_l2_speakers_df = (
    utils.create_future_speakers_df(future_ratio)
    for future_ratio in (
        future_ratio_region_total_speakers_df,
        future_ratio_region_l1_speakers_df,
        future_ratio_region_l2_speakers_df,
    )
)

future_l2_speakers_df.loc["Western Europe"]

# Combine L1 and L2

In [None]:
# Combine the L1 and L2 projections into a single L1+L2 table

# Treat a missing projection as zero speakers (the frames are filled in place)
for projection in (future_l1_speakers_df, future_l2_speakers_df):
    projection.fillna(0, inplace=True)

# Element-wise sum of the two tables. "Population" is then overwritten with the
# L1 frame's column — presumably because the sum would otherwise count it twice.
future_l1_l2_speakers_df = future_l1_speakers_df.add(future_l2_speakers_df).assign(
    Population=future_l1_speakers_df["Population"]
)

future_l1_l2_speakers_df

# Plot the future!!

In [None]:
# Stack the historic regional totals (region_total_speakers_df) on top of the
# L1+L2 projections (future_l1_l2_speakers_df), then order the rows by index.
combined_speakers_df = pd.concat(
    [region_total_speakers_df, future_l1_l2_speakers_df]
).sort_index()

combined_speakers_df.loc["Western Europe"]

In [None]:
# Region names, taken from level 0 of the combined table's index
regions = combined_speakers_df.index.get_level_values(0).unique()

# One projection figure per region
for region in regions:
    utils.plot_proj_region(
        region,
        lang_codes,
        lang_names,
        combined_speakers_df,
        "Number of speakers",
        "L1+L2 Projections",
        color_list,
    )

In [None]:
# Stack the historic regional totals on top of the total-speaker projections.
# The index is sorted so this table has the same row order as
# combined_speakers_df, and so label lookups run on a sorted MultiIndex.
combined_total_speakers_df = pd.concat(
    [region_total_speakers_df, future_total_speakers_df]
).sort_index()

# Take the region names from the table that is actually plotted here, rather
# than from the L1+L2 table built in an earlier cell.
regions = combined_total_speakers_df.index.get_level_values(0).unique()

# One projection figure per region
for region in regions:
    utils.plot_proj_region(
        region,
        lang_codes,
        lang_names,
        combined_total_speakers_df,
        "Number of speakers",
        "Total Projected Speakers",
        color_list,
    )

# Prep Data for Exporting

In [None]:
# Display the combined historic + projected total-speaker table; the export
# cell further down reads its "Population" column.
combined_total_speakers_df

## Types of data

All of this will either be normalized by regional population or not.

1. Raw projections from the total number of speakers
2. Raw projections from L1+L2 (far less accurate, bad methodology)
3. Regional populations
4. Raw projections from total number of speakers, filtering out languages that had less than (1%? 3%? 5%?) in 2020


In [None]:
# Regional populations: the "Population" column of the combined total table
region_pop_df = combined_total_speakers_df["Population"]

# Put the year level ahead of the region level, select the 2050 rows, and
# write them to CSV
region_pop_df.swaplevel(0, 1).loc[2050].to_csv("data/region_populations.csv")