# Ratio Model 

We're going to combine the historic language data with the historic population data by taking the ratio of speakers to population and then the log odds of that ratio. Then we'll project the log odds linearly into the future, undo the log-odds transform, and multiply by the UN population projections to get the projected number of speakers.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import os

from math import nan

## Bring in data

Most of this data was preprocessed by scripts in the folder `data_cleaning_notebooks` with raw data from the `raw_data` folder.

In [None]:
# Read the three cleaned speaker tables; each is indexed by its first two columns.

def _read_speaker_table(filename):
    """Read a CSV from the ``data`` folder and index it by (Country, Year)."""
    csv_path = os.path.join("data", filename)
    return pd.read_csv(csv_path).set_index(["Country", "Year"])


total_speakers_df = _read_speaker_table("cleaned_total_speakers.csv")
l1_speakers_df = _read_speaker_table("cleaned_l1_speakers.csv")
l2_speakers_df = _read_speaker_table("cleaned_l2_speakers.csv")

total_speakers_df

In [None]:
# header=2: the column names sit on the third line of the file (row numbers are 0-based).
pop_csv_path = os.path.join("raw_data", "World_Population_Data.csv")
pop_df = pd.read_csv(pop_csv_path, header=2)

In [None]:
# Columns that describe the indicator rather than hold yearly figures.
metadata_columns = ["Indicator Name", "Indicator Code", "Country Code"]
# pandas labels a header-less CSV column "Unnamed: <position>" (currently
# "Unnamed: 67"). Match on the prefix instead of the exact label so this cell
# keeps working if the file gains or loses a column and the position shifts.
unnamed_columns = [col for col in pop_df.columns if str(col).startswith("Unnamed")]

pop_df = pop_df.drop(columns=metadata_columns + unnamed_columns)
pop_df = pop_df.set_index("Country Name")
# rename index to Country
pop_df.index.names = ["Country"]

# Cast the remaining column labels to int.
# NOTE(review): assumes every remaining label is a year string - confirm against the raw file.
pop_df.columns = pop_df.columns.astype(int)
pop_df

### Check which country names don't align

Different data sources use different country names; next time we'll use ISO country codes instead.

In [None]:
# Distinct country names in each source.
country_level = total_speakers_df.index.get_level_values(0)
lang_countries = country_level.unique()
pop_countries = pop_df.index.unique()

# Number of distinct countries in the language data and in the population data.
(len(lang_countries), len(pop_countries))

In [None]:
# Countries present in the language data but absent from the population data.
lang_only_countries = lang_countries.difference(other=pop_countries)
lang_only_countries

In [None]:
# Countries present in the population data but absent from the language data.
pop_only_countries = pop_countries.difference(other=lang_countries)
pop_only_countries

In [None]:
# Population-data country name -> language-data country name.
pop_to_lang_country_map = {
    "Bahamas, The": "Bahamas",
    "Brunei Darussalam": "Brunei",
    "Cabo Verde": "Cape Verde Islands",
    "Hong Kong SAR, China": "Hong Kong",
    "Macao SAR, China": "Macao",
    "Congo, Rep.": "Congo",
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    "Cote d'Ivoire": "Côte d’Ivoire",
    "Timor-Leste": "East Timor",
    "Egypt, Arab Rep.": "Egypt",
    "Gambia, The": "Gambia",
    "Iran, Islamic Rep.": "Iran",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Lao PDR": "Laos",
    "West Bank and Gaza": "Palestine",
    "St. Kitts and Nevis": "Saint Kitts and Nevis",
    "St. Lucia": "Saint Lucia",
    "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
    "St. Martin (French part)": "Saint Martin",
    "Sint Maarten (Dutch part)": "Sint Maarten",
    "Slovak Republic": "Slovakia",
    "Korea, Rep.": "South Korea",
    "Syrian Arab Republic": "Syria",
    "Sao Tome and Principe": "São Tomé e Príncipe",
    "Turkiye": "Turkey",
    "Virgin Islands (U.S.)": "U.S. Virgin Islands",
    "Venezuela, RB": "Venezuela",
    "Viet Nam": "Vietnam",
    "Yemen, Rep.": "Yemen",
}

# Not included in the mapping
# (presumably these have no counterpart in the population data - TODO confirm):
#   Anguilla; British Indian Ocean Territory; Caribbean Netherlands; Taiwan;
#   Christmas Island; Cocos (Keeling) Islands; Cook Islands; Falkland Islands;
#   French Guiana; Guadaloupe; Guernsey; Martinique; Mayotte; Niue;
#   Norfolk Island; Réunion; Saint Barthélemy; Saint Helena;
#   Saint Helena, Ascension, and Tristan da Cunha; Saint Pierre and Miquelon;
#   Tokelau; Wallis and Futuna; Western Sahara

# Naming issues already fixed upstream in the Ethnologue processing
# (`data_cleaning_notebooks/extract_ethnologue_data.ipynb`):
#   - Czechia: it changed name, which messed up the data
#   - Eswatini: also changed name
#   - Vietnam: changed from "Viet Nam" to "Vietnam"

# Relabel the population index; names without a mapping entry pass through unchanged.
pop_df.index = pop_df.index.map(lambda name: pop_to_lang_country_map.get(name, name))

# Sanity check: language-data countries still unmatched after the rename.
pop_countries = pop_df.index.unique()
lang_countries.difference(pop_countries)

### Combine Population and Language Data



In [None]:
# Move the column labels into a second index level and name the levels
# (Country, Year) to match the speaker tables' index.
stacked_pop_df = pop_df.stack().rename_axis(["Country", "Year"])
stacked_pop_df

In [None]:
# Attach a Population column to each speaker table; `assign` aligns the
# stacked population Series on the tables' index.
total_speakers_df, l1_speakers_df, l2_speakers_df = (
    speakers.assign(Population=stacked_pop_df)
    for speakers in (total_speakers_df, l1_speakers_df, l2_speakers_df)
)

l1_speakers_df

### Country-Region Mapping

In [None]:
regions_csv_path = os.path.join("data", "regions.csv")
country_region_df = pd.read_csv(regions_csv_path).set_index("Country")

# Country -> Region lookup as a Series
region_column = country_region_df["Region"]
country_region_series = pd.Series(region_column, index=country_region_df.index)
country_region_series

In [None]:
# Add a Region column to each speaker table by mapping the 0th index level
# through the country -> region Series.
total_speakers_df, l1_speakers_df, l2_speakers_df = (
    speakers.assign(Region=speakers.index.get_level_values(0).map(country_region_series))
    for speakers in (total_speakers_df, l1_speakers_df, l2_speakers_df)
)

l1_speakers_df

In [None]:
# List the distinct countries whose Region is null.
region_missing = total_speakers_df["Region"].isnull()
total_speakers_df.loc[region_missing].index.get_level_values(0).unique()

In [None]:
# Keep only the rows whose Region is set, in all three speaker tables.
total_speakers_df, l1_speakers_df, l2_speakers_df = (
    speakers.dropna(subset=["Region"])
    for speakers in (total_speakers_df, l1_speakers_df, l2_speakers_df)
)

l1_speakers_df