In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from math import nan

# Import Language Data

In [None]:
# Load the headerless, tab-separated language table and tidy it in one pass:
#   * columns 0 and 1 become "Language" and "Country",
#   * columns 2-10 are relabelled with integer edition numbers (16-23, then 25),
#   * column 11 is labelled "drop" so it can be discarded later,
#   * surrounding whitespace is stripped from the two key columns,
#   * (Language, Country) becomes the row index.
# NOTE(review): the edition labels jump from 23 to 25 -- confirm that edition 24
# really is absent from populations.tsv.
lang_df = (
    pd.read_table("populations.tsv", sep="\t", header=None)
    .rename(
        columns={
            0: "Language",
            1: "Country",
            2: 16,
            3: 17,
            4: 18,
            5: 19,
            6: 20,
            7: 21,
            8: 22,
            9: 23,
            10: 25,
            11: "drop",
        }
    )
    .assign(
        Language=lambda frame: frame["Language"].str.strip(),
        Country=lambda frame: frame["Country"].str.strip(),
    )
    .set_index(["Language", "Country"])
)

lang_df

In [None]:
# view the first few index entries; each one is a (Language, Country) tuple
lang_df.index[:5]

In [None]:
# Spot-check one raw cell (edition column 17, English in American Samoa).
# The cell is evaluated as a Python expression to recover the stored object.
# NOTE(review): eval() executes whatever text is in the file -- only safe if
# populations.tsv is trusted.
eval(lang_df[17][("eng", "American Samoa")])

In [None]:
# Grab just the total number of speakers (ie the first number in the tuple)

def extract_total_speakers(cell):
    """Return the total-speaker figure stored in one raw table cell.

    Parameters
    ----------
    cell : number or str
        ``0`` or NaN for an empty cell; otherwise a string containing a Python
        expression that evaluates to either a single value or a tuple whose
        first element is the total number of speakers.

    Returns
    -------
    ``0`` for empty (0 / NaN) cells, the first element when the evaluated cell
    is a tuple, and the evaluated value itself otherwise.
    """
    if cell == 0:
        return 0

    # np.isnan only accepts numeric input; for the string cells it raises
    # TypeError, which simply means "not NaN". Catch only that, so unrelated
    # failures (and KeyboardInterrupt) are no longer silently swallowed.
    try:
        if np.isnan(cell):
            return 0
    except TypeError:
        pass

    # SECURITY NOTE: eval() runs arbitrary code found in the cell, so the input
    # file must be trusted. It is kept (rather than ast.literal_eval) because
    # the expression is resolved against module globals such as the imported
    # ``nan`` -- presumably some cells spell missing values that way; confirm
    # before tightening this.
    eval_cell = eval(cell)
    if isinstance(eval_cell, tuple):
        return eval_cell[0]
    return eval_cell
    

# Reduce every cell to its total-speaker figure, then discard the placeholder
# column that was labelled "drop" when the table was loaded.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map -- switch once the minimum pandas version allows it.
total_speakers_df = lang_df.applymap(extract_total_speakers).drop(columns="drop")
total_speakers_df

# Import Population Data

In [None]:
# Read the population CSV; header=2 uses the third line of the file as the
# column header row and ignores the lines before it.
pop_df = pd.read_csv("World_Population_Data.csv", header=2)

In [None]:
# Strip the descriptive columns, key the table by country name, and call the
# index "Country".
# NOTE: this cell rewrites pop_df from itself, so it cannot be re-run without
# re-running the read_csv cell first.
pop_df = (
    pop_df
    .drop(columns=["Indicator Name", "Indicator Code", "Country Code", "Unnamed: 67"])
    .set_index("Country Name")
    .rename_axis(index="Country")
)

# Convert the remaining column labels to integers
pop_df.columns = pop_df.columns.astype(int)

In [None]:
# Display the tidied population table (rows: Country, columns: integer labels)
pop_df

# Lang/pop v time

In [None]:
# Year associated with each edition number used as a column label
edition_to_year = {
    13: 1996,
    14: 2000,
    15: 2005,
    16: 2009,
    17: 2014,
    18: 2015,
    19: 2016,
    20: 2017,
    21: 2018,
    22: 2019,
    23: 2020,
    24: 2021,
    25: 2022,
}

# Relabel the edition columns as years, move the years into the row index, and
# spread the languages out as columns: one row per (Country, Year).
total_speakers_df = (
    total_speakers_df
    .rename(columns=edition_to_year)
    .stack()
    .unstack("Language")
    .rename_axis(index=["Country", "Year"])  # the stacked level has no name yet
)

total_speakers_df

In [None]:
# names of the total_speakers_df index levels (sanity check after the reshape)
total_speakers_df.index.names

### Check which country names align

In [None]:
# Distinct country names on each side, compared in the following cells
lang_countries = total_speakers_df.index.get_level_values("Country").unique()
pop_countries = pop_df.index.unique()

In [None]:
# find all the countries that only appear in the language df
# (Index.difference: names in lang_countries that are absent from pop_countries)
lang_only_countries = lang_countries.difference(pop_countries)
lang_only_countries

In [None]:
# find all the countries that only appear in the population df
# (Index.difference: names in pop_countries that are absent from lang_countries)
pop_only_countries = pop_countries.difference(lang_countries)
pop_only_countries

Resolving country differences

- Some will be easy to reconcile, like `Palestine` vs `West Bank and Gaza`.
- The World Bank data also appears to include regional aggregates, so it could perhaps be used for regional populations.
- As of the morning of 1/7/24, none of this has been taken into account.

In [None]:
# Reshape the wide population table into a long Series keyed by (Country, Year)
stacked_pop_df = pop_df.stack().rename_axis(index=["Country", "Year"])
stacked_pop_df

In [None]:
# Attach the population as a new column. assign() aligns stacked_pop_df on the
# (Country, Year) index, so any row whose country name or year has no match in
# the population data ends up with NaN in "Population".
total_speakers_df = total_speakers_df.assign(Population=stacked_pop_df)
total_speakers_df

In [None]:
# Load the country -> region lookup, keyed by country name
country_region_df = pd.read_csv("region.csv").set_index("Country")

# Keep just the "Region" column as a Series (index: Country, values: Region)
country_region_series = country_region_df["Region"].copy()
country_region_series

In [None]:
# Look up each row's region from the "Country" level of its index and store it
# in a new "Region" column (NaN where the country is missing from the lookup).
total_speakers_df = total_speakers_df.assign(
    Region=lambda frame: frame.index.get_level_values("Country").map(country_region_series)
)
total_speakers_df

In [None]:
# Which countries have no region?
# (rows whose "Region" is null, reduced to the distinct names in index level 0)
total_speakers_df[total_speakers_df["Region"].isnull()].index.get_level_values(0).unique()

### Dropped countries w/o regions for now
We should take more care with this later

In [None]:
# Keep only the rows that were matched to a region
total_speakers_df = total_speakers_df.dropna(subset=["Region"])
total_speakers_df

## Sum data by region

In [None]:
# Total every column (each language and "Population") per (Region, Year).
# NOTE(review): sum() skips NaN, so a country whose "Population" is NaN still
# adds its speakers to the regional totals without adding any population --
# this looks like it can inflate the ratios computed below; confirm once the
# country-name mismatches are resolved.
region_total_speakers_df = total_speakers_df.groupby(["Region", "Year"]).sum()
region_total_speakers_df

In [None]:
# Fraction of each region's population that speaks each language:
# divide every language column row-by-row by that row's "Population".
ratio_region_total_speakers_df = (
    region_total_speakers_df
    .drop(columns="Population")  # the population itself is not a language
    .div(region_total_speakers_df["Population"], axis=0)
)

ratio_region_total_speakers_df.head(20)

## Plot!!

In [None]:
def plot_region(region, lang_codes, lang_names, df, y_axis_label, plot_title, color_list):
    """Plot one line-and-marker series per language for a single region.

    Parameters
    ----------
    region : label
        Key passed to ``df.loc`` to select the region's rows.
    lang_codes : iterable
        Column labels of ``df`` to plot, one series each.
    lang_names : dict
        Maps a language code to its legend label. Codes missing from the
        mapping are labelled with the code itself.
    df : pandas.DataFrame
        Frame for which ``df.loc[region]`` yields the x values in its index
        and one column per language code.
    y_axis_label : str
        Label for the y axis.
    plot_title : str
        Text appended to the region name in the figure title.
    color_list : sequence
        Matplotlib colours; reused cyclically when there are more languages
        than colours.

    Returns
    -------
    None. The figure is shown and then closed.
    """
    fig, ax = plt.subplots(figsize=(10, 4))

    regional_df = df.loc[region]

    for i, lang in enumerate(lang_codes):
        # Wrap around instead of raising IndexError on long language lists
        color = color_list[i % len(color_list)]
        # Fall back to the raw code instead of raising KeyError
        label = lang_names.get(lang, lang)

        ax.scatter(regional_df.index, regional_df[lang], label=label, color=color)
        # remove this line to remove the lines connecting the points
        ax.plot(regional_df.index, regional_df[lang], color=color)

    # move the legend outside the plot
    # https://builtin.com/data-science/matplotlib-legend-outside-plot
    pos = ax.get_position()
    ax.set_position([pos.x0, pos.y0, pos.width * 0.9, pos.height])
    ax.legend(loc='center right', bbox_to_anchor=(1.35, 0.5))

    fig.suptitle(f"{region} {plot_title}")
    ax.set_ylabel(f"{y_axis_label}")
    ax.set_xlabel("Year")
    plt.show()
    # Close this specific figure so repeated calls do not accumulate figures
    plt.close(fig)

In [None]:
# Language codes available as columns (used as lang_codes for plotting below)
ratio_region_total_speakers_df.columns

In [None]:
# Everything the plots iterate over: region labels and language codes
regions = ratio_region_total_speakers_df.index.get_level_values(0).unique()
lang_codes = ratio_region_total_speakers_df.columns

# Legend label for each language code
lang_names = {
    "arz": "Arabic",
    "ben": "Bengali",
    "cmn": "Mandarin",
    "deu": "German",
    "eng": "English",
    "fra": "French",
    "hin": "Hindi",
    "ind": "Indonesian",
    "jpn": "Japanese",
    "por": "Portuguese",
    "rus": "Russian",
    "spa": "Spanish",
    "urd": "Urdu",
}

# One colour per language, in column order
color_list = [
    "tab:blue",
    "tab:orange",
    "tab:green",
    "tab:red",
    "tab:purple",
    "tab:brown",
    "tab:pink",
    "tab:gray",
    "tab:olive",
    "tab:cyan",
    "m",
    "k",
    "lime",
]

# One figure per region
for region in regions:
    plot_region(
        region,
        lang_codes,
        lang_names,
        ratio_region_total_speakers_df,
        "Number of speakers / Population",
        "Total Speakers as a Fraction of Population",
        color_list,
    )

# Log Odds version

In [None]:
# Log odds of each ratio r: log(r / (1 - r)).
#   r == 0 -> -inf, r == 1 -> +inf, r > 1 -> NaN (log of a negative number).
# Both infinities are replaced with NaN: the regression step only filters NaN,
# so a single +inf would otherwise turn that whole fit into NaN.
log_odds_region_total_speakers_df = np.log(
    ratio_region_total_speakers_df / (1 - ratio_region_total_speakers_df)
).replace([np.inf, -np.inf], np.nan)

log_odds_region_total_speakers_df.head(20)

In [None]:
# export log odds table to csv
# (written to the working directory; an existing file of that name is overwritten)
log_odds_region_total_speakers_df.to_csv("draft_log_odds_data.csv")

In [None]:
# One log-odds figure per region
for region in regions:
    plot_region(
        region,
        lang_codes,
        lang_names,
        log_odds_region_total_speakers_df,
        "log(ratio / (1 - ratio))",
        "Log Odds of Total Speakers as a Fraction of Population",
        color_list,
    )

# Regression for Log Odds

In [None]:
def only_non_nan(x, y): 
    """Keep only the (x, y) pairs in which neither value is NaN.

    Pairs are formed positionally with ``zip``. Returns two lists of equal
    length; when no pair survives, both lists are ``[np.nan]`` so callers
    always receive at least one element.
    """
    kept_pairs = [
        (x_val, y_val)
        for x_val, y_val in zip(x, y)
        if not (np.isnan(x_val) or np.isnan(y_val))
    ]

    # Nothing usable: hand back a single NaN placeholder for each vector
    if not kept_pairs:
        return [np.nan], [np.nan]

    kept_x, kept_y = zip(*kept_pairs)
    return list(kept_x), list(kept_y)

In [None]:
# Fit a straight line (log odds vs. year) for each (region, language) pair and
# store slope, intercept, r-value, p-value and standard error in a dataframe.
# The frame starts out all-NaN (float), so pairs that cannot be fitted simply
# keep NaN in every column.
regression_total_speakers_df = pd.DataFrame(
    index=pd.MultiIndex.from_product([regions, lang_codes]),
    columns=["slope", "intercept", "r_value", "p_value", "std_err"],
    dtype=float,
)

for region in regions:
    # Slice the region once; its index holds the x values shared by all languages
    regional_log_odds = log_odds_region_total_speakers_df.loc[region]

    for lang in lang_codes:
        x, y = only_non_nan(regional_log_odds.index, regional_log_odds[lang])

        # A line needs at least two points. This also covers the single-NaN
        # placeholder that only_non_nan returns when no data survive, instead
        # of relying on linregress producing NaN from degenerate input.
        if len(x) < 2:
            continue

        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

        regression_total_speakers_df.loc[(region, lang)] = [slope, intercept, r_value, p_value, std_err]

In [None]:
# Display the fitted parameters, one row per (region, language) pair
regression_total_speakers_df

## Plot with the regression lines 

In [None]:
def plot_region_regression(region, lang_codes, lang_names, df, regress_df, y_axis_label, plot_title, color_list, future_years=[2023,2050]):
    """Plot each language's series for one region plus its extrapolated fit.

    Parameters
    ----------
    region : label
        Key passed to ``df.loc`` to select the region's rows.
    lang_codes : iterable
        Column labels of ``df`` to plot, one series each.
    lang_names : dict
        Maps a language code to its legend label. Codes missing from the
        mapping are labelled with the code itself.
    df : pandas.DataFrame
        Frame for which ``df.loc[region]`` yields the x values in its index
        and one column per language code.
    regress_df : pandas.DataFrame
        Indexed by ``(region, lang)``; each row unpacks into five values, of
        which the first two are used as slope and intercept.
    y_axis_label : str
        Label for the y axis.
    plot_title : str
        Text appended to the region name in the figure title.
    color_list : sequence
        Matplotlib colours; reused cyclically when there are more languages
        than colours.
    future_years : sequence of numbers, optional
        x values at which the fitted line is evaluated and drawn (dashed).
        The default list is only read, never modified.

    Returns
    -------
    None. The figure is shown and then closed.
    """
    fig, ax = plt.subplots(figsize=(10, 4))

    regional_df = df.loc[region]
    # The x values of the dashed line are the same for every language
    x = np.array(future_years)

    for i, lang in enumerate(lang_codes):
        # Wrap around instead of raising IndexError on long language lists
        color = color_list[i % len(color_list)]
        # Fall back to the raw code instead of raising KeyError
        label = lang_names.get(lang, lang)

        ax.scatter(regional_df.index, regional_df[lang], label=label, color=color)
        # remove this line to remove the lines connecting the points
        ax.plot(regional_df.index, regional_df[lang], color=color)

        # add the regression line; only slope and intercept are needed here
        slope, intercept, _r_value, _p_value, _std_err = regress_df.loc[(region, lang)]
        y = slope * x + intercept
        ax.plot(x, y, color=color, linestyle="dashed")

    # move the legend outside the plot
    # https://builtin.com/data-science/matplotlib-legend-outside-plot
    pos = ax.get_position()
    ax.set_position([pos.x0, pos.y0, pos.width * 0.9, pos.height])
    ax.legend(loc='center right', bbox_to_anchor=(1.35, 0.5))

    fig.suptitle(f"{region} {plot_title}")
    ax.set_ylabel(f"{y_axis_label}")
    ax.set_xlabel("Year")
    plt.show()
    # Close this specific figure so repeated calls do not accumulate figures
    plt.close(fig)

In [None]:
# One figure per region: observed log odds plus the dashed regression line
for region in regions:
    plot_region_regression(
        region,
        lang_codes,
        lang_names,
        log_odds_region_total_speakers_df,
        regression_total_speakers_df,
        "log(ratio / (1 - ratio))",
        "Log Odds of Total Speakers as a Fraction of Population",
        color_list,
    )

# Convert Back to Language Speakers

Goal: A table with regional projections per language for 2025, 2030, 2035, 2040, 2045, 2050.

Break it down into baby steps
1. Use the regression lines to calculate values at 2025, 2030, 2035, 2040, 2045, 2050.
2. Undo the log-odds part to get a ratio
3. Multiply each projected ratio by the projected regional population to get a raw number of speakers
4. (BONUS) Graphically display which languages have big populations where 

## Regression numbers per 5 years

In [None]:
# Evaluate every fitted line at the projection years.
# Result: rows are (region, future year), columns are language codes.

future_years = [2025, 2030, 2035, 2040, 2045, 2050]

future_log_odds_region_total_speakers_df = pd.DataFrame(
    index=pd.MultiIndex.from_product([regions, future_years]),
    columns=lang_codes,
)

# The evaluation points are the same for every (region, language) pair
x = np.array(future_years)

for region in regions:
    for lang in lang_codes:
        # Only the first two of the five stored values are needed
        slope, intercept, *_ = regression_total_speakers_df.loc[(region, lang)]
        y = slope * x + intercept
        # Fill all of this region's future-year rows for this language
        future_log_odds_region_total_speakers_df.loc[(region, slice(None)), lang] = y

future_log_odds_region_total_speakers_df

## Undo Log Odds 

Formula: Let $r$ be the ratio (which we want at the end of this step) and $l$ be the log odds (which we have).
$$
r = \frac{e^l}{1+e^l} = \frac{1}{e^{-l}+1}
$$

In [None]:
# Convert log odds back to ratios with the inverse logit, r = 1 / (1 + e^(-l)).
# The frame was filled cell by cell, so it is cast to float first.
# NaN log odds are not skipped: they simply stay NaN in the result.

future_ratio_region_total_speakers_df = (
    future_log_odds_region_total_speakers_df
    .astype(float)
    .pipe(lambda log_odds: 1 / (1 + np.exp(-log_odds)))
)
future_ratio_region_total_speakers_df

# Undo ratios
Using population predictions

In [None]:
# Load the population projections, keyed by (Country, Year)
pop_proj_df = pd.read_csv("population_projections.csv").set_index(["Country", "Year"])
pop_proj_df

In [None]:
# Aggregate the population projections by region:
#   1. look up each row's region from the "Country" level of its index,
#   2. sum the projections per (Region, Year).
# NOTE: pop_proj_df is rebuilt from itself, so re-running this cell requires
# re-running the loading cell first.
pop_proj_df = (
    pop_proj_df
    .assign(
        Region=lambda frame: frame.index.get_level_values("Country").map(country_region_series)
    )
    .groupby(["Region", "Year"])
    .sum()
)

pop_proj_df

In [None]:
# Add the population projections to the future_ratio_region_total_speakers_df
# assign() aligns the projections on the (region, year) index; any (region, year)
# row without a matching projection gets NaN in "Population".
future_ratio_region_total_speakers_df = future_ratio_region_total_speakers_df.assign(Population=pop_proj_df["Population"])
future_ratio_region_total_speakers_df

In [None]:
# Get the total speakers: projected ratio x projected population, row by row.
#
# `frame * series` would align the Series index with the frame's *columns*
# rather than its rows, so the multiplication is done with .mul(..., axis=0).
# "Population" is excluded from the product and re-attached unchanged, so it is
# not squared and needs no square-root fix afterwards.
future_total_speakers_df = (
    future_ratio_region_total_speakers_df
    .drop(columns="Population")
    .mul(future_ratio_region_total_speakers_df["Population"], axis=0)
    .assign(Population=future_ratio_region_total_speakers_df["Population"])
)