In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from math import nan

# Import Language Data

In [None]:
# Import the language data. The TSV has no header row, so pandas labels the
# columns 0..11 and they are renamed here.
# Columns 2..10 get integer labels (presumably edition numbers, see the later
# edition -> year mapping -- TODO confirm).
# NOTE(review): column 10 is labelled 25, skipping 24 -- confirm this is intended.
lang_df = (
    pd.read_table("populations.tsv", sep="\t", header=None)
    .rename(columns={0: "Language", 1: "Country", 2:16, 3:17, 4:18, 5:19, 6:20, 7:21, 8:22, 9:23, 10:25, 11:"drop"})
    # The column labelled "drop" was previously never removed, so it leaked into
    # every later reshape as an extra non-numeric column label.
    .drop(columns="drop", errors="ignore")
    # Strip whitespace from the Language and Country columns
    .assign(
        Language=lambda df: df["Language"].str.strip(),
        Country=lambda df: df["Country"].str.strip(),
    )
    # set the index to the first two columns
    .set_index(["Language", "Country"])
)

lang_df

In [None]:
# View the first five (Language, Country) index entries
lang_df.index[:5]

In [None]:
# Spot check: evaluate the raw cell stored for ("eng", "American Samoa") in column 17.
# NOTE(review): eval() executes whatever text the data file contains.
eval(lang_df.loc[("eng", "American Samoa"), 17])

In [None]:
# Grab just the total number of speakers (ie the first number in the tuple)

def extract_total_speakers(cell):
    """Return the total number of speakers encoded in one table cell.

    Parameters
    ----------
    cell : str or number
        Either a number, or text that evaluates to a number or to a tuple
        whose first element is the total.

    Returns
    -------
    number
        ``0`` for a zero or NaN cell, the cell itself when it is already
        numeric, the first element when the text evaluates to a tuple, and
        the evaluated value otherwise.
    """
    # Already-numeric cells (e.g. a column pandas parsed as numbers) must not
    # reach eval(), which only accepts text; missing values count as 0.
    if isinstance(cell, (int, float, np.number)):
        if cell == 0 or (isinstance(cell, (float, np.floating)) and np.isnan(cell)):
            return 0
        return cell

    # NOTE(review): eval() runs arbitrary Python taken from the data file.
    # It is kept because cells may reference names available in the notebook
    # namespace (the file imports `nan` from math), which ast.literal_eval
    # would reject -- replace with a dedicated parser if the input is untrusted.
    parsed = eval(cell)
    if isinstance(parsed, tuple):
        return parsed[0]
    return parsed
    

# Create a new df where each cell has just the total number of speakers.
# Mapping each column with Series.map is the element-wise equivalent of
# DataFrame.applymap, which newer pandas versions deprecate.
total_speakers_df = lang_df.apply(lambda column: column.map(extract_total_speakers))
total_speakers_df

# Import Population Data

In [None]:
# Load the population table; header=2 makes pandas take the column names from
# row index 2 (zero-based) of the CSV instead of the first row.
pop_df = pd.read_csv(
    "World_Population_Data.csv",
    header=2,
)

In [None]:
# Drop the descriptive columns, index the table by country and call that
# index "Country".
# NOTE: this cell overwrites its own input, so running it twice fails.
pop_df = (
    pop_df
    .drop(columns=["Indicator Name", "Indicator Code", "Country Code", "Unnamed: 67"])
    .set_index("Country Name")
    .rename_axis("Country")
)

# The remaining column labels are converted to integers
pop_df.columns = pop_df.columns.astype(int)

In [None]:
# Display the cleaned population table
pop_df

# Lang/pop v time

In [None]:
# Maps each edition number used as a column label to a year
# (presumably the edition's publication year -- TODO confirm).
edition_to_year = {
    13: 1996,
    14: 2000,
    15: 2005,
    16: 2009,
    17: 2014,
    18: 2015,
    19: 2016,
    20: 2017,
    21: 2018,
    22: 2019,
    23: 2020,
    24: 2021,
    25: 2022,
}

# Relabel the columns with years, move those labels into the row index, then
# spread the "Language" level out into columns.
# NOTE: this cell overwrites its own input, so running it twice fails.
total_speakers_df = (
    total_speakers_df
    .rename(columns=edition_to_year)
    .stack()
    .unstack("Language")
)

# name the two index levels; the second one is "Year"
total_speakers_df.index.names = ["Country", "Year"]

total_speakers_df

In [None]:
# Show the names of the index levels of total_speakers_df
total_speakers_df.index.names

### Check which country names align

In [None]:
# Distinct country names on each side: the first index level of the speakers
# table, and the index of the population table.
lang_countries = total_speakers_df.index.unique(level=0)
pop_countries = pop_df.index.unique()

In [None]:
# Find all the countries that appear in the language df but not in the population df
lang_only_countries = lang_countries.difference(pop_countries)
lang_only_countries

In [None]:
# Find all the countries that appear in the population df but not in the language df
pop_only_countries = pop_countries.difference(lang_countries)
pop_only_countries

Resolving country differences

- Some will be easy, like `Palestine` vs `West Bank and Gaza`.
- Perhaps the World Bank data can be used for regional populations, since it appears to include regional entries as well as countries.
- As of the morning of 1/7/24, none of these naming differences have been taken into account in the analysis below.

In [None]:
# Reshape the wide population table into a Series with one value per
# (Country, Year) index pair.
stacked_pop_df = pop_df.stack().rename_axis(["Country", "Year"])
stacked_pop_df

In [None]:
# Add a "Population" column; assign() aligns stacked_pop_df to the frame's
# index, so rows without a matching entry get NaN.
total_speakers_df = total_speakers_df.assign(Population=stacked_pop_df)
total_speakers_df

In [None]:
# Create a new df with the fraction of the population that speaks each language.
# Every column is divided row by row (axis=0) by the "Population" column in a
# single vectorised call instead of a Python loop over the columns.
# The "Population" column is kept as well and becomes population / population.
ratio_total_speakers_df = total_speakers_df.div(total_speakers_df["Population"], axis=0)

ratio_total_speakers_df

In [None]:
# Countries grouped as Northern Europe.
# Note: Faroe Islands is not included.
populations_northern_europe = [
    "Finland",
    "Isle of Man",
    "Norway",
    "Denmark",
    "Sweden",
    "Estonia",
    "Iceland",
    "Latvia",
    "United Kingdom",
    "Ireland",
    "Lithuania",
]

In [None]:
# ne= total_speakers_df.loc[populations_northern_europe]

In [None]:
def plot_region(populations, name, data=None):
    """Plot one series per entry of ``populations`` on a shared axis.

    Parameters
    ----------
    populations : iterable
        Column labels to plot; each one becomes a labelled series.
    name : str
        Text used as the figure title.
    data : optional
        Table to plot from: its index is used for the x axis and each entry
        of ``populations`` is looked up as a column. When omitted, the
        notebook-level ``df_plotting`` is used, as before.
        (Assumed to be a DataFrame indexed by year -- TODO confirm.)

    Notes
    -----
    Values are divided by one million before plotting. A label that is
    missing from the table makes the column lookup fail.
    """
    # Explicit argument wins; otherwise fall back to the global the original
    # implementation relied on.
    frame = df_plotting if data is None else data
    per_million = 10**6

    fig, ax = plt.subplots(figsize=(13, 4))

    for country in populations:
        values = frame[country] / per_million
        ax.scatter(frame.index, values, label=country)
        ax.plot(frame.index, values)  # remove this line to remove the lines connecting the points

    # move the legend outside the plot
    # https://builtin.com/data-science/matplotlib-legend-outside-plot
    pos = ax.get_position()
    ax.set_position([pos.x0, pos.y0, pos.width * 0.9, pos.height])
    ax.legend(loc='center right', bbox_to_anchor=(1.35, 0.5))

    fig.suptitle(name)
    ax.set_ylabel("Population (millions)")
    ax.set_xlabel("Year")
    plt.show()
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)