# Explore historical language data for top languages

TODO: Come up with a better method to pre-process the data once we have the large data set (probably: one person creates the file and emails it to the others, and that file type is git-ignored).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy import stats

from math import nan

## Import data

1. Import the spreadsheet (downloaded from google sheets)
2. Re-index appropriately, and drop columns with no data
3. Separate out L1 and L2, where possible

In [None]:
# Load the manually collected speaker data from the tab-separated export
data_path = "misc_data/manual-data-nov5.tsv"
df = pd.read_csv(data_path, sep="\t")

df

In [None]:
# Remove the "1996" column from the table
df = df.drop("1996", axis="columns")

In [None]:
# The first column of the sheet has no header ("Unnamed: 0") and holds the
# language names: give it a proper name and use it as the row index.
df = df.rename(columns={"Unnamed: 0": "Language"}).set_index("Language")

# The remaining column labels are converted from strings to integers
df.columns = df.columns.astype(int)

df

In [None]:
# Separate out L1 and L2 and total

# Create a little function to parse each cell
def separate_values(entry):
    """Split one spreadsheet cell into L1, L2 and total speaker counts.

    Accepted cell formats (spaces and "," thousands separators are ignored):

    * ``"<l1>;"``     -- only L1 speakers are given; total equals L1.
    * ``"<total>"``   -- only the total number of speakers is given.
    * ``"<l1>;<l2>"`` -- both are given; total is their sum.
    * NaN or a blank string -- no data for this cell.

    Args:
        entry: Cell content, either a string or NaN.

    Returns:
        A tuple ``(l1, l2, total)``. Each element is an ``int``, or
        ``np.nan`` when that figure is not present in the cell.

    Raises:
        TypeError: If ``entry`` is neither a string nor NaN.
        ValueError: If a field is not an integer, or the cell contains more
            than two ";"-separated fields.
    """
    if not isinstance(entry, str):
        if np.isnan(entry):
            return np.nan, np.nan, np.nan
        raise TypeError("Entry is not a string or NaN")

    entry = entry.replace(' ', '')

    # A blank cell carries no data; treat it like a missing value instead of
    # failing on an index or integer-conversion error.
    if not entry.strip():
        return np.nan, np.nan, np.nan

    def parse_count(text):
        # Drop thousands separators before converting, e.g. "123,456".
        return int(text.replace(",", ""))

    fields = entry.split(";")

    if len(fields) > 2:
        # Previously this case fell through and returned None, which only
        # surfaced later as a confusing unpacking error in the caller.
        raise ValueError(
            f"Expected at most two ';'-separated values, got {entry!r}"
        )

    if len(fields) == 1:
        # only total number of speakers specified
        return np.nan, np.nan, parse_count(fields[0])

    l1_text, l2_text = fields
    l1 = parse_count(l1_text)

    if not l2_text:
        # trailing ";": only l1 speakers specified
        return l1, np.nan, l1

    # l1 and l2 speakers specified
    l2 = parse_count(l2_text)
    return l1, l2, l1 + l2

In [None]:
# Scratch check: strip the spaces out of a sample "l1; l2" cell
sample_cell = "123,456; 789,012"
entry = sample_cell.replace(' ', '')
entry

In [None]:
# Run the parser on one sample of each cell format and print the results
sample_cells = ["1,2; 2,3", "1;", "1", np.nan, "123,456; 789,012"]
for cell in sample_cells:
    print(separate_values(cell))


In [None]:
# One empty frame per measure, each with the same row and column labels as df
L1, L2, total = (
    pd.DataFrame(index=df.index, columns=df.columns) for _ in range(3)
)

# Parse every cell and store its three components in the matching frames
for language, row in df.iterrows():
    for year, cell in row.items():
        l1_count, l2_count, total_count = separate_values(cell)
        L1.loc[language, year] = l1_count
        L2.loc[language, year] = l2_count
        total.loc[language, year] = total_count

# All Modern Standard Arabic speakers are L2 speakers
L2.loc["Modern Standard Arabic", :] = total.loc["Modern Standard Arabic", :]


In [None]:
# Display the L1 table for a visual check
L1

In [None]:
# Display the L2 table for a visual check
L2

In [None]:
# Display the table of total speakers for a visual check
total

## Plot data 

Just to get an overview of L1, L2 and total

In [None]:
# Transpose so that languages become the columns: each language's series can
# then be selected by column label.
L1_plotting, L2_plotting, total_plotting = L1.T, L2.T, total.T

languages = L1_plotting.columns

In [None]:
# Display the transposed L1 table (languages as columns)
L1_plotting

In [None]:
# Choose the number of languages to plot: the plots below use the first
# `num_languages` entries of `languages`.
# (Original note: top based on 2023 data -- TODO confirm the sheet is sorted that way.)
num_languages = 15

In [None]:
# Overview plot: L1 speakers (in millions) for each selected language

fig, ax = plt.subplots(figsize=(10, 4))

for position in range(num_languages):
    lang_name = languages[position]
    speakers_millions = L1_plotting[lang_name] / 10**6
    ax.scatter(L1_plotting.index, speakers_millions, label=lang_name)
    ax.plot(L1_plotting.index, speakers_millions)  # remove this line to remove the lines connecting the points

# Shrink the axes and put the legend to the right of them
# https://builtin.com/data-science/matplotlib-legend-outside-plot
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.9, box.height])
ax.legend(loc='center right', bbox_to_anchor=(1.35, 0.5))

fig.suptitle("L1 Speakers")
ax.set_ylabel("Number of speakers (millions)")
ax.set_xlabel("Year")
plt.show()
plt.close()

In [None]:
# Compute the per-language coefficient of variation of L1 speakers:
# the standard deviation across the columns of L1 divided by the mean across them
L1_cv = L1.std(axis = 1) / L1.mean(axis = 1)
L1_cv

In [None]:
# Overview plot: L2 speakers (in millions) for each selected language

fig, ax = plt.subplots(figsize=(10, 4))

for position in range(num_languages):
    lang_name = languages[position]
    speakers_millions = L2_plotting[lang_name] / 10**6
    ax.scatter(L2_plotting.index, speakers_millions, label=lang_name)
    ax.plot(L2_plotting.index, speakers_millions)  # remove this line to remove the lines connecting the points

# Shrink the axes and put the legend to the right of them
# https://builtin.com/data-science/matplotlib-legend-outside-plot
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.9, box.height])
ax.legend(loc='center right', bbox_to_anchor=(1.35, 0.5))

fig.suptitle("L2 Speakers")
ax.set_ylabel("Number of speakers (millions)")
ax.set_xlabel("Year")
plt.show()
plt.close()

In [None]:
# Overview plot: total speakers (in millions) for each selected language

fig, ax = plt.subplots(figsize=(10, 4))

for position in range(num_languages):
    lang_name = languages[position]
    speakers_millions = total_plotting[lang_name] / 10**6
    ax.scatter(total_plotting.index, speakers_millions, label=lang_name)
    ax.plot(total_plotting.index, speakers_millions)  # remove this line to remove the lines connecting the points

# Shrink the axes and put the legend to the right of them
# https://builtin.com/data-science/matplotlib-legend-outside-plot
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.9, box.height])
ax.legend(loc='center right', bbox_to_anchor=(1.35, 0.5))

fig.suptitle("Total Number of Speakers")
ax.set_ylabel("Number of speakers (millions)")
ax.set_xlabel("Year")
plt.show()
plt.close()


## Linear Model

Use linear regression to find a line of best fit for each language, then extend that line into the future (the plots below project out to 2050).

In [None]:
# Sanity check of stats.linregress on a small, nearly linear data set

x = list(range(1, 6))
y = [1.1, 1.9, 3.2, 4.1, 5.2]

fit = stats.linregress(x, y)
slope, intercept, r_value, p_value, std_err = fit
slope

In [None]:
def only_non_nan(x, y):
    """Drop every (x, y) pair in which either value is NaN.

    Used to prepare two equally ordered vectors for linear regression.

    Returns:
        Two lists ``(new_x, new_y)`` holding the surviving pairs in their
        original order. If no pair survives, ``([np.nan], [np.nan])`` is
        returned instead of two empty lists.
    """
    kept_pairs = [
        (x_val, y_val)
        for x_val, y_val in zip(x, y)
        if not (np.isnan(x_val) or np.isnan(y_val))
    ]

    if not kept_pairs:
        # nothing usable: hand back a single NaN point
        return [np.nan], [np.nan]

    new_x = [pair[0] for pair in kept_pairs]
    new_y = [pair[1] for pair in kept_pairs]
    return new_x, new_y

In [None]:
# Check that linear regression works once missing values are filtered out

x = [1, np.nan, 3, 4, 5]
y = [1.1, 1.9, np.nan, 4.1, 5.2]

# Alternative input in which no pair is usable:
# x = [np.nan, np.nan, np.nan, np.nan, np.nan]
# y = [np.nan, np.nan, np.nan, np.nan, np.nan]

# Pass x and y as two separate arguments. Handing linregress the (x, y) tuple
# as a single argument relies on it inferring the two variables from a 2-row
# array, which is deprecated in recent SciPy releases.
new_x, new_y = only_non_nan(x, y)
slope, intercept, r_value, p_value, std_err = stats.linregress(new_x, new_y)
slope

In [None]:
# Use stats.linregress to find a line of best fit with the data we have


def _fit_regression_lines(df_plotting):
    """Fit a straight line to each language's column of ``df_plotting``.

    The frame's index is used as x and each language column as y; pairs with
    a missing value are removed by ``only_non_nan`` before fitting.

    Args:
        df_plotting: Frame with one column per entry of ``languages``.

    Returns:
        dict mapping each language to ``[slope, intercept]``.
    """
    x = df_plotting.index.to_list()
    regression_lines = {}

    for language in languages:
        y = df_plotting[language].to_list()
        # keep only the points where both x and y are present
        new_x, new_y = only_non_nan(x, y)

        slope, intercept, *_ = stats.linregress(new_x, new_y)
        regression_lines[language] = [slope, intercept]

    return regression_lines


# Dictionaries storing the slope and intercept for each language
L1_regression_lines = _fit_regression_lines(L1_plotting)
L2_regression_lines = _fit_regression_lines(L2_plotting)
total_regression_lines = _fit_regression_lines(total_plotting)

In [None]:
# Plots with regression lines and projections

# Plan
# 1. Plot the data (using scatter)
# 2. Plot the regression line (using plot)
# 3. Plot the projection (using plot, dashed line)

In [None]:
# Settings for the projection plots: how many languages to draw and one
# colour per language (ten colours are listed)
num_languages = 10
color_list = [
    "tab:blue",
    "tab:orange",
    "tab:green",
    "tab:red",
    "tab:purple",
    "tab:brown",
    "tab:pink",
    "tab:gray",
    "tab:olive",
    "tab:cyan",
]

# End points of the solid (past) and dashed (future) line segments
past_years, future_years = [2000, 2022], [2023, 2050]

In [None]:
# Display the L1 regression line dictionary (language -> [slope, intercept]).
# Original note: too many of the entries are NaN -- presumably languages with
# too few L1 data points; TODO confirm.
L1_regression_lines

In [None]:
# Define a function to plot the data, regression line and projection

def plot_linear_projection(df_plotting, dict_regression_lines, name, languages=languages, num_languages=num_languages, color_list=color_list, past_years=past_years, future_years=future_years):
    """Plot speaker data with its fitted line and a dashed projection.

    For each of the first ``num_languages`` languages this draws the data
    points, the regression line over ``past_years`` (solid) and the same
    line over ``future_years`` (dashed), all in millions of speakers.

    Note: the default values are taken from the notebook-level variables at
    the moment this cell is executed; re-run the cell after changing them.

    Args:
        df_plotting: Frame with one column per language; its index supplies
            the x values.
        dict_regression_lines: Maps each language to ``[slope, intercept]``.
        name: Figure title.
        languages: Ordered language labels to pick from.
        num_languages: How many languages to draw. Values larger than
            ``len(languages)`` are capped.
        color_list: Colours to use; reused cyclically when there are more
            languages than colours.
        past_years: x values at which the solid line is evaluated.
        future_years: x values at which the dashed line is evaluated.
    """
    fig, ax = plt.subplots(figsize=(10, 4))

    # Cap the count so a large num_languages cannot index past the labels
    for i in range(min(num_languages, len(languages))):
        language = languages[i]
        # Wrap around the palette instead of failing when colours run out
        color = color_list[i % len(color_list)]

        ax.scatter(df_plotting.index, df_plotting[language] / 10**6, label=language, color=color)

        # evaluate the regression line at the requested years
        slope, intercept = dict_regression_lines[language]
        past_speakers = [(slope * year + intercept) / 10**6 for year in past_years]
        future_speakers = [(slope * year + intercept) / 10**6 for year in future_years]

        ax.plot(past_years, past_speakers, color=color)
        ax.plot(future_years, future_speakers, linestyle="dashed", color=color)

    # move the legend outside the plot
    # https://builtin.com/data-science/matplotlib-legend-outside-plot
    pos = ax.get_position()
    ax.set_position([pos.x0, pos.y0, pos.width * 0.9, pos.height])
    ax.legend(loc='center right', bbox_to_anchor=(1.35, 0.5))

    fig.suptitle(name)
    ax.set_ylabel("Number of speakers (millions)")
    ax.set_xlabel("Year")
    plt.show()
    plt.close()

In [None]:
# L1 speakers with the fitted line (solid) and its projection (dashed)

fig, ax = plt.subplots(figsize=(10, 4))

for rank in range(num_languages):
    lang_name = languages[rank]
    speakers_millions = L1_plotting[lang_name] / 10**6
    line_color = color_list[rank]
    ax.scatter(L1_plotting.index, speakers_millions, label=lang_name, color=line_color)

    # evaluate the regression line at the past and future years
    slope, intercept = L1_regression_lines[lang_name]
    fitted_past = [(slope * year + intercept) / 10**6 for year in past_years]
    fitted_future = [(slope * year + intercept) / 10**6 for year in future_years]

    ax.plot(past_years, fitted_past, color=line_color)
    ax.plot(future_years, fitted_future, linestyle="dashed", color=line_color)

# Shrink the axes and put the legend to the right of them
# https://builtin.com/data-science/matplotlib-legend-outside-plot
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.9, box.height])
ax.legend(loc='center right', bbox_to_anchor=(1.35, 0.5))

fig.suptitle("L1 Speakers")
ax.set_ylabel("Number of speakers (millions)")
ax.set_xlabel("Year")
plt.show()
plt.close()


    


In [None]:
# L1 data with its regression line and projection, titled "L1 Speakers"
plot_linear_projection(L1_plotting, L1_regression_lines, "L1 Speakers")

In [None]:
# L2 data with its regression line and projection, titled "L2 Speakers"
plot_linear_projection(L2_plotting, L2_regression_lines, "L2 Speakers")

In [None]:
# Total-speaker data with its regression line and projection, titled "Total Speakers"
plot_linear_projection(total_plotting, total_regression_lines, "Total Speakers")