# Check for missing data

Problem: We noticed that East Asia had very few Mandarin speakers. It turned out that China was missing from the data!

Goal: Look for other potential instances of this problem.

Suspicion: We may be missing speakers in the country where each language "originated", e.g. Mandarin in China, English in the UK, or Spanish in Spain.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os

In [None]:
# Load the cleaned total-speakers data and index it by (Year, Country).
speakers_df = pd.read_csv('../data/cleaned_total_speakers.csv').set_index(
    ['Year', 'Country']
)

# Display the indexed table.
speakers_df

In [None]:
# Take the 2022 cross-section (rows then keyed by Country only) and sum
# each column over all countries to get one total per language.
most_recent_speakers = speakers_df.xs(2022, level='Year')

total_speakers = most_recent_speakers.sum(axis='index')
total_speakers

In [None]:
# Load the manual total-speakers data and index it by the 'Language' column.
manual_total_speakers = pd.read_csv(
    '../misc_data/manual_total_speakers.csv'
).set_index('Language')
manual_total_speakers

In [None]:
# Map the language codes used as labels in total_speakers to the language
# names used as the index of manual_total_speakers.
language_dict = {
    "arz": "Egyptian Arabic",
    "ben": "Bengali",
    "cmn": "Mandarin Chinese",
    "deu": "German",
    "eng": "English",
    "fra": "French",
    "hin": "Hindi",
    "ind": "Indonesian",
    "jpn": "Japanese",
    "por": "Portuguese",
    "rus": "Russian",
    "spa": "Spanish",
    "urd": "Urdu",
}

# Relabel in place; labels without an entry in the mapping are left as-is.
total_speakers.rename(index=language_dict, inplace=True)
total_speakers

In [None]:
# Attach the computed totals as a new column, aligned on the language index.
# Languages with no matching label in total_speakers end up as NaN.
manual_total_speakers['Total Speakers'] = total_speakers.reindex(
    manual_total_speakers.index
)
manual_total_speakers

In [None]:
# Difference = manual 2022 figure minus the total computed from the data.
# A positive value means the computed total is lower than the manual figure.
manual_total_speakers['Difference'] = manual_total_speakers['2022'].sub(
    manual_total_speakers['Total Speakers']
)

manual_total_speakers['Difference']