# Initial EDA

In [24]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [25]:
# Create and examine dataframe

df = pd.read_csv('../data/Literacy rates (no pw2).csv')

In [26]:
df.describe()

Unnamed: 0,index,Year,Literacy rate
count,4955.0,4955.0,4955.0
mean,2477.0,2013.987891,0.816947
std,1430.529622,2.746204,0.23256
min,0.0,2010.0,0.004025
25%,1238.5,2011.0,0.722507
50%,2477.0,2014.0,0.930676
75%,3715.5,2016.0,0.985078
max,4954.0,2018.0,1.0


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4955 entries, 0 to 4954
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          4955 non-null   int64  
 1   Region         4955 non-null   object 
 2   Country        4955 non-null   object 
 3   Year           4955 non-null   int64  
 4   Age            4955 non-null   object 
 5   Gender         4955 non-null   object 
 6   Literacy rate  4955 non-null   float64
dtypes: float64(1), int64(2), object(4)
memory usage: 271.1+ KB


In [28]:
df.head()

Unnamed: 0,index,Region,Country,Year,Age,Gender,Literacy rate
0,0,Central and Southern Asia,Afghanistan,2011,15+,female,0.176121
1,1,Central and Southern Asia,Afghanistan,2011,15+,male,0.454171
2,2,Central and Southern Asia,Afghanistan,2011,15+,total,0.317411
3,3,Central and Southern Asia,Afghanistan,2011,15-24,female,0.321132
4,4,Central and Southern Asia,Afghanistan,2011,15-24,male,0.618791


In [7]:
#Identifying which years are available in the data for analysis

unique_years = df['Year'].unique()
print(unique_years)  # To verify which years are available

[2011 2018 2012 2013 2014 2015 2016 2017 2010]


In [10]:
# Filter the DataFrame to include countries with data for both 2011 and 2018
required_years = [2011, 2018]
df_filtered = df.groupby('Country').filter(lambda x: set(x['Year']) == set(required_years))

# Now df_filtered contains data only for the countries with both 2011 and 2018 data
print(df_filtered['Country'].unique())  # Should show only countries with 2011 and 2018

['Afghanistan' 'India' 'Nepal' 'Brunei Darussalam' 'Italy' 'Malta'
 'Montenegro' 'Portugal' 'Romania' 'Costa Rica' 'Samoa' 'Tonga' 'Congo'
 'Namibia']


In [11]:
print(df_filtered['Region'].unique())

['Central and Southern Asia' 'Eastern and South-Eastern Asia'
 'Europe and Northern America' 'Latin America and the Caribbean' 'Oceania'
 'Sub-Saharan Africa']


In [20]:
# Filter the DataFrame to include countries with data for both 2011 and 2018
required_years_a = [2012, 2017]
df_filtered_a = df.groupby('Country').filter(lambda x: set(x['Year']) == set(required_years_a))

# Now df_filtered contains data only for the countries with both 2011 and 2018 data
print(df_filtered_a['Country'].unique())  # Should show only countries with 2011 and 2018

['Bhutan']


In [21]:
# Filter the DataFrame to include countries with data for both 2011 and 2018
required_years_b = [2013, 2018]
df_filtered_b = df.groupby('Country').filter(lambda x: set(x['Year']) == set(required_years_b))

# Now df_filtered contains data only for the countries with both 2011 and 2018 data
print(df_filtered_b['Country'].unique())  # Should show only countries with 2011 and 2018

['Iraq' 'Sierra Leone']


In [22]:
# Filter the DataFrame to include countries with data for both 2011 and 2018
required_years_c = [2014, 2017]
df_filtered_c = df.groupby('Country').filter(lambda x: set(x['Year']) == set(required_years_c))

# Now df_filtered contains data only for the countries with both 2011 and 2018 data
print(df_filtered_c['Country'].unique())  # Should show only countries with 2011 and 2018

['Georgia' 'Burundi']


In [33]:
# Filter the DataFrame to include countries with data for both 2011 and 2018
required_years_d = [2014, 2018]
df_filtered_d = df.groupby('Country').filter(lambda x: set(x['Year']) == set(required_years_d))

# Now df_filtered contains data only for the countries with both 2011 and 2018 data
print(df_filtered_d['Country'].unique())  # Should show only countries with 2011 and 2018

['Vanuatu' 'Burkina Faso' 'Kenya']


In [30]:
# Group the data by 'year' and count the number of unique countries for each year
country_count_by_year = df.groupby('Year')['Country'].nunique()
print(country_count_by_year)

Year
2010    50
2011    58
2012    44
2013    34
2014    56
2015    41
2016    34
2017    31
2018    72
Name: Country, dtype: int64


#### I cannot find a large set of countries that have data across multiple years, even if I just choose two years there are only 14 countries which match up.  Because of that I would like to move to another line of inquiry.

#### Exploring now based upon disparities between men and women in the age groups.

In [41]:
# Split the DataFrame into separate DataFrames for male and female
df_male = df[df['Gender'] == 'male'][['Region', 'Country', 'Year', 'Age', 'Literacy rate']]
df_female = df[df['Gender'] == 'female'][['Region', 'Country', 'Year', 'Age', 'Literacy rate']]

# Reset the index of the two DataFrames to make the merging easier
df_male.reset_index(drop=True, inplace=True)
df_female.reset_index(drop=True, inplace=True)

# Combine the two DataFrames using the 'zip' function or 'concat' method
# Using the zip function:
combined_df = pd.merge(df_male, df_female, on=['Country', 'Year', 'Age'], how='outer', suffixes=('_male', '_female'))
combined_df.drop(columns=['Region_female'], inplace=True)
combined_df.rename(columns={'Region_male': 'Region'}, inplace=True)

combined_df.head(20)


Unnamed: 0,Region,Country,Year,Age,Literacy rate_male,Literacy rate_female
0,Central and Southern Asia,Afghanistan,2011,15+,0.454171,0.176121
1,Central and Southern Asia,Afghanistan,2011,15-24,0.618791,0.321132
2,Central and Southern Asia,Afghanistan,2011,25-64,0.377948,0.084128
3,Central and Southern Asia,Afghanistan,2018,15+,0.554755,0.298052
4,Central and Southern Asia,Afghanistan,2018,15-24,0.740848,0.562547
5,Central and Southern Asia,Afghanistan,2018,25-64,0.45384,0.143314
6,Central and Southern Asia,Afghanistan,2018,65+,0.24745,0.034743
7,Central and Southern Asia,Bangladesh,2011,15+,0.624831,0.551194
8,Central and Southern Asia,Bangladesh,2011,15-24,0.764034,0.795445
9,Central and Southern Asia,Bangladesh,2011,25-64,0.585419,0.480653


In [42]:
# Create column which shows the difference between male and female rates:

combined_df['difference'] = combined_df['Literacy rate_male'] - combined_df['Literacy rate_female']

combined_df.head(20)

Unnamed: 0,Region,Country,Year,Age,Literacy rate_male,Literacy rate_female,difference
0,Central and Southern Asia,Afghanistan,2011,15+,0.454171,0.176121,0.27805
1,Central and Southern Asia,Afghanistan,2011,15-24,0.618791,0.321132,0.297659
2,Central and Southern Asia,Afghanistan,2011,25-64,0.377948,0.084128,0.293821
3,Central and Southern Asia,Afghanistan,2018,15+,0.554755,0.298052,0.256702
4,Central and Southern Asia,Afghanistan,2018,15-24,0.740848,0.562547,0.178301
5,Central and Southern Asia,Afghanistan,2018,25-64,0.45384,0.143314,0.310526
6,Central and Southern Asia,Afghanistan,2018,65+,0.24745,0.034743,0.212707
7,Central and Southern Asia,Bangladesh,2011,15+,0.624831,0.551194,0.073636
8,Central and Southern Asia,Bangladesh,2011,15-24,0.764034,0.795445,-0.031411
9,Central and Southern Asia,Bangladesh,2011,25-64,0.585419,0.480653,0.104766


In [43]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1652 entries, 0 to 1651
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Region                1651 non-null   object 
 1   Country               1652 non-null   object 
 2   Year                  1652 non-null   int64  
 3   Age                   1652 non-null   object 
 4   Literacy rate_male    1651 non-null   float64
 5   Literacy rate_female  1652 non-null   float64
 6   difference            1651 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 103.2+ KB


In [None]:
# Create two separate line charts for male and female literacy rates
def create_line_chart(df_gender, title):
    fig, ax = plt.subplots(figsize=(10, 6))

    # Get unique countries and age groups
    unique_countries = df_gender['Country'].unique()
    unique_ages = df_gender['Age'].unique()

    # Plot lines for each country and age group
    for country in unique_countries:
        for age in unique_ages:
            df_subset = df_gender[(df_gender['Country'] == country) & (df_gender['Age'] == age)]
            ax.plot(df_subset['Year'], df_subset['Literacy rate'], label=f'{country} - {age}')

    ax.set_xlabel('Year')
    ax.set_ylabel('Literacy Rate')
    ax.set_title(title)
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Create line chart for male literacy rates
create_line_chart(df_male, 'Male Literacy Rates by Country and Age')

# Create line chart for female literacy rates
create_line_chart(df_female, 'Female Literacy Rates by Country and Age')

#### These charts work but they're an eyesore.  Include the age ranges with all of the countries is just too noisy to work with.