In [32]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

# Load the datasets
# Relative paths of the four input CSV files.
file1_path, file2_path, file3_path, file4_path = (
    'educ_data.csv',
    'gini_data.csv',
    'unemp_data.csv',
    'world_bank_data.csv',
)

# Read each file into its own DataFrame, in the same order as the paths above.
educ_df, gini_df, unemp_df, world_bank_df = (
    pd.read_csv(csv_path)
    for csv_path in (file1_path, file2_path, file3_path, file4_path)
)

In [None]:
# Reshape world_bank_df from "one column per year" to one row per
# (country, year) with one column per series.
id_vars = ['Country Name', 'Country Code', 'Series Name', 'Series Code']
# Every column after the first four is treated as a year column
# (assumes the four id columns come first in the CSV -- TODO confirm).
year_columns = world_bank_df.columns[4:]

# Melt the dataframe to bring year columns into rows
world_bank_df = pd.melt(world_bank_df, id_vars=id_vars, value_vars=year_columns, var_name='Year', value_name='Value')

# Drop rows whose Value is the literal '..' marker. The explicit .copy() makes
# the filtered frame independent, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
world_bank_df = world_bank_df[world_bank_df['Value'] != '..'].copy()

# Keep only the first run of digits from each year label.
# A raw string is required: '\d' in a plain string literal is an invalid escape
# sequence (SyntaxWarning on recent Python versions). expand=False returns a
# Series rather than a one-column DataFrame.
world_bank_df['Year'] = world_bank_df['Year'].str.extract(r'(\d+)', expand=False).astype(int)
# Anything that still is not a number becomes NaN.
world_bank_df['Value'] = pd.to_numeric(world_bank_df['Value'], errors='coerce')

# Pivot so each series becomes its own column, keeping Year as a key.
# pivot_table aggregates duplicate (country, year, series) entries with its
# default aggfunc (mean).
# NOTE(review): fill_value=0 writes 0 wherever a country/year has no
# observation for a series, so missing data is indistinguishable from a real
# zero downstream. Consider leaving NaN instead; kept as-is here to preserve
# the values later cells currently receive.
world_bank_df_pivot = world_bank_df.pivot_table(
    index=['Country Name', 'Country Code', 'Year'],
    columns='Series Name',
    values='Value',
    fill_value=0,
).reset_index()

world_bank_df_pivot.head()

Series Name,Country Name,Country Code,Year,Control of Corruption: Estimate,Government Effectiveness: Estimate,High-technology exports (% of manufactured exports),Individuals using the Internet (% of population),"Individuals using the Internet, female (% of female population)","Individuals using the Internet, male (% of male population)",Mobile cellular subscriptions (per 100 people),...,Population density (people per sq. km of land area),Population in the largest city (% of urban population),Population in urban agglomerations of more than 1 million (% of total population),Population living in slums (% of urban population),Refugee population by country or territory of asylum,Rule of Law: Estimate,"Share of youth not in education, employment or training, female (% of female youth population)","Share of youth not in education, employment or training, male (% of male youth population)","Share of youth not in education, employment or training, total (% of youth population)",Social contributions (current LCU)
0,Afghanistan,AFG,1960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,39.392965,3.309401,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghanistan,AFG,1961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.477056,39.348203,3.416999,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Afghanistan,AFG,1962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.751356,39.273621,3.525202,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Afghanistan,AFG,1963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14.040239,39.18189,3.63451,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Afghanistan,AFG,1964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14.343888,39.069378,3.745192,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# cleaning the main data sets

# Clean unemp_df: reshape from one column per year to one row per
# (country, indicator, year).
id_vars = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
# Every column after the first four is treated as a year column
# (assumes the four id columns come first in the CSV -- TODO confirm).
year_columns = unemp_df.columns[4:]

# Melt the dataframe to bring year columns into rows
unemp_df = pd.melt(unemp_df, id_vars=id_vars, value_vars=year_columns, var_name='Year', value_name='Unemp %')

# Drop rows with no data
unemp_df = unemp_df.dropna(subset=['Unemp %'])

# Keep only the first run of digits from each year label.
# A raw string is required: '\d' in a plain string literal is an invalid escape
# sequence (SyntaxWarning on recent Python versions). expand=False returns a
# Series rather than a one-column DataFrame.
unemp_df['Year'] = unemp_df['Year'].str.extract(r'(\d+)', expand=False).astype(int)

# Clean educ_df: reshape from one column per year to one row per
# (country, series, year).
id_vars = ['Country Name', 'Country Code', 'Series Name', 'Series Code']
# Every column after the first four is treated as a year column
# (assumes the four id columns come first in the CSV -- TODO confirm).
year_columns = educ_df.columns[4:]

# Melt the dataframe to bring year columns into rows
educ_df = pd.melt(educ_df, id_vars=id_vars, value_vars=year_columns, var_name='Year', value_name='Average years of schooling')

# Drop rows whose value is the literal '..' marker. The explicit .copy() makes
# the filtered frame independent, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
educ_df = educ_df[educ_df['Average years of schooling'] != '..'].copy()

# Convert the value column to numbers, as the other cleaning steps in this
# notebook do for their value columns; without this the column can stay as
# strings. Unparseable entries become NaN and are dropped, which also removes
# rows that never had a value in the first place.
educ_df['Average years of schooling'] = pd.to_numeric(educ_df['Average years of schooling'], errors='coerce')
educ_df = educ_df.dropna(subset=['Average years of schooling'])

# Keep only the first run of digits from each year label.
# A raw string is required: '\d' in a plain string literal is an invalid escape
# sequence (SyntaxWarning on recent Python versions). expand=False returns a
# Series rather than a one-column DataFrame.
educ_df['Year'] = educ_df['Year'].str.extract(r'(\d+)', expand=False).astype(int)

# Clean gini_df

# Upper-case the ISO codes (presumably so they match the 'Country Code'
# columns of the other frames -- confirm against the merge step).
# The vectorised .str.upper() passes missing values through as NaN, whereas a
# per-element lambda calling x.upper() raises on the first missing code.
gini_df["iso"] = gini_df["iso"].str.upper()
# Non-numeric Gini entries become NaN and are then dropped.
gini_df['gini_recalculated'] = pd.to_numeric(gini_df['gini_recalculated'], errors='coerce')
gini_df = gini_df.dropna(subset=['gini_recalculated'])