In [2]:
import math
import os

import pandas as pd


In [3]:
# Countries under study.
countries_base = [
    "Hungary",
    "Slovakia",
    "Romania",
    "Poland",
    "Croatia",
    "Serbia",
    "Montenegro",
    "Albania",
    "Czechia",
    "Bulgaria",
    "Greece",
    "North Macedonia",
    "Moldova",
    "Lithuania",
    "Latvia",
    "Bosnia and Herzegovina",
]

# Reference countries, kept in a separate list from the base group.
countries_ref = [
    "Germany",
    "Austria",
    "Switzerland",
    "Slovenia",
]

# Combined list: base countries first, then the reference countries.
countries = [*countries_base, *countries_ref]

In [4]:
# Year-per-column indicator tables; each one is passed through
# clean_and_filter in a later cell.
literacy_data = pd.read_csv('datasets/literacy.csv')
gini_data = pd.read_csv('datasets/gini.csv')
population_data = pd.read_csv('datasets/population.csv')
public_transport_data = pd.read_csv('datasets/public_transport.csv')
electricity_data = pd.read_csv('datasets/electricity.csv')

# Loaded as-is; not passed through clean_and_filter in the visible cells.
poverty_data = pd.read_csv('datasets/poverty.csv')

# Mensa membership table: the membership-fee column is discarded at load time.
mensa_data = (
    pd.read_csv('datasets/mensa.csv')
    .drop(columns=['MensaMembersByCountryMembershipFee'])
)


In [5]:
def clean_and_filter(df, after_year=1990, min_fill_ratio=0.2):
    """Trim an indicator table to recent years and drop mostly-empty rows.

    Steps, in order:
      1. Keep every column whose label is not a year, plus the year columns
         strictly later than ``after_year``.
      2. Drop the 'Indicator Name', 'Indicator Code' and 'Unnamed: 68'
         columns when they are present.
      3. Rename 'Country Name' -> 'country' and 'Country Code' -> 'alpha3'.
      4. Drop rows that have fewer than ``min_fill_ratio`` times the number
         of remaining columns as non-NaN cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one column per year. Year labels may be digit strings
        (e.g. '1991') or integers (e.g. 1991); any other label is treated
        as a non-year column and kept by step 1.
    after_year : int, default 1990
        Exclusive lower bound: only year columns greater than this are kept.
    min_fill_ratio : float, default 0.2
        Minimum share of non-NaN cells a row needs to survive. The share is
        measured against ALL remaining columns, so the 'country' and
        'alpha3' identifier columns count towards it as well.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame; the input frame is not modified.
    """

    def _is_wanted(label):
        # Compare years numerically. A plain `df.columns > '1990'` is a
        # lexicographic string comparison across every label: it keeps the
        # metadata columns only because letters sort after digits, and it
        # raises TypeError as soon as a label is not a string.
        text = str(label).strip()
        if text.isdecimal():
            return int(text) > after_year
        return True

    keep_mask = [_is_wanted(label) for label in df.columns]
    df = df.loc[:, keep_mask]

    df = df.drop(columns=['Indicator Name', 'Indicator Code', 'Unnamed: 68'], errors='ignore')

    df = df.rename(columns={'Country Name': 'country', 'Country Code': 'alpha3'})

    # dropna(thresh=...) is documented as taking an int: the minimum number of
    # non-NaN cells a row must have. Rounding up keeps the same rows as
    # comparing the (integer) count against the fractional value.
    threshold = math.ceil(df.shape[1] * min_fill_ratio)

    # With the default ratio this drops rows where more than 80% of cells are NaN.
    df = df.dropna(thresh=threshold)

    return df

In [6]:
# The 'Indicator Name', 'Indicator Code' and 'Unnamed: 68' columns are dropped inside clean_and_filter.

literacy_data = clean_and_filter(literacy_data)

literacy_data

Unnamed: 0,country,alpha3,1991,1992,1993,1994,1995,1996,1997,1998,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
1,Africa Eastern and Southern,AFE,58.141190,58.530640,58.912601,59.526852,59.921940,60.434929,61.174061,61.408710,...,69.834770,70.313423,71.095200,71.009071,71.392616,72.634972,72.785622,72.581161,72.600403,
3,Africa Western and Central,AFW,44.823318,45.163380,46.002171,46.869511,47.017632,47.806450,48.828880,50.175579,...,54.006760,54.818321,55.437920,56.485538,59.568459,59.511719,59.617512,60.034611,60.312698,
5,Albania,ALB,,,,,,,,,...,,,,,,,,,98.500000,
7,Arab World,ARB,55.813911,56.863861,57.682941,58.448780,59.594292,60.881039,61.828331,63.036888,...,77.577873,74.981209,76.386726,77.170303,74.286133,74.603661,75.022881,75.231178,75.171532,
10,Armenia,ARM,,,,,,,,,...,,,99.744408,99.736069,,,99.788612,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,Viet Nam,VNM,,,,,,,,,...,,,,,,95.753868,,,96.133263,
258,Vanuatu,VUT,,,,68.699997,,,,,...,84.699997,,,,,,,89.099998,,
259,World,WLD,75.198761,75.645752,76.059624,76.462769,76.868942,77.258171,79.248680,80.433929,...,85.433708,85.601830,86.061157,86.288231,86.339050,86.489601,86.711510,86.852753,87.011749,
263,South Africa,ZAF,,,,,,82.402100,,,...,94.139900,94.367920,,87.046669,,95.022972,,90.001358,,


In [7]:
# Apply the shared clean_and_filter step to the Gini table (rebinds the name to the cleaned frame).
gini_data = clean_and_filter(gini_data)

In [8]:
# NOTE(review): global_data is not referenced again in the visible cells and is
# not part of the export list — confirm it is still needed.
global_data = pd.read_csv('datasets/world-data-2023.csv')

In [9]:
# Apply the shared clean_and_filter step to the electricity table (rebinds the name to the cleaned frame).
electricity_data = clean_and_filter(electricity_data)

In [10]:
# NOTE(review): this import looks unused — the `.astype` call in this cell is a
# pandas method, not this function. `numpy.astype` appears to exist only in
# NumPy >= 2.1, so on older NumPy this line would raise ImportError; confirm
# and remove if nothing else relies on it.
from numpy import astype


# Tidy the Mensa table in one chain: give the member-count column a short name,
# discard rows containing any missing value, then store the counts as integers.
mensa_data = (
    mensa_data
    .rename(columns={'MensaMembersByCountry': 'members'})
    .dropna()
    .astype({'members': int})
)
mensa_data


Unnamed: 0,country,members
0,India,1500
1,United States,50000
2,Indonesia,175
3,Pakistan,250
4,Brazil,1300
5,Mexico,400
6,Japan,4500
7,Philippines,160
9,Germany,16000
10,United Kingdom,19000


In [11]:
# Apply the shared clean_and_filter step to the public-transport table (rebinds the name to the cleaned frame).
public_transport_data = clean_and_filter(public_transport_data)

In [12]:
# Apply the shared clean_and_filter step to the population table (rebinds the name to the cleaned frame).
population_data = clean_and_filter(population_data)

In [13]:
# Single ordered mapping from export file stem to the frame written under it,
# so a name and its frame cannot drift apart.
# NOTE(review): poverty_data is exported exactly as loaded — it is never passed
# through clean_and_filter in the visible cells; confirm that is intentional.
frames_by_name = {
    'literacy_data': literacy_data,
    'gini_data': gini_data,
    'poverty_data': poverty_data,
    'mensa_data': mensa_data,
    'population_data': population_data,
    'public_transport_data': public_transport_data,
    'electricity_data': electricity_data,
}

# Parallel lists, derived from the mapping in the same order.
dataset_names = list(frames_by_name)
datasets = list(frames_by_name.values())

# Make sure the output folder exists before writing into it.
os.makedirs('processed_datasets', exist_ok=True)

# Write every frame to processed_datasets/<name>.csv without the index column.
for name, dataset in frames_by_name.items():
    dataset.to_csv(f'processed_datasets/{name}.csv', index=False)