<a href="https://colab.research.google.com/github/tomik062/AI_Project/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Extract the data from GitHub and merge it together.

In [1]:
import requests
import os
import importlib.util

# Define the GitHub details
repo_owner = 'tomik062'
repo_name = 'AI_Project'
file_path = 'extract_data.py'
url = f'https://raw.githubusercontent.com/{repo_owner}/{repo_name}/main/{file_path}'

# Directory to save the downloaded file (exist_ok makes the cell safe to re-run)
download_dir = 'data_extraction_code'
os.makedirs(download_dir, exist_ok=True)

local_file_path = os.path.join(download_dir, file_path)

# Download the file. The timeout makes a stalled connection fail fast instead
# of hanging the kernel indefinitely (requests applies no timeout by default).
response = requests.get(url, timeout=30)
response.raise_for_status() # Raise an exception for bad status codes

with open(local_file_path, 'wb') as f:
    f.write(response.content)

print(f"Downloaded {file_path} to {local_file_path}")

# Import the function from the downloaded file.
# NOTE(review): this executes code fetched from the moving `main` branch;
# consider pinning the URL to a commit hash so results stay reproducible.
spec = importlib.util.spec_from_file_location("extract_data_module", local_file_path)
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot build an import spec for {local_file_path}")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

# Call the extract_data function and keep its result for the following cells
data_output = module.extract_data()

Downloaded extract_data.py to data_extraction_code/extract_data.py
Downloaded GNI_PPP_const_2021_dollars.csv to data/GNI_PPP_const_2021_dollars.csv
Downloaded Religious-Composition-percentages.csv to data/Religious-Composition-percentages.csv
Downloaded abortion-rates-by-country-2025.csv to data/abortion-rates-by-country-2025.csv
Downloaded age-at-first-marriage-by-country-2025.csv to data/age-at-first-marriage-by-country-2025.csv
Downloaded average-workweek-by-country-2025.csv to data/average-workweek-by-country-2025.csv
Downloaded female-labor-force-participation-rates-slopes.csv to data/female-labor-force-participation-rates-slopes.csv
Downloaded gender-inequality-index.xlsx to data/gender-inequality-index.xlsx
Downloaded happiness-cantril-ladder.csv to data/happiness-cantril-ladder.csv
Downloaded healthcare-expenditure-per-capita-ppp.csv to data/healthcare-expenditure-per-capita-ppp.csv
Downloaded high_education_female.xlsx to data/high_education_female.xlsx
Downloaded high_educati

In [2]:
import pandas as pd
import numpy as np
def _is_missing_feature(value):
    """Return True when ``value`` is a missing-data marker.

    Covers ``None``, float/NumPy NaN, ``pd.NA``/``NaT`` and any object whose
    text form is ``'nan'`` (the check this function relied on previously).
    """
    if str(value) == 'nan':
        return True
    flag = pd.isna(value)
    # pd.isna returns an array for list-like input; only a scalar answer counts.
    return isinstance(flag, (bool, np.bool_)) and bool(flag)


def calc_features_year(data, year):
    """Build a country-by-feature table for a single year.

    Parameters
    ----------
    data : dict
        Mapping keyed by ``(country, year)`` tuples. ``data[key][1]`` is read
        as a positional sequence of feature values.
        NOTE(review): the positions in ``feature_map`` are assumed to match
        the order produced by ``extract_data`` -- confirm against that module.
    year : int
        Year whose records populate the table.

    Returns
    -------
    pandas.DataFrame
        One row per country found anywhere in ``data`` (sorted by name) and one
        column per feature name. A cell stays NaN when the country has no
        record for ``year``, when its feature sequence is too short, or when
        ``handle_missing_values`` finds no fallback for a missing value.
    """
    feature_map = {
        'urbanization': 0,
        'avg birth age': 1,
        'happiness': 2,
        'female labor participation': 3,
        'health expenditure': 4,
        'physicians per capita': 5,
        'GNI PPP': 6,
        'christians': 7,
        'muslims': 8,
        'no religion': 9,
        'buddhists': 10,
        'hindus': 11,
        'jews': 12,
        'other religion': 13,
        'in Asia-Pacific': 14,
        'in Europe': 15,
        'in Latin America-Caribbean': 16,
        'in Middle East-North Africa': 17,
        'in North America': 18,
        'in Sub-Saharan Africa': 19,
        'education man': 20,
        'education women': 21,
        'hdi': 22,
        'gii': 23,
        'avg marriage age women': 24,
        'avg marriage age men': 25,
        'maternity leave index': 26,
        'work hours men': 27,
        'work hours women': 28,
        'abortions': 29,
        'social media users': 30
    }
    countries = sorted({key[0] for key in data})
    df = pd.DataFrame(index=countries, columns=list(feature_map))

    # Populate the DataFrame
    for country in countries:
        key = (country, year)
        if key not in data:
            continue  # no record for this year: the whole row stays NaN
        features = data[key][1]
        for feature_name, feature_index in feature_map.items():
            if feature_index >= len(features):
                continue  # record is shorter than the feature list
            value = features[feature_index]
            if _is_missing_feature(value):
                # Fall back to earlier years (recent value or linear trend).
                value = handle_missing_values(data, feature_index, country, year)
            df.loc[country, feature_name] = value
    return df



def _recorded_value(data, feature_index, country, year):
    """Return the usable feature value stored for ``(country, year)``, else None.

    None is returned when there is no record for that key, when the record's
    feature sequence is too short, or when the stored value is a missing-data
    marker (None, NaN, pd.NA/NaT, or anything whose text form is 'nan').
    """
    key = (country, year)
    if key not in data:
        return None
    features = data[key][1]  # Access the list of features
    if feature_index >= len(features):
        return None
    value = features[feature_index]
    if str(value) == 'nan':
        return None
    flag = pd.isna(value)
    # pd.isna returns an array for list-like input; only a scalar answer counts.
    if isinstance(flag, (bool, np.bool_)) and flag:
        return None
    return value


def handle_missing_values(data, feature_index, country, year,
                          recent_window=3, trend_window=10):
    """Estimate a missing feature value for ``country`` in ``year`` from earlier years.

    Strategy, in order:
      1. Return the most recent usable value from the previous
         ``recent_window`` years.
      2. Otherwise fit a straight line through the usable values of the
         previous ``trend_window`` years and extrapolate it to ``year``.
      3. With a single usable value in that window, return that value.
      4. With none, return NaN.

    Parameters
    ----------
    data : dict
        Mapping keyed by ``(country, year)``; ``data[key][1]`` is the
        positional feature sequence.
    feature_index : int
        Position of the feature inside the feature sequence.
    country : str
        Country whose history is searched.
    year : int
        Year for which the value is missing.
    recent_window : int, optional
        Number of preceding years searched for a direct carry-over (default 3).
    trend_window : int, optional
        Number of preceding years used for the linear trend (default 10).

    Returns
    -------
    The carried-over value, the extrapolated value (NumPy float), or
    ``numpy.nan`` when no usable history exists.
    """
    # Try taking the value from the most recent years, newest first.
    for offset in range(1, recent_window + 1):
        value = _recorded_value(data, feature_index, country, year - offset)
        if value is not None:
            return value

    # Otherwise linearly extrapolate from the wider window (oldest -> newest).
    history = []
    for offset in range(trend_window, 0, -1):
        past_year = year - offset
        value = _recorded_value(data, feature_index, country, past_year)
        if value is not None:
            history.append((past_year, value))

    if not history:
        # Still missing after checking the whole window.
        return np.nan
    if len(history) == 1:
        # Only one data point in the window: use that value.
        return history[0][1]

    years = [item[0] for item in history]
    values = [item[1] for item in history]
    try:
        # Linear extrapolation using a degree-1 least-squares fit.
        slope, intercept = np.polyfit(years, values, 1)
    except (np.linalg.LinAlgError, TypeError, ValueError):
        # The fit failed (e.g. non-numeric values): use the last known value.
        return values[-1]
    return slope * year + intercept

In [3]:
import pandas as pd
import numpy as np


df_features = calc_features_year(data_output, 2023)

# Count the missing features per country once; the three groupings below are
# all derived from this single Series.
nan_counts = df_features.isnull().sum(axis=1)
countries_with_nan = nan_counts[nan_counts > 0].index.tolist()
countries_without_nan = nan_counts[nan_counts == 0].index.tolist()
countries_with_one_nan = nan_counts[nan_counts == 1].index.tolist()

# Print the results (header line followed by the list of country names)
print("Countries with NaN values in their features:", countries_with_nan, sep="\n")
print("\nCountries without NaN values in their features:", countries_without_nan, sep="\n")
print("\nCountries with exactly one NaN value in their features:", countries_with_one_nan, sep="\n")

print(
    f"\nNumber of countries with NaN values: {len(countries_with_nan)}",
    f"Number of countries without NaN values: {len(countries_without_nan)}",
    f"Number of countries with exactly one NaN value: {len(countries_with_one_nan)}",
    sep="\n",
)

# Group countries by their single missing feature
missing_features_grouped = {}
if countries_with_one_nan:
    for country in countries_with_one_nan:
        # Exactly one cell is null in this row, so idxmax on the boolean mask
        # yields the name of that column.
        missing_feature_name = df_features.loc[country].isnull().idxmax()
        missing_features_grouped.setdefault(missing_feature_name, []).append(country)

    # Print countries grouped by missing feature
    print("\nMissing feature for countries with exactly one NaN:")
    for feature, countries in missing_features_grouped.items():
        print(f"  Missing feature is '{feature}':")
        print(f"    Countries: {', '.join(countries)}")

Countries with NaN values in their features:
['Afghanistan', 'Albania', 'American Samoa', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Armenia', 'Aruba', 'Azerbaijan', 'Bahamas, The', 'Bahrain', 'Barbados', 'Bermuda', 'Bhutan', 'Bosnia and Herzegovina', 'British Virgin Islands', 'Brunei Darussalam', 'Cabo Verde', 'Cayman Islands', 'Central African Republic', 'Channel Islands', 'China', 'Congo, Rep.', 'Cuba', 'Curacao', 'Cyprus', 'Dominica', 'Dominican Republic', 'Equatorial Guinea', 'Eritrea', 'Faroe Islands', 'Fiji', 'French Polynesia', 'Gabon', 'Georgia', 'Gibraltar', 'Greenland', 'Grenada', 'Guam', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Hong Kong SAR, China', 'Iraq', 'Ireland', 'Isle of Man', 'Jordan', 'Kazakhstan', 'Kiribati', "Korea, Dem. People's Rep.", 'Kosovo', 'Kuwait', 'Kyrgyz Republic', 'Lebanon', 'Liberia', 'Libya', 'Liechtenstein', 'Macao SAR, China', 'Malawi', 'Malaysia', 'Maldives', 'Malta', 'Marshall Islands', 'Mauritius', 'Micronesia, Fed. Sts.', 'Moldova', 'Monaco', 'Mo

Process the 2023 data:
add some missing data, split into train and test sets, and normalize.

In [None]:
# Recompute the 2023 feature table before applying the manual values
df_features = calc_features_year(data_output, 2023)

# Hand-entered values, keyed by feature column and then by country, applied in
# the order listed.
# NOTE(review): the sources of these figures are not recorded here -- confirm.
manual_fill_values = {
    # Fill in missing abortion rates
    'abortions': {
        'Cyprus': 7,
        'Ireland': 6.7,
        'Malaysia': 11,
        'Mauritania': 42,
        'Malta': 3,
        'Morocco': 25,
    },
    # Fill in missing female labor participation rates
    'female labor participation': {
        'China': 59.6,
        'Gabon': 41.7,
        'Haiti': 60,
    },
    # Fill in missing maternity leave index values
    'maternity leave index': {
        'Albania': 23.23,
        'Bosnia and Herzegovina': 52.14,
        'Kazakhstan': 18,
        'Georgia': 7.59,
        'Kyrgyz Republic': 18,
        'Moldova': 18,
        'Angola': 13,
        'Armenia': 20,
        'Azerbaijan': 18,
        'Malawi': 12.86,
        'Bhutan': 8,
        'Tanzania': 12,
        'Tajikistan': 20,
        'North Macedonia': 39,
        'Liberia': 12.86,
        'Suriname': 0,
        'Uzbekistan': 18,
    },
}
for fill_feature, fill_by_country in manual_fill_values.items():
    for fill_country, fill_value in fill_by_country.items():
        df_features.loc[fill_country, fill_feature] = fill_value