<a href="https://colab.research.google.com/github/tomik062/AI_Project/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Extract the data files from GitHub and merge them together.

In [1]:
import requests
import os
import importlib.util

# Define the GitHub details
repo_owner = 'tomik062'
repo_name = 'AI_Project'
file_path = 'extract_data.py'
url = f'https://raw.githubusercontent.com/{repo_owner}/{repo_name}/main/{file_path}'

# Directory to save the downloaded file; exist_ok keeps the cell safe to re-run
download_dir = 'data_extraction_code'
os.makedirs(download_dir, exist_ok=True)

local_file_path = os.path.join(download_dir, file_path)

# Download the file. Without a timeout a stalled connection would hang the
# cell indefinitely, so fail after 30 seconds instead.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise an exception for bad status codes

with open(local_file_path, 'wb') as f:
    f.write(response.content)

print(f"Downloaded {file_path} to {local_file_path}")

# Load the downloaded file as a module.
# NOTE(review): this executes code fetched from the `main` branch at run time;
# consider pinning the URL to a commit if reproducibility matters.
spec = importlib.util.spec_from_file_location("extract_data_module", local_file_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

# Call extract_data() from the downloaded module and keep its result for later cells
data_output = module.extract_data()

Downloaded extract_data.py to data_extraction_code/extract_data.py
Downloaded GNI_PPP_const_2021_dollars.csv to data/GNI_PPP_const_2021_dollars.csv
Downloaded Religious-Composition-percentages.csv to data/Religious-Composition-percentages.csv
Downloaded abortion-rates-by-country-2025.csv to data/abortion-rates-by-country-2025.csv
Downloaded age-at-first-marriage-by-country-2025.csv to data/age-at-first-marriage-by-country-2025.csv
Downloaded average-workweek-by-country-2025.csv to data/average-workweek-by-country-2025.csv
Downloaded female-labor-force-participation-rates-slopes.csv to data/female-labor-force-participation-rates-slopes.csv
Downloaded gender-inequality-index.xlsx to data/gender-inequality-index.xlsx
Downloaded happiness-cantril-ladder.csv to data/happiness-cantril-ladder.csv
Downloaded healthcare-expenditure-per-capita-ppp.csv to data/healthcare-expenditure-per-capita-ppp.csv
Downloaded high_education_female.xlsx to data/high_education_female.xlsx
Downloaded high_educati

In [36]:
import pandas as pd
import numpy as np
def calc_features_year(data,year):
    """Build a country-by-feature table for a single year.

    Args:
        data: mapping keyed by ``(country, year)`` tuples. Element ``[1]`` of
            each value is read as the sequence of feature values, in the
            column order listed below.
        year: the year whose features are placed in the table.

    Returns:
        A DataFrame indexed by every country that appears in ``data`` (sorted),
        with one column per feature name. Countries without an entry for
        ``year`` keep all-NaN rows. A value whose string form is ``'nan'`` is
        replaced by the result of ``handle_missing_values``.
    """
    # Position in this list == position of the feature in each feature sequence.
    feature_names = [
        'urbanization',
        'avg birth age',
        'happiness',
        'female labor participation',
        'health expenditure',
        'physicians per capita',
        'GNI PPP',
        'region',
        'christians',
        'muslims',
        'no religion',
        'buddhists',
        'hindus',
        'jews',
        'other religion',
        'education man',
        'education women',
        'hdi',
        'gii',
        'avg marriage age women',
        'avg marriage age men',
        'maternity leave index',
        'work hours men',
        'work hours women',
        'abortions',
        'social media users',
    ]
    country_names = sorted({entry[0] for entry in data})
    table = pd.DataFrame(index=country_names, columns=feature_names)

    for name in country_names:
        lookup = (name, year)
        if lookup not in data:
            # No record for this year: the row stays empty.
            continue
        row = data[lookup][1]
        # Only fill the columns for which the record actually has a value.
        for position, column in enumerate(feature_names[:len(row)]):
            value = row[position]
            if str(value) == 'nan':
                value = handle_missing_values(data, position, name, year)
            table.loc[name, column] = value
    return table

In [37]:
import numpy as np
import pandas as pd

def _is_missing(value):
    """Return True when ``value`` should count as a missing observation.

    Covers None, float NaN, pandas NA/NaT and the literal string 'nan'.
    Non-scalar values (for which ``pd.isna`` returns an array) count as present.
    """
    flag = pd.isna(value)
    if isinstance(flag, (bool, np.bool_)):
        return bool(flag) or str(value) == 'nan'
    return False


def handle_missing_values(data, feature_index, country, year):
    """Estimate a missing feature value for ``country`` in ``year``.

    Strategy, in order:
      1. Return the most recent non-missing value from the previous 3 years.
      2. Otherwise fit a straight line through the non-missing values of the
         previous 10 years and extrapolate it to ``year``. If the values cannot
         be fitted (e.g. a non-numeric feature), return the most recent one.
      3. With a single value in the previous 10 years, return that value.
      4. With nothing available, return NaN.

    Args:
        data: mapping keyed by ``(country, year)``; element ``[1]`` of each
            value is the sequence of feature values.
        feature_index: position of the feature inside that sequence.
        country: country part of the lookup key.
        year: the year whose value is missing.

    Returns:
        The carried-forward or extrapolated value, or ``np.nan`` if no
        observation exists in the previous 10 years.
    """
    # Try taking the value from the last 3 years, most recent first
    for i in range(1, 4):
        past_year = year - i
        if (country, past_year) in data:
            past_data = data[(country, past_year)][1]  # the list of features
            if feature_index < len(past_data) and not _is_missing(past_data[feature_index]):
                return past_data[feature_index]

    # Otherwise, if the last 3 years are missing, linearly extrapolate from the
    # last decade. Years are collected oldest to newest.
    recent_years_data = []
    for i in range(10, 0, -1):
        past_year = year - i
        if (country, past_year) in data:
            past_data = data[(country, past_year)][1]  # the list of features
            if feature_index < len(past_data) and not _is_missing(past_data[feature_index]):
                recent_years_data.append((past_year, past_data[feature_index]))

    if len(recent_years_data) >= 2:
        years = [item[0] for item in recent_years_data]
        values = [item[1] for item in recent_years_data]
        # Linear extrapolation using a degree-1 least-squares fit
        try:
            m, c = np.polyfit(years, values, 1)
        except (np.linalg.LinAlgError, TypeError, ValueError):
            # The fit is impossible (numerical failure or non-numeric values
            # such as a category label): fall back to the last known value.
            return values[-1]
        return m * year + c
    if len(recent_years_data) == 1:
        # Only one data point in the last 10 years: use that value
        return recent_years_data[0][1]

    # Still missing after checking the last 10 years
    return np.nan

In [40]:
import pandas as pd

# Build the feature table for 2023
df_features = calc_features_year(data_output,2023)

# Print every row. option_context lifts the row limit only inside this block
# and restores the previous setting afterwards, even if printing raises.
with pd.option_context('display.max_rows', None):
    print(df_features)

                               urbanization avg birth age happiness  \
Afghanistan                       29.149118        30.366     1.721   
Albania                           61.257164        29.132     5.304   
Algeria                            71.81974        31.453     5.364   
American Samoa                      95.0648        29.375       NaN   
Andorra                            95.00263        32.518       NaN   
Angola                             66.94085        28.589    3.2835   
Antigua and Barbuda                 37.2528        28.255       NaN   
Argentina                          94.48409        29.073     6.188   
Armenia                            70.10863        27.781     5.455   
Aruba                                49.899        28.824       NaN   
Australia                            91.178        31.391     7.057   
Austria                            71.84103        31.167     6.905   
Azerbaijan                         60.19972         25.93     4.893   
Bahama