In [1]:
import numpy as np
import pandas as pd
import joblib

# Region names the user may type. A model that has no dummy column for a
# region (see the per-model ``model_columns`` below) still accepts that region
# as input; in that case every region dummy in the row simply stays 0.
_REGION_NAMES = (
    'Asia',
    'Central America and Caribbean',
    'European Union',
    'Middle East',
    'North America',
    'Oceania',
    'Rest of Europe',
    'South America',
)

_REGION_ERROR = "Invalid input: Valid regions include 'Asia', 'Central America and Caribbean', 'European Union', 'Middle East', 'North America', 'Oceania', 'Rest of Europe', 'South America' "

# One entry per consent answer. List orders are kept exactly as the original
# code had them, because the row DataFrame's column order and the column order
# handed to the scaler both come from these lists.
_MODEL_SPECS = {
    "Y": {
        "notice": 'Thank you, you have given consent for advanced population data.',
        "model_columns": [
            'const', 'Year', 'Under_five_deaths', 'Adult_mortality', 'BMI',
            'Incidents_HIV', 'GDP_per_capita', 'Schooling', 'Economy_status_Developing',
            'Region_Central America and Caribbean', 'Region_European Union',
            'Region_North America', 'Region_Oceania', 'Region_Rest of Europe',
            'Region_South America', 'log_GDP',
        ],
        "input_features": [
            'Region', 'Year', 'Under_five_deaths', 'Adult_mortality', 'BMI',
            'Incidents_HIV', 'GDP_per_capita', 'Schooling', 'Economy_status_Developing',
        ],
        "scale_cols": [
            'Schooling', 'Adult_mortality', 'Under_five_deaths', 'GDP_per_capita',
            'Year', 'log_GDP', 'BMI', 'Incidents_HIV',
        ],
        "scaler_path": 'opti_scaler.pkl',
        "model_path": 'opti_model.pkl',
        "year_negative_msg": "Invalid input: Year can't be a negative number.",
    },
    "N": {
        "notice": 'Thank you, you have NOT given consent - a minimalist model will be used which may be less accurate and robust.',
        "model_columns": [
            'const', 'Year', 'Region_Asia', 'Region_Central America and Caribbean',
            'Region_European Union', 'Region_Middle East', 'Region_North America',
            'Region_Oceania', 'Region_Rest of Europe', 'Region_South America',
            'Under_five_deaths', 'Adult_mortality', 'GDP_per_capita',
            'Economy_status_Developing',
        ],
        "input_features": [
            'Region', 'Year', 'Under_five_deaths', 'Adult_mortality',
            'GDP_per_capita', 'Economy_status_Developing',
        ],
        "scale_cols": [
            'Year', 'Under_five_deaths', 'Adult_mortality', 'GDP_per_capita',
            'Economy_status_Developing',
        ],
        "scaler_path": 'limited_scaler.pkl',
        "model_path": 'limited_model.pkl',
        "year_negative_msg": "Invalid input: Year can't be negative.",
    },
}


def _ask_consent():
    """Prompt until the user answers Y or N and return that letter upper-cased."""
    while True:
        consent = input("Do you consent to using advanced population data, which may include protected information, for better accuracy? (Y/N)").strip().upper()
        if consent in ("Y", "N"):
            return consent
        print("Invalid input: Input should be either Y or N")


def _ask_region(model_columns):
    """Prompt for a region; return its dummy column name, or None if the model has none."""
    # Match case-insensitively so "asia" resolves to the canonical "Asia".
    canonical = {name.casefold(): name for name in _REGION_NAMES}
    while True:
        answer = input("Region: ").strip()
        name = canonical.get(answer.casefold())
        if name is None:
            print(_REGION_ERROR)
            continue
        column = f"Region_{name}"
        # A valid region without a column in this model leaves all dummies at 0.
        return column if column in model_columns else None


def _ask_year(negative_message):
    """Prompt until the user enters an integer >= 0 and return it."""
    while True:
        answer = input("Year: ").strip()
        try:
            year = int(answer)
        except ValueError:
            print("Invalid input: Year must be a positive integer.")
            continue
        if year >= 0:
            return year
        print(negative_message)


def _ask_binary(feature):
    """Prompt until the user enters exactly "0" or "1" and return it as an int."""
    while True:
        answer = input(f"{feature}: ").strip()
        if answer in ("0", "1"):
            return int(answer)
        print(f"Invalid input: {feature} must be 0 or 1.")


def _ask_non_negative(feature):
    """Prompt until the user enters a finite float >= 0 and return it."""
    while True:
        answer = input(f"{feature}: ").strip()
        try:
            number = float(answer)
        except ValueError:
            print(f"Invalid input: {feature} must be a number.")
            continue
        # float() parses "inf" and "nan"; neither is usable as a model input.
        if not np.isfinite(number):
            print(f"Invalid input: {feature} must be a number.")
            continue
        if number >= 0:
            return number
        print(f"Invalid input: {feature} must be a positive number.")


def _collect_row(spec):
    """Prompt for every input feature of ``spec`` and return the model row as a dict."""
    model_columns = spec["model_columns"]
    row = {col: 0 for col in model_columns}
    row['const'] = 1

    print("\nPlease enter the following values:\n")

    for feature in spec["input_features"]:
        if feature == 'Region':
            region_col = _ask_region(model_columns)
            if region_col is not None:
                row[region_col] = 1
        elif feature == 'Year':
            row['Year'] = _ask_year(spec["year_negative_msg"])
        elif feature == 'Economy_status_Developing':
            row[feature] = _ask_binary(feature)
        else:
            row[feature] = _ask_non_negative(feature)

    # Only models that declare a log_GDP column get the derived feature.
    if 'log_GDP' in row:
        if row['GDP_per_capita'] > 0:
            row['log_GDP'] = np.log(row['GDP_per_capita'])
        else:
            print("GDP_per_capita must be > 0 to compute log_GDP. Setting log_GDP = 0.")
            row['log_GDP'] = 0

    return row


def _scale_features(df, scaler, scale_cols):
    """Return a copy of ``df`` with ``scale_cols`` replaced by ``scaler.transform`` output."""
    scaled = df.copy()
    scaled[scale_cols] = scaler.transform(scaled[scale_cols])
    return scaled


def final_function_to_predict():
    """Interactively collect feature values and print a model prediction.

    Asks for consent (Y/N) to choose between two saved scaler/model pairs:
    'opti_scaler.pkl' + 'opti_model.pkl' for Y, 'limited_scaler.pkl' +
    'limited_model.pkl' for N. It then prompts for each input feature of the
    chosen model, re-prompting until the value passes validation, builds a
    one-row DataFrame, scales the configured columns with the loaded scaler
    and prints the first element of ``model.predict(...)``.

    Takes no arguments and returns None; all interaction is through
    ``input()`` and ``print()``. Exceptions raised by ``joblib.load``, the
    scaler or the model (for example a missing artifact file) propagate to
    the caller.
    """
    consent = _ask_consent()
    spec = _MODEL_SPECS[consent]
    print(spec["notice"])

    user_df = pd.DataFrame([_collect_row(spec)])

    # NOTE: joblib.load unpickles its input, which can execute arbitrary code.
    # Only use scaler/model artifacts that come from a trusted source.
    scaler = joblib.load(spec["scaler_path"])
    test = _scale_features(user_df, scaler, spec["scale_cols"])

    model = joblib.load(spec["model_path"])
    print(model.predict(test).iloc[0])


# Start the interactive prompt only when this file is executed directly (as a
# script or a notebook cell), so importing it does not block on input().
if __name__ == "__main__":
    final_function_to_predict()

Do you consent to using advanced population data, which may include protected information, for better accuracy? (Y/N) Y


Thank you, you have given consent for advanced population data.

Please enter the following values:



Region:  ea


Invalid input: Valid regions include 'Asia', 'Central America and Caribbean', 'European Union', 'Middle East', 'North America', 'Oceania', 'Rest of Europe', 'South America' 


Region:  4


Invalid input: Valid regions include 'Asia', 'Central America and Caribbean', 'European Union', 'Middle East', 'North America', 'Oceania', 'Rest of Europe', 'South America' 


Region:  Asia
Year:  -2015


Invalid input: Year can't be a negative number.


Year:  f


Invalid input: Year must be a positive integer.


Year:  2015
Under_five_deaths:  -3.3


Invalid input: Under_five_deaths must be a positive number.


Under_five_deaths:  g


Invalid input: Under_five_deaths must be a number.


Under_five_deaths:  3.3
Adult_mortality:  56.2
BMI:  g


Invalid input: BMI must be a number.


BMI:  -25


Invalid input: BMI must be a positive number.


BMI:  26
Incidents_HIV:  -0.09


Invalid input: Incidents_HIV must be a positive number.


Incidents_HIV:  f


Invalid input: Incidents_HIV must be a number.


Incidents_HIV:  0.09
GDP_per_capita:  27542
Schooling:  9.7
Economy_status_Developing:  g


Invalid input: Economy_status_Developing must be 0 or 1.


Economy_status_Developing:  4


Invalid input: Economy_status_Developing must be 0 or 1.


Economy_status_Developing:  0


80.6564075314059
