In [9]:
import numpy as np
import pandas as pd
import joblib

# key python concepts that we used in this code include:
# try excpetions          LINK: https://docs.python.org/3/tutorial/errors.html
# f-strings               LINK: https://docs.python.org/3/tutorial/inputoutput.html
# list comprehension      LINK: https://www.w3schools.com/python/python_lists_comprehension.asp
# several string methods  LINK: https://www.w3schools.com/python/python_ref_string.asp
# joblib                  LINK: https://www.analyticsvidhya.com/blog/2023/02/how-to-save-and-load-machine-learning-models-in-python-using-joblib-library/

def final_function_to_predict():
    
    # While loop which asks for consent until a user exits or responds
    while True:
        consent = input("Do you consent to using advanced population data, which may include protected information, for better accuracy? (Y/N)").strip().upper() # .upper() allows a user to type y or n / .strip() removes any extra spaces
        if consent in ["Y", "N"]:
            break # exit the while loop if input is valid
        else:
            print("Invalid input: Input should be either Y or N") # else tell the user the correct selection of inputs

    # list of valid regions. Necessary since one model doesn't use all regions but an input of Asia is still valid
    all_regions = [
        'Region_Asia',
        'Region_Central America and Caribbean',
        'Region_European Union',
        'Region_Middle East',
        'Region_North America',
        'Region_Oceania',
        'Region_Rest of Europe',
        'Region_South America'
    ]

    if consent == "Y": # if consent was given we use this optimised model
        print('Thank you, you have given consent for advanced population data.')

        model_columns = [ # we initialise a list that contains the features and dimensions that our model expects (e.g. including OHE column names)
            'const', 'Year', 'Under_five_deaths', 'Adult_mortality', 'BMI',
            'Incidents_HIV', 'GDP_per_capita', 'Schooling', 'Economy_status_Developing',
            'Region_Central America and Caribbean', 'Region_European Union',
            'Region_North America', 'Region_Oceania', 'Region_Rest of Europe',
            'Region_South America', 'log_GDP'
        ]
        
        input_features = [ # these are the columns that the user actually needs to input (we then transform this into the list above)
            'Region', 'Year', 'Under_five_deaths', 'Adult_mortality', 'BMI',
            'Incidents_HIV', 'GDP_per_capita', 'Schooling', 'Economy_status_Developing'
        ]

        row_data = {col: 0 for col in model_columns} # create a dictionary to store the features and the inputted values, set to 0 by default
        row_data['const'] = 1 # const is always 1
        # row_data looks like {'const': 1, 'Year': 0, ... }

        print("\nPlease enter the following values:\n")

        for feature in input_features: # loops through each feature that needs to be input for the model
            while True: # repeats the same feature request until a valid input is entered, signified by 'break' statement
                val = input(f"{feature}: ").strip() # for each feature, val variable is temporarily set to the input value and stripped of spaces
                
                # region requires more validation
                if feature == 'Region':
                    region_col = f"Region_{val}" # OHE region column names are Region_{name}
                    if region_col in all_regions: # checks if the input is valid regardless of if it's a column
                        if region_col in model_columns: # if yes, we check if it's a column we need
                            row_data[region_col] = 1 # if yes, we set that columns value to 1
                        break # regardless of if needed or not, we move on to the next feature
                    else:
                        print("Invalid input: Valid regions include 'Asia', 'Central America and Caribbean', 'European Union', 'Middle East', 'North America', 'Oceania', 'Rest of Europe', 'South America' [case-sensitive]") # if the input isn't valid, provide the expected selection
                
                # currently year must be a positive number. we can easily set a different range       
                elif feature == 'Year':
                    try:
                        year = int(val) # try to change input to int, throw a value error if this isn't possible
                        if year >= 2000 and year <= 2100: # check that year is between 2000 and 2100
                            row_data['Year'] = year
                            break
                        else:
                            print("Invalid input: Year must be a value between 2000 and 2100 (inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: Year must be an integer.") # we expect year to be a whole number

                elif feature == 'Under_five_deaths':
                    try:
                        ufd = float(val) # try to change input to float, throw a value error if this isn't possible
                        if ufd >= 0 and ufd <= 300: # check that under_five_deaths is in range.
                            row_data['Under_five_deaths'] = ufd
                            break
                        else:
                            print("Invalid input: Under_five_deaths must be a value between 0 and 300 (inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: Under_five_deaths must be a number.") # we expect under_five_deaths to be a number

                elif feature == 'Adult_mortality':
                    try:
                        adu_mort = float(val) # try to change input to float, throw a value error if this isn't possible
                        if adu_mort >= 0 and adu_mort <= 800: # check that adult_mortality is in range
                            row_data['Adult_mortality'] = adu_mort
                            break
                        else:
                            print("Invalid input: Adult_mortality must be a value between 0 and 800 (inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: Adult_mortality must be a number.") # we expect adult_mortality to be a number

                elif feature == 'BMI':
                    try:
                        bmi = float(val) # try to change input to float, throw a value error if this isn't possible
                        if bmi >= 10 and bmi <= 40: # check that BMI is in range
                            row_data['BMI'] = bmi
                            break
                        else:
                            print("Invalid input: BMI must be a value between 10 and 40 (inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: BMI must be a number.") # we expect BMI to be a number

                elif feature == 'Incidents_HIV':
                    try:
                        hiv = float(val) # try to change input to float, throw a value error if this isn't possible
                        if hiv >= 0 and hiv <= 50: # check that incidents_HIV is in range
                            row_data['Incidents_HIV'] = hiv
                            break
                        else:
                            print("Invalid input: Incidents_HIV must be a value between 0 and 50 (inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: Incidents_HIV must be a number.") # we expect incidents_HIV to be a number

                elif feature == 'GDP_per_capita':
                    try:
                        gdp = int(val) # try to change input to int, throw a value error if this isn't possible
                        if gdp > 0 and gdp <= 250000: # check that GDP_per_capita is in range
                            row_data['GDP_per_capita'] = gdp
                            break
                        else:
                            print("Invalid input: GDP_per_capita must be a value between 0 and 250,000 (right-inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: GDP_per_capita must be an integer.") # we expect GDP_per_capita to be a whole number

                elif feature == 'Schooling':
                    try:
                        school = float(val) # try to change input to float, throw a value error if this isn't possible
                        if school >= 0 and school <= 20: # check that schooling is in range
                            row_data['Schooling'] = school
                            break
                        else:
                            print("Invalid input: Schooling must be a value between 0 and 20 (inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: Schooling must be a number.") # we expect Schooling to be a number

                # developing is unique since it needs to be 0 or 1
                elif feature == 'Economy_status_Developing':
                    if val in ["0", "1"]: # check if input value is 0 or 1
                        row_data[feature] = int(val)
                        break
                    else:
                        print("Invalid input: Economy_status_Developing must be 0 or 1. [0 = Developed, 1 = Developing]") # tell the user the expected selection if input wasn't 0 or 1
        
        row_data['log_GDP'] = np.log(row_data['GDP_per_capita']) # create a new field with transformed GDP_per_capita

        user_df = pd.DataFrame([row_data]) # we turn the observation into a DataFrame

        scaler = joblib.load('opti_scaler.pkl') # this loads our optimised scaler, created by our Models notebook
        scale_cols = ['Schooling', 'Adult_mortality', 'Under_five_deaths', 'GDP_per_capita', 'Year', 'log_GDP', 'BMI', 'Incidents_HIV']

        def feature_eng(df, scaler, scale_cols):
            df = df.copy()
            df[scale_cols] = scaler.transform(df[scale_cols]) # we transfrom the data using the same scaling that our optimised model used to learn from the training set (StandardScaler)
            return df

        test = feature_eng(user_df, scaler, scale_cols) # we scale the relevant columns using our scaler

        model = joblib.load('opti_model.pkl') # we load our optimised model
        print(f'Predicted life-expectancy: {round(model.predict(test).iloc[0],2)}') # and predict the life expectancy using the input values

    elif consent == "N": # if consent wasn't given we use the limited model
        print('Thank you, you have NOT given consent - a minimalist model will be used which may be less accurate and robust.')

        model_columns = [ # we initialise a list that contains the features and dimensions that our model expects (including OHE column names)
            'const', 'Year', 'Region_Asia', 'Region_Central America and Caribbean', 'Region_European Union',
            'Region_Middle East', 'Region_North America', 'Region_Oceania', 'Region_Rest of Europe',
            'Region_South America', 'Under_five_deaths', 'Adult_mortality', 'GDP_per_capita',
            'Economy_status_Developing'
        ]

        input_features = [ # these are the limited selection of columns that the user actually needs to input (we then transform this into the list above)
            'Region', 'Year', 'Under_five_deaths', 'Adult_mortality','GDP_per_capita', 'Economy_status_Developing'
        ]

        row_data = {col: 0 for col in model_columns} # create a dictionary to store the features and the inputted values, set to 0 by default
        row_data['const'] = 1 # const is always 1
        # row_data looks like {'const': 1, 'Year': 0, ... }

        print("\nPlease enter the following values:\n")

        for feature in input_features: # loops through each feature that needs to be input for the model
            while True: # repeats the same feature request until a valid input is entered, signified by 'break' statement
                val = input(f"{feature}: ").strip() # for each feature, val variable is temporarily set to the input value and stripped of spaces
                
                # region requires more validation
                if feature == 'Region':
                    region_col = f"Region_{val}" # OHE region column names are Region_{name}
                    if region_col in all_regions: # checks if the input is valid regardless of if it's a column
                        if region_col in model_columns: # if yes, we check if it's a column we need
                            row_data[region_col] = 1 # if yes, we set that columns value to 1
                        break # regardless of if needed or not, we move on to the next feature
                    else:
                        print("Invalid input: Valid regions include 'Asia', 'Central America and Caribbean', 'European Union', 'Middle East', 'North America', 'Oceania', 'Rest of Europe', 'South America' [case-sensitive]") # if the input isn't valid, provide the expected selection
                
                # currently year must be a positive number. we can easily set a different range       
                elif feature == 'Year':
                    try:
                        year = int(val) # try to change input to int, throw a value error if this isn't possible
                        if year >= 2000 and year <= 2100: # check that year is between 2000 and 2100
                            row_data['Year'] = year
                            break
                        else:
                            print("Invalid input: Year must be a value between 2000 and 2100 (inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: Year must be an integer.") # we expect year to be a whole number

                elif feature == 'Under_five_deaths':
                    try:
                        ufd = float(val) # try to change input to float, throw a value error if this isn't possible
                        if ufd >= 0 and ufd <= 300: # check that under_five_deaths is in range.
                            row_data['Under_five_deaths'] = ufd
                            break
                        else:
                            print("Invalid input: Under_five_deaths must be a value between 0 and 300 (inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: Under_five_deaths must be a number.") # we expect under_five_deaths to be a number

                elif feature == 'Adult_mortality':
                    try:
                        adu_mort = float(val) # try to change input to float, throw a value error if this isn't possible
                        if adu_mort >= 0 and adu_mort <= 800: # check that adult_mortality is in range
                            row_data['Adult_mortality'] = adu_mort
                            break
                        else:
                            print("Invalid input: Adult_mortality must be a value between 0 and 800 (inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: Adult_mortality must be a number.") # we expect adult_mortality to be a number

                elif feature == 'GDP_per_capita':
                    try:
                        gdp = int(val) # try to change input to int, throw a value error if this isn't possible
                        if gdp > 0 and gdp <= 250000: # check that GDP_per_capita is in range
                            row_data['GDP_per_capita'] = gdp
                            break
                        else:
                            print("Invalid input: GDP_per_capita must be a value between 0 and 250,000 (right-inclusive).") # if the input is a number but fails this check, tell the user the required range
                    except ValueError:
                        print("Invalid input: GDP_per_capita must be an integer.") # we expect GDP_per_capita to be a whole number

                # developing is unique since it needs to be 0 or 1
                elif feature == 'Economy_status_Developing':
                    if val in ["0", "1"]: # check if input value is 0 or 1
                        row_data[feature] = int(val)
                        break
                    else:
                        print("Invalid input: Economy_status_Developing must be 0 or 1. [0 = Developed, 1 = Developing]") # tell the user the expected selection if input wasn't 0 or 1

        user_df = pd.DataFrame([row_data]) # we turn the observation into a DataFrame

        scaler = joblib.load('limited_scaler.pkl') # this loads our limited scaler, created by our Models notebook
        scale_cols = [ 'Year', 'Under_five_deaths', 'Adult_mortality', 'GDP_per_capita', 'Economy_status_Developing']

        def feature_eng(df, scaler, scale_cols):
            df = df.copy()
            df[scale_cols] = scaler.transform(df[scale_cols]) # we transfrom the data using the same scaling that our limited model used to learn from the training set (MinMaxScaler)
            return df

        test = feature_eng(user_df, scaler, scale_cols) # we scale the relevant columns using our scaler

        model = joblib.load('limited_model.pkl') # we load our limited model
        print(model.predict(test).iloc[0]) # and predict the life expectancy using the input values


final_function_to_predict() # function call

Do you consent to using advanced population data, which may include protected information, for better accuracy? (Y/N) N


Thank you, you have NOT given consent - a minimalist model will be used which may be less accurate and robust.

Please enter the following values:



Region:  gh


Invalid input: Valid regions include 'Asia', 'Central America and Caribbean', 'European Union', 'Middle East', 'North America', 'Oceania', 'Rest of Europe', 'South America' [case-sensitive]


Region:  2


Invalid input: Valid regions include 'Asia', 'Central America and Caribbean', 'European Union', 'Middle East', 'North America', 'Oceania', 'Rest of Europe', 'South America' [case-sensitive]


Region:  Asia
Year:  -1


Invalid input: Year must be a value between 2000 and 2100 (inclusive).


Year:  2015
Under_five_deaths:  -1


Invalid input: Under_five_deaths must be a value between 0 and 300 (inclusive).


Under_five_deaths:  25
Adult_mortality:  -1


Invalid input: Adult_mortality must be a value between 0 and 800 (inclusive).


Adult_mortality:  56
GDP_per_capita:  0


Invalid input: GDP_per_capita must be a value between 0 and 250,000 (right-inclusive).


GDP_per_capita:  25000
Economy_status_Developing:  -1


Invalid input: Economy_status_Developing must be 0 or 1. [0 = Developed, 1 = Developing]


Economy_status_Developing:  0


79.45445299798092
