In [31]:
import pandas as pd
from views_pipeline_core.data.country import CountryData

# Shared CountryData instance; a later cell uses it for id-based lookups via get_country_by_id.
country_data = CountryData()

In [5]:
def read_parquet_file(file_path):
    """
    Load a Parquet file and flatten its index into ordinary columns.

    This is a best-effort reader: if loading or flattening fails for any reason,
    the error is printed and ``None`` is returned instead of raising.

    :param file_path: Location of the Parquet file to load.
    :return: The loaded DataFrame with its index reset, or ``None`` on failure.
    """
    try:
        # reset_index() turns every index level (e.g. a multi-index) into a column.
        return pd.read_parquet(file_path).reset_index()
    except Exception as err:
        print(f"Error reading the Parquet file: {err}")
        return None

In [46]:
from pathlib import Path

# Build the predictions path from the current user's home directory instead of
# hard-coding one machine's absolute path. Adjust the relative part if the
# views-platform checkout lives somewhere other than ~/Desktop.
PREDICTIONS_PATH = (
    Path.home()
    / "Desktop"
    / "views-platform"
    / "views-models"
    / "models"
    / "brown_cheese"
    / "data"
    / "generated"
    / "predictions_forecasting_20241212_142900.parquet"
)

# read_parquet_file returns None (after printing the error) if the file cannot be read.
df = read_parquet_file(PREDICTIONS_PATH)
df

Unnamed: 0,month_id,country_id,step_combined
0,539,1,0.000000
1,539,2,0.000000
2,539,3,0.000000
3,539,4,0.018723
4,539,5,0.000000
...,...,...,...
6871,574,242,0.213831
6872,574,243,0.215016
6873,574,244,0.263984
6874,574,245,3.510562


In [47]:
# {"step_combined":0.007324081924454064,"month_id":539,"country_id":1}
# {"step_combined":0,"month_id":539,"country_id":2}
# {"step_combined":0,"month_id":539,"country_id":3}
# {"step_combined":0.012125287199087953,"month_id":539,"country_id":4}
# {"step_combined":0,"month_id":539,"country_id":5}
# {"step_combined":0,"month_id":539,"country_id":6}
# {"step_combined":0.07135003879869481,"month_id":539,"country_id":7}
# {"step_combined":0,"month_id":539,"country_id":8}
# {"step_combined":1.5805093350042942,"month_id":539,"country_id":9}
# {"step_combined":0,"month_id":539,"country_id":10}
# {"step_combined":0.13174231407278902,"month_id":539,"country_id":11}
# {"step_combined":0,"month_id":539,"country_id":12}
# {"step_combined":0.22349107750196215,"month_id":539,"country_id":13}
# {"step_combined":0,"month_id":539,"country_id":14}
# {"step_combined":0,"month_id":539,"country_id":16}
# {"step_combined":2.3009641053757517,"month_id":539,"country_id":17}

# Rank countries by their mean step_combined over all forecast months and keep
# the ids of the n highest-ranked ones (highest mean first).
n = 5
top_n_countries = (
    df.groupby("country_id")["step_combined"]
    .mean()
    .nlargest(n)
    .index
    .tolist()
)
top_n_countries


[117, 79, 120, 220, 57]

In [50]:
# Inspect the country record for country_id 79, one of the ids in the top-n ranking output.
country_data.get_country_by_id(79)

Country(country_id=79, name='Nigeria', capname='Abuja', caplong='7,533333', caplat='9,083333', gwcode=475, gwsyear=1961, gwsmonth=6, gwsday=1, gweyear=2050, gwemonth=12, gweday=30, isoname='Nigeria', isonum=566, isoab='NGA', month_start=0, month_end=852, centroidlong='8.105306414519813', centroidlat='9.593960123298968', in_africa=1, in_me=0)

In [60]:
import numpy as np

# Per-month step_combined forecasts for a single country, as a list of records.
country_id = 117

step_combined_values = (
    df.loc[df["country_id"] == country_id]
    .set_index("month_id")["step_combined"]
    # undo the log transform applied to the predictions
    .apply(np.exp)
    # map +/-inf to NaN first so that every non-finite value ends up as 0
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    # back to (month_id, step_combined) rows, ordered by month
    .reset_index()
    .sort_values("month_id")
    .to_dict(orient="records")
)

step_combined_values





[{'month_id': 539, 'step_combined': 3813.2765416992247},
 {'month_id': 540, 'step_combined': 3952.3379540848427},
 {'month_id': 541, 'step_combined': 1442.3578409285797},
 {'month_id': 542, 'step_combined': 2263.686100560614},
 {'month_id': 543, 'step_combined': 1521.157557487535},
 {'month_id': 544, 'step_combined': 2344.345258267352},
 {'month_id': 545, 'step_combined': 2699.8825075230034},
 {'month_id': 546, 'step_combined': 2712.516925077413},
 {'month_id': 547, 'step_combined': 2338.712006957096},
 {'month_id': 548, 'step_combined': 3556.1736205565994},
 {'month_id': 549, 'step_combined': 3220.2023697149643},
 {'month_id': 550, 'step_combined': 4303.571355569859},
 {'month_id': 551, 'step_combined': 2919.8491190277377},
 {'month_id': 552, 'step_combined': 4217.768186459883},
 {'month_id': 553, 'step_combined': 3591.2458615885957},
 {'month_id': 554, 'step_combined': 2803.3192619032266},
 {'month_id': 555, 'step_combined': 2029.3490054381564},
 {'month_id': 556, 'step_combined': 18

In [79]:
from ingester3.ViewsMonth import ViewsMonth

def postprocess_forecasts(df, n=5):
    """
    Render a plain-text forecast table for the countries with the highest mean prediction.

    Countries are ranked by the mean of ``step_combined`` over all rows. For each of
    the top ``n`` countries, every ``month_id`` is listed with its ``step_combined``
    value after exponentiation, replacement of non-finite values by 0, and rounding
    up to a whole number.

    :param df: Predictions with ``month_id``, ``country_id`` and ``step_combined``
        available either as columns or as named index levels. The input frame is
        not modified.
    :param n: Number of top-ranked countries to include. Defaults to 5.
    :return: The formatted table as a single string.
    :raises ValueError: If ``country_id`` is neither a column nor a named index level.
    """
    # Flatten named index levels BEFORE validating: predictions indexed by
    # (month_id, country_id) carry country_id in the index, not in the columns,
    # and would otherwise be rejected by the check below.
    if any(level_name is not None for level_name in df.index.names):
        df = df.reset_index()

    # exit if country_id column is not present
    if "country_id" not in df.columns:
        raise ValueError("country_id column not found in the DataFrame. Only cm level models supported.")

    top_n_countries = df.groupby("country_id")["step_combined"].mean().nlargest(n).index.tolist()
    print(top_n_countries)

    header = f"Forecasts for top {n} countries: "
    if not top_n_countries:
        return header

    # One lookup object for all countries instead of one per loop iteration.
    country_lookup = CountryData()
    row_format = "{:<20} {:<20} {:<30}\n"
    parts = [header]

    for country_id in top_n_countries:
        forecasts = df.loc[df["country_id"] == country_id].set_index("month_id")["step_combined"]

        # convert logged values to normal values
        # NOTE(review): this inverts a plain natural log. If the predictions are
        # log1p-transformed, np.expm1 would be the correct inverse - confirm
        # against the model's target definition.
        forecasts = np.exp(forecasts)

        # replace NaN, inf, -inf with 0, then round upwards to whole numbers
        forecasts = forecasts.replace([np.inf, -np.inf], np.nan).fillna(0)
        forecasts = np.ceil(forecasts).astype(int)

        parts.append(f"\n\n{country_lookup.get_country_by_id(country_id).name}:\n")
        parts.append(row_format.format("Month", "Year", "Forecasted fatalities"))
        parts.append("-" * 70 + "\n")

        for month_id, value in forecasts.sort_index().items():
            # Build the ViewsMonth once per row and read both fields from it.
            views_month = ViewsMonth(int(month_id))
            parts.append(row_format.format(views_month.month, views_month.year, int(value)))

    return "".join(parts)
# Build the forecast table for the loaded predictions; print() is used so the
# newlines embedded in the returned string render as line breaks.
table = postprocess_forecasts(df)
print(table)

[117, 79, 120, 220, 57]
Forecasts for top 5 countries: 

Ukraine:
Month                Year                 Forecasted fatalities         
----------------------------------------------------------------------
11                   2024                 3814                          
12                   2024                 3953                          
1                    2025                 1443                          
2                    2025                 2264                          
3                    2025                 1522                          
4                    2025                 2345                          
5                    2025                 2700                          
6                    2025                 2713                          
7                    2025                 2339                          
8                    2025                 3557                          
9                    2025                 3221              