# Markowitz Pipeline

In [None]:
from global_import import *
%matplotlib inline

# Platform Initialization

In [None]:
# Get (or create) the platform project used by this notebook.
project_tourism = dh.get_or_create_project("overtourism1")

# S3-compatible resource reachable at `endpoint_url`; the notebook reads its
# input files from the 'datalake' bucket.
endpoint_url = "http://minio:9000"
s3 = boto3.resource("s3", endpoint_url=endpoint_url)
bucket = s3.Bucket("datalake")


# Custom Parameters 

In [None]:
# Columns of the presences table that are kept.
column_presences_2_hold = ["AREA_ID", "TIME_BLOCK_ID", col_str_day_od, "PRESENCES"]

# Hour ids analysed by the pipeline; this exploratory section only uses the first one.
hour_ids = [7, 8, 10, 16, 18]
hour_id = hour_ids[0]

# None means all nations.
nation = None

# NOTE: Output column names used by compute_presences_average / aggregate_presences.
# Total presences in October without filtering by hour.
col_total_presences_oct_no_hour = f"total_presences_oct_no_hour_{hour_id}"
# Total presences (touristic months) without filtering by hour.
col_total_presences_tour_no_hour = f"total_presences_no_hour_{hour_id}"

# NOTE: To insert inside the loop over hours.
# Total presences at hour_id.
col_total_presences_tour = f"total_presences_{hour_id}"

# NOTE: Column names used when starting Markowitz.
# Total difference with respect to October.
col_tot_diff_oct = f"total_diff_oct_{hour_id}"
# Total difference with respect to October, mean 0.
col_tot_diff_october_mean_0 = f"total_diff_october_mean_0_{hour_id}"
# Total difference with respect to October, mean 0 and variance 1
# (for random matrix standardization).
col_tot_diff_october_mean_0_var_1 = f"total_diff_october_mean_0_var_1_{hour_id}"
# Name of the covariance column.
str_column_cov = "cov"

In [None]:
import itertools

# Candidate iteration dimensions and the values each one ranges over.
loops = {
    "day": ["Monday", "Tuesday"],
    "hour": [7, 8],
    "user": ["a", "b"],
    "weekday": ["is", "is_not"],
}

# Name of the pipeline case: dimension names joined by underscores.
case_pipeline = "day_hour"

# Figure out which iterables to use based on the case_pipeline name.
# Dimensions are matched as whole tokens: a substring test (`key in "day_hour"`)
# would also activate "day" for a case named "weekday".
# Limitation: a dimension name that itself contains "_" cannot be selected this way.
requested_dimensions = set(case_pipeline.split("_"))
active_dimensions = [key for key in loops if key in requested_dimensions]

# Materialise the cartesian product so it can be inspected or iterated again
# after the loop (a bare itertools.product object is exhausted after one pass).
all_combinations = list(itertools.product(*(loops[k] for k in active_dimensions)))
for combination in all_combinations:
    print(combination)
    # Keyword arguments for one pipeline run: {dimension name: value}.
    kwargs = dict(zip(active_dimensions, combination))
    print("kwargs:", kwargs)

# Extract days available

In [None]:
print("Extract list of files from bucket...")
(
    list_files_od,
    list_files_presenze,
    list_str_dates_yyyymm,
) = extract_filenames_and_date_from_bucket(bucket)

# OD file -> date string; presumably the date to skip inside that file
# (the mapping is not consumed anywhere in this section).
date_in_file_2_skip = {
    'projects/tourism/_origin/vodafone-aixpa/od-mask_202407.parquet': "2024-08-08",
    'projects/tourism/_origin/vodafone-aixpa/od-mask_202408.parquet': "2024-07-23",
}



# Presences October

In [None]:
# NOTE: Extract the null day: the presences file at index 2 of the bucket listing.
print("Initialize null day OD-presenze...")
df_presenze_null_days = extract_presences_vodafone_from_bucket(
    s3=s3,
    list_files_presenze=list_files_presenze,
    i=2,
)  # NOTE: download the file from the bucket

# Add the "is_weekday" column derived from "PERIOD_ID".
df_presenze_null_days = add_is_weekday_from_period_presenze_null_days(
    df=df_presenze_null_days,
    period_col="PERIOD_ID",
    is_weekday_col="is_weekday",
)

print("Compute the total number of presences for each AREA_ID...")
col_str_average_presences_null_day = "avg_presences_october_2024"
df_presenze_null_days = compute_presences_average(
    df_presenze_null_days=df_presenze_null_days,
    str_area_id_presenze=str_area_id_presenze,
    str_presences_presenze=str_presences_presenze,
    col_out_presenze=col_total_presences_oct_no_hour,
    is_group_by_hour=False,
    col_hour_id=str_time_block_id_presenze,
    hour_id=hour_id,
    is_nationality_markowitz_considered=False,
    nationality_col=str_country_presenze,
    nation=nation,
)


# Presences - Touristic Months

In [None]:
# Presences file at index 0 of the bucket listing.
df_presenze = extract_presences_vodafone_from_bucket(
    s3=s3,
    list_files_presenze=list_files_presenze,
    i=0,
)


In [None]:

# Stack all presences files into one frame, then aggregate it.
# First sum over all "COUNTRY" entries, then take the average.
print("Stacking the presences data...")
stack_df_presenze = concat_presences(list_files_presenze, s3, col_str_day_od, col_period_id = "PERIOD_ID")
# Distinct day labels found in the stacked frame.
list_str_days = list(stack_df_presenze[col_str_day_od].unique())
# Aggregation into `col_total_presences_tour_no_hour`, without grouping by hour.
stack_df_presenze = aggregate_presences(
                            df_presenze = stack_df_presenze,
                            col_str_day = col_str_day_od,
                            str_area_id_presenze = str_area_id_presenze,
                            str_presences_presenze = str_presences_presenze,
                            col_out_presenze = col_total_presences_tour_no_hour,
                            is_group_by_hour = False,
                            col_hour_id = str_time_block_id_presenze,
                            hour_id = hour_id,
                            is_nationality_markowitz_considered = is_nationality_markowitz_considered,
                            nationality_col = str_country_presenze,
                            nation = nation
                        )
# NOTE(review): this second call uses a different keyword set (`df`, `list_columns_groupby`, ...)
# than the call above and the one in the complete pipeline cell. It looks like a leftover
# from another `aggregate_presences` signature -- confirm it is still valid or remove it.
stack_df_presenze = aggregate_presences(df = stack_df_presenze, 
                                        list_columns_groupby = list_columns_groupby, 
                                        str_col_trips_to_be_aggregated = str_col_trips_to_be_aggregated, 
                                        str_col_name_aggregated = str_col_name_aggregated,
                                        method_aggregation = "sum")



In [None]:
# Build the Markowitz time series from the stacked presences and the October
# (null-day) baseline; the three `col_tot_diff_*` names are the output columns.
stack_df_presenze_mean_var = compute_time_series_markowitz(
    stack_df_presenze=stack_df_presenze,
    df_presenze_null_days=df_presenze_null_days,
    str_area_id_presenze=str_area_id_presenze,
    col_total_presences_tour_no_hour=col_total_presences_tour_no_hour,
    col_total_presences_oct_no_hour=col_total_presences_oct_no_hour,
    col_tot_diff_oct=col_tot_diff_oct,
    col_tot_diff_october_mean_0=col_tot_diff_october_mean_0,
    col_tot_diff_october_mean_0_var_1=col_tot_diff_october_mean_0_var_1,
)

In [None]:
# Load the zoning polygons and show `col_tot_diff_oct` on an interactive map.
_path_zoning_shp = os.path.join(
    os.getcwd(),
    "Data",
    "mavfa-fbk_AIxPA_tourism-delivery_2025.08.22-zoning",
    "fbk-aixpa-turismo.shp",
)
cities_gdf = gpd.read_file(_path_zoning_shp)
# Index the time-series frame by area id so it can be joined onto the polygons.
_series_by_area = stack_df_presenze_mean_var.to_pandas().set_index(str_area_id_presenze)
# Static alternative: .plot(col_tot_diff_oct, cmap="OrRd", legend=True) followed by
# plt.savefig("total_diff_oct.png").
cities_gdf.join(_series_by_area, on=str_area_id_presenze).explore(col_tot_diff_oct, cmap="OrRd")


# Compute Correlation Matrix from Time-Series

In [None]:
# Correlation matrix (in DataFrame form) of the standardized time series; values are
# written to the `str_column_cov` column.
# NOTE(review): the complete pipeline cell calls this helper with `col_return=` instead of
# `col_tot_diff_october_mean_0_var_1=` -- confirm which keyword the current signature accepts.
correlation_df = compute_correlation_matrix_df_from_time_series(stack_df_presenze_mean_var = stack_df_presenze_mean_var,
                                                   str_area_id_presenze = str_area_id_presenze,
                                                   col_str_day_od = col_str_day_od,
                                                   col_tot_diff_october_mean_0_var_1 = col_tot_diff_october_mean_0_var_1,
                                                   str_column_cov = str_column_cov)    

# RMT Clean Matrix

# Complete Markowitz Pipeline

In [None]:

from global_import import *
%matplotlib inline
# Self-contained setup for the complete pipeline: it repeats the platform initialization
# and data loading of the exploratory cells so this cell can run on its own.
project_tourism = dh.get_or_create_project("overtourism1")
endpoint_url="http://minio:9000"

# S3-compatible resource at `endpoint_url`; inputs are read from the 'datalake' bucket.
s3 = boto3.resource('s3',
                    endpoint_url=endpoint_url)

bucket = s3.Bucket('datalake')

# Zoning polygons (re-read inside the loop below for every day type).
cities_gdf = gpd.read_file(os.path.join(os.getcwd(),"Data","mavfa-fbk_AIxPA_tourism-delivery_2025.08.22-zoning","fbk-aixpa-turismo.shp"))

print("Extract list of files from bucket...")
list_files_od, list_files_presenze, list_str_dates_yyyymm = extract_filenames_and_date_from_bucket(bucket)
# OD file -> date string; presumably the date to skip in that file (not consumed in this cell).
date_in_file_2_skip = {'projects/tourism/_origin/vodafone-aixpa/od-mask_202407.parquet':"2024-08-08",
                    'projects/tourism/_origin/vodafone-aixpa/od-mask_202408.parquet':"2024-07-23"}
# NOTE: Extract the null day (presences file at index 2 of the listing)
print("Initialize null day OD-presenze...")
df_presenze_null_days = extract_presences_vodafone_from_bucket(s3 = s3,
                                                            list_files_presenze = list_files_presenze, 
                                                            i = 2)  # NOTE: download the file from the bucket
# Add the weekday flag column `col_str_is_week`, derived from the period column.
df_presenze_null_days = add_is_weekday_from_period_presenze_null_days(df = df_presenze_null_days, 
                                                                      period_col= str_period_id_presenze, 
                                                                      is_weekday_col= col_str_is_week)


# NOTE: Extract the stack of presences -> the overtouristic dataframe for presences
print("Stacking the presences data...")
stack_df_presenze_original = concat_presences(list_files_presences = list_files_presenze, 
                                    s3 = s3, 
                                    col_str_day_od = col_str_day_od, 
                                    col_period_id = str_period_id_presenze)

# NOTE: Add holiday column (the loop below filters both frames on `col_str_is_week`)
stack_df_presenze_original = add_holiday_columun_df_presenze(stack_df_presenze = stack_df_presenze_original, 
                                                    col_str_day_od = col_str_day_od,
                                                    public_holidays = public_holidays,
                                                    col_str_is_week = col_str_is_week)

# Complete Markowitz pipeline: for every day type in `week_days` and every hour in
# `hour_ids`, build the return time series, clean its correlation matrix with RMT,
# extract a portfolio and plot it; once per day type, export the maps and a description.

# When True the "<col_tot_diff_oct>_over_std" return column is used and the
# Marchenko-Pastur limits are computed without an explicit sigma.
is_covariance_standardized = False

# Path of the zoning shapefile. The file is re-read for every day type so the
# per-hour columns merged into `cities_gdf` start from a clean frame.
path_zoning_shapefile = os.path.join(
    os.getcwd(),
    "Data",
    "mavfa-fbk_AIxPA_tourism-delivery_2025.08.22-zoning",
    "fbk-aixpa-turismo.shp",
)

# NOTE: Length of the time series. It only depends on the unfiltered stack, so it is
# computed once instead of once per (day type, hour).
# NOTE(review): this counts the days of the *unfiltered* stack, while the return series
# below only contains the days matching `is_weekday` -- confirm q = T/N should use it.
list_str_days = list(stack_df_presenze_original[col_str_day_od].unique())

for is_weekday in week_days:
    cities_gdf = gpd.read_file(path_zoning_shapefile)
    columns_portfolio = []
    for hour_id in hour_ids:
        # NOTE: Filter by weekday / holiday. Done for every hour because both names are
        # re-bound to the aggregated frames further down in this iteration.
        stack_df_presenze_week_day = stack_df_presenze_original.filter(pl.col(col_str_is_week) == is_weekday)
        df_presenze_null_days_week_day = df_presenze_null_days.filter(pl.col(col_str_is_week) == is_weekday)

        # NOTE: Column names, all suffixed with the hour id.
        # Total presences in October without filtering by hour.
        col_total_presences_oct_no_hour = f"total_presences_oct_no_hour_{hour_id}"
        # Total presences (touristic months) without filtering by hour.
        col_total_presences_tour_no_hour = f"total_presences_no_hour_{hour_id}"
        # Total presences at hour_id.
        col_total_presences_tour = f"total_presences_{hour_id}"
        # NOTE: When starting Markowitz.
        # Total difference with respect to October.
        col_tot_diff_oct = f"total_diff_oct_{hour_id}"
        # Total difference with respect to October, mean 0.
        col_tot_diff_october_mean_0 = f"total_diff_october_mean_0_{hour_id}"
        # Total difference with respect to October, mean 0 and variance 1
        # (for random matrix standardization).
        col_tot_diff_october_mean_0_var_1 = f"total_diff_october_mean_0_var_1_{hour_id}"
        str_column_cov = f"cov_{hour_id}"
        str_col_portfolio = f"portfolio_{hour_id}"
        # NOTE: Add the column for the portfolio associated to the hour.
        columns_portfolio.append(str_col_portfolio)
        col_tot_diff_october_var_1 = f"total_diff_october_var_1_{hour_id}"
        col_expected_return = f"expected_return_{hour_id}"
        col_std = f"std_day_{hour_id}"

        ########################################################
        ############### NULL DAY INITIALIZATION ################
        ########################################################

        print("Compute the total number of presences for each AREA_ID...")
        df_presenze_null_days_week_day = compute_presences_average(
            df_presenze_null_days=df_presenze_null_days_week_day,
            str_area_id_presenze=str_area_id_presenze,
            str_presences_presenze=str_presences_presenze,
            col_out_presenze=col_total_presences_oct_no_hour,
            is_group_by_hour=False,
            col_hour_id=str_time_block_id_presenze,
            hour_id=hour_id,
            is_nationality_markowitz_considered=False,
            nationality_col=str_country_presenze,
            nation=nation,
        )

        #############################################################
        ############ PREPROCESS RAW DATA TO MARKOWITZ ###############
        #############################################################
        # NOTE: Aggregate by nation and different groups -> this defines the dataframe
        # that associates a count to each t in T. Without it there are several groups
        # of presences for each day.
        stack_df_presenze_week_day = aggregate_presences(
            df_presenze=stack_df_presenze_week_day,
            col_str_day=col_str_day_od,
            str_area_id_presenze=str_area_id_presenze,
            str_presences_presenze=str_presences_presenze,
            col_out_presenze=col_total_presences_tour_no_hour,
            is_group_by_hour=True,
            col_hour_id=str_time_block_id_presenze,
            hour_id=hour_id,
            is_nationality_markowitz_considered=is_nationality_markowitz_considered,
            nationality_col=str_country_presenze,
            nation=nation,
        )
        # NOTE: Name of the return column used by all the steps below.
        if is_covariance_standardized:
            column_return = col_tot_diff_oct + "_over_std"
        else:
            column_return = col_tot_diff_oct

        # NOTE: Build the return column from the aggregated stack and the null-day baseline.
        stack_df_presenze = compute_starting_risk_column_from_stack_df(
            df_presenze_null_days=df_presenze_null_days_week_day,
            stack_df_presenze=stack_df_presenze_week_day,
            str_area_id_presenze=str_area_id_presenze,
            col_total_presences_tour_no_hour=col_total_presences_tour_no_hour,
            col_total_presences_oct_no_hour=col_total_presences_oct_no_hour,
            col_return=column_return,
        )
        # NOTE: Compute the expected return. It is not for testing: it chooses the
        # return from the expectation.
        df_mean = compute_expected_return_from_stack_df(
            stack_df_presenze=stack_df_presenze,
            col_return=column_return,  # NOTE: in Markowitz this is expected to be col_tot_diff_oct
            col_expected_return=col_expected_return,
            str_area_id_presenze=str_area_id_presenze,
            is_return_standardized=is_covariance_standardized,
            col_std=col_std,
        )
        # NOTE: We define the expected return here -> other approaches could be inserted here.
        expected_return = df_mean[col_expected_return].to_numpy()
        # NOTE: Standardize the return time series.
        stack_df_presenze_mean_var = standardize_return_stack_df(
            stack_df_presenze=stack_df_presenze,
            df_mean=df_mean,
            col_return=column_return,
            str_area_id_presenze=str_area_id_presenze,
            is_standardize_return=is_covariance_standardized,
            col_std=col_std,
        )

        # NOTE: Compute correlation matrix X^T X (Wishart) in df format.
        correlation_df = compute_correlation_matrix_df_from_time_series(
            stack_df_presenze_mean_var=stack_df_presenze_mean_var,
            str_area_id_presenze=str_area_id_presenze,
            col_str_day_od=col_str_day_od,
            col_return=column_return,
            str_column_cov=str_column_cov,
        )
        # NOTE: Extract area_to_index and index_to_area.
        area_to_index, index_to_area = get_area_id_to_idx_mapping(
            cov_df=correlation_df,
            str_area_id_presenze=str_area_id_presenze,
        )

        ##############################################################
        ####################### RMT Clean Matrix #####################
        ##############################################################

        # NOTE: Compute q = T/N.
        q = from_areas_and_times_to_q(
            area_to_index=area_to_index,
            list_str_days=list_str_days,
        )

        # NOTE: Transform covariance DataFrame into numpy matrix using the area mapping.
        cov_matrix_numpy = from_df_correlation_to_numpy_matrix(
            cov_df=correlation_df,
            str_area_id_presenze=str_area_id_presenze,
            str_column_cov=str_column_cov,
            area_to_index=area_to_index,
        )
        # NOTE: Clean the correlation matrix using RMT.
        C_clean, eigvals_clean, eigvecs = rmt_clean_correlation_matrix(
            C=cov_matrix_numpy,
            q=q,
            is_bulk_mean=True,
        )

        # NOTE: Compute MP limits and mask of significant eigenvalues. For
        # non-standardized returns sigma is the mean of the per-area standard deviations.
        if is_covariance_standardized:
            sigma = None
        else:
            sigma = np.mean(df_mean[col_std].to_numpy())
        lambda_minus, lambda_plus, mask_eigvals = compute_MP_limits_and_mask(
            eigvals_clean,
            q,
            is_covariance_standardized=is_covariance_standardized,
            sigma=sigma,
        )

        # plot_pastur(eigvals_clean)

        ##############################################################
        #################### Markowitz procedure #####################
        ##############################################################
        # NOTE: Extract portfolio weights from significant eigenpairs.
        portfolio_weights = extract_portfolio_from_eigenpairs(
            C_clean=C_clean,
            eigvals_clean=eigvals_clean,
            eigvecs=eigvecs,
            expected_return=expected_return,
            sum_w=1,
            is_normalize_portfolio=True,
        )
        # NOTE: Map portfolio weights to cities_gdf and plot -> compute the portfolio.
        cities_gdf = map_portfolio_numpy_to_cities_gdf(
            cities_gdf=cities_gdf,
            portfolio_weights=portfolio_weights,
            index_to_area=index_to_area,
            str_area_id_presenze=str_area_id_presenze,
            str_col_portfolio=str_col_portfolio,
        )
        # Attach this hour's expected-return / std columns to the polygons.
        cities_gdf = cities_gdf.merge(df_mean.to_pandas(), on=str_area_id_presenze)
        fig, ax = plt.subplots(1, 3, figsize=(10, 10))
        plot_polygons_and_with_scalar_field(cities_gdf, str_col_portfolio, ax[0], fig, title=f"portfolio {hour_id} {is_weekday}")
        plot_polygons_and_with_scalar_field(cities_gdf, col_expected_return, ax[1], fig, title="<oct - day> ")
        plot_polygons_and_with_scalar_field(cities_gdf, col_std, ax[2], fig, title="standard deviation day")
        # pyplot.show() takes no figure argument: it shows all open figures, and a
        # positional figure is rejected or misinterpreted depending on the backend.
        plt.show()
        plt.close(fig)

    # NOTE: Plot portfolio map (one layer per hour) for this day type.
    path_base_portfolio = os.path.join(os.getcwd(), "Output", f"{is_weekday}")
    # NOTE(review): `hour_id` here is the value left by the last inner iteration, although
    # the map contains every hour in `columns_portfolio`. The file name is kept unchanged
    # so existing consumers of the output are not broken.
    path_save_portfolio = os.path.join(path_base_portfolio, f"portfolio_map_{hour_id}.html")
    os.makedirs(path_base_portfolio, exist_ok=True)
    plot_portforlio_map_multiple_layers(
        cities_gdf=cities_gdf,
        str_area_id_presenze=str_area_id_presenze,
        columns_to_plot=columns_portfolio,
        str_col_comuni_name=str_col_comuni_name,
        save_path=path_save_portfolio,
    )
    cities_gdf.to_file(os.path.join(path_base_portfolio, "goedataframe_input_plots_markowitz.geojson"))
    # Plain-text description of the variables needed to reproduce the plots.
    informative_text_output = (
        "Explicit description variables needed for plot: "
        f"\nstr_area_id_presenze = {str_area_id_presenze}\n columns_to_plot: "
        + "".join(f"{col}, " for col in columns_portfolio)
        + f"\nstr_col_comuni_name: {str_col_comuni_name}"
    )
    with open(os.path.join(path_base_portfolio, "output_variable_description.txt"), "w", encoding="utf-8") as f:
        f.write(informative_text_output)

In [None]:
# Relation between expected return, standard deviation and portfolio weight in
# cities_gdf, using the column names left by the last (day type, hour) iteration.
# First figure: 3D scatter coloured by portfolio weight.
# Second figure: portfolio weight against expected return.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- needed by older matplotlib to register the "3d" projection
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
sc = ax.scatter(
    cities_gdf[col_expected_return],  # x
    cities_gdf[col_std],              # y
    cities_gdf[str_col_portfolio],    # z
    c=cities_gdf[str_col_portfolio],
    cmap='viridis',
)
# Axis labels must follow the order of the data passed to scatter above:
# x is the expected return and y is the standard deviation.
ax.set_xlabel(col_expected_return)
ax.set_ylabel(col_std)
ax.set_zlabel(str_col_portfolio)
plt.colorbar(sc, label=str_col_portfolio)
# pyplot.show() takes no figure argument.
plt.show()
plt.close(fig)

fig1, ax1 = plt.subplots(1, 1, figsize=(6, 6))
ax1.scatter(cities_gdf[col_expected_return], cities_gdf[str_col_portfolio])
ax1.set_xlabel(col_expected_return)
ax1.set_ylabel(str_col_portfolio)
plt.show()
plt.close(fig1)



In [None]:
# Map of the aggregated presences left by the last (day type, hour) iteration of the
# pipeline loop. The join key and the plotted column come from the pipeline variables,
# so the cell keeps working when `hour_ids` changes (the literal
# "total_presences_no_hour_18" only exists when the last hour is 18).
# NOTE(review): assumes str_area_id_presenze == "AREA_ID", the literal used before -- confirm.
cities_gdf.merge(stack_df_presenze_week_day.to_pandas(), on=str_area_id_presenze).plot(col_total_presences_tour_no_hour, cmap="OrRd")


In [None]:
# Histogram of the components of every eigenvector whose eigenvalue is above lambda_plus.
# NOTE(review): iterating `eigvecs` walks its rows; confirm that
# rmt_clean_correlation_matrix returns one eigenvector per row.
for i, eigvect in enumerate(eigvecs):
    if not (eigvals_clean[i] > lambda_plus):
        continue
    plt.hist(eigvect, label=f"eigenvector {i}")
plt.legend()


# Check Gaussianity of the Component Distribution for Low-Eigenvalue Eigenvectors

In [None]:
from scipy.optimize import curve_fit


def _gaussian_profile(x, amplitude, mu, sigma):
    """Return amplitude * exp(-(x - mu)^2 / (2 sigma^2)), the model fitted below."""
    return amplitude * np.exp(-((x - mu) ** 2) / (2.0 * sigma**2))


# For every eigenvector whose eigenvalue is below lambda_minus, fit a Gaussian to the
# histogram of its components and collect the fitted mean and standard deviation.
# NOTE(review): iterating `eigvecs` walks its rows; confirm that
# rmt_clean_correlation_matrix returns one eigenvector per row.
mus = []
sigmas = []
for i, eigvect in enumerate(eigvecs):
    if not (eigvals_clean[i] < lambda_minus):
        continue
    n, bins = np.histogram(eigvect, bins=30, density=True)
    # Densities refer to whole bins, so fit against bin centres (left edges would
    # bias the fitted mean by half a bin width).
    centers = 0.5 * (bins[:-1] + bins[1:])
    # Data-driven starting point; curve_fit's default (all ones) is far from the
    # scale of the data and makes the fit fragile.
    initial_guess = (n.max(), np.mean(eigvect), np.std(eigvect) or 1.0)
    try:
        popt, pcov = curve_fit(_gaussian_profile, centers, n, p0=initial_guess)
    except (RuntimeError, ValueError) as fit_error:
        # RuntimeError: the least-squares minimisation failed;
        # ValueError: invalid input data (e.g. NaN). Skip this vector but say so.
        print(f"Eigenvector {i}: Gaussian fit failed ({fit_error})")
        continue
    mu = popt[1]
    sigma = abs(popt[2])  # the model depends on sigma**2 only, so the sign is arbitrary
    plt.plot(centers, _gaussian_profile(centers, *popt), label=f"fit {i}")
    plt.hist(eigvect, bins=30, density=True, alpha=0.5)
    plt.legend()
    # Store the standard deviation itself, so `sigmas` matches the printed value.
    sigmas.append(sigma)
    mus.append(mu)
    print(f"Eigenvector {i}: mu = {mu:.4f}, sigma = {sigma:.4f}")
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
# The third positional argument of Axes.scatter is the marker size, so the marker
# style has to be passed by keyword.
ax.scatter(mus, sigmas, marker='o')
# --- IGNORE ---


# Markowitz from Presences

In [None]:
# Target return passed to the Markowitz optimisation below.
target_return_markowitz = 1

# Run the Markowitz portfolio on the standardized time series; no price column is used.
# NOTE(review): after the complete pipeline cell, `stack_df_presenze_mean_var` is built from
# `column_return`; confirm that it still contains `col_tot_diff_october_mean_0_var_1`.
markowitz_portfolio(df = stack_df_presenze_mean_var, 
                    col_id = str_area_id_presenze,
                    col_date = col_str_day_od,
                    col_price = None,
                    col_return = col_tot_diff_october_mean_0_var_1,
                    col_portfolio_weight = "portfolio_weight",
                    target_return = target_return_markowitz)


In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Hard-coded ratio used for the Marchenko-Pastur edges below.
# NOTE(review): presumably T/N for one specific run (142 time points, 61 areas) -- confirm.
# These assignments overwrite the `q`, `lambda_plus` and `lambda_minus` left by the pipeline loop.
q = 142 / 61
_sqrt_inverse_q = np.sqrt(1 / q)
lambda_plus = (1 + _sqrt_inverse_q) ** 2
lambda_minus = (1 - _sqrt_inverse_q) ** 2


def marchenko_pastur_pdf(x, q, sigma=1.0):
    """Marchenko-Pastur probability density function.

    Uses the convention q = T/N, under which the support edges are
    ``sigma**2 * (1 +/- sqrt(1/q))**2`` and the density on the support is
    ``q / (2 * pi * sigma**2 * x) * sqrt((lambda_plus - x) * (x - lambda_minus))``.
    The prefactor carries ``q`` in the numerator: with ``1/q`` instead, the
    density would integrate to ``1/q**2`` rather than 1.

    Args:
        x: Scalar or array-like of points at which to evaluate the density.
        q: Ratio T/N, must be positive. For q < 1 only the continuous part of
            the distribution is returned (the point mass at zero is not included).
        sigma: Standard deviation of the matrix entries.

    Returns:
        A float for scalar ``x``, otherwise an array with the shape of ``x``.
        The value is 0 outside the open interval (lambda_minus, lambda_plus).

    Raises:
        ValueError: If ``q`` is not positive.
    """
    if q <= 0:
        raise ValueError("q must be positive")
    variance = sigma**2
    edge = np.sqrt(1 / q)
    lambda_plus = variance * (1 + edge) ** 2
    lambda_minus = variance * (1 - edge) ** 2

    points = np.atleast_1d(np.asarray(x, dtype=float))
    density = np.zeros_like(points)
    # Evaluate only inside the support: outside it the square root is undefined.
    inside = (points > lambda_minus) & (points < lambda_plus)
    supported = points[inside]
    density[inside] = (q / (2 * np.pi * variance * supported)) * np.sqrt(
        (lambda_plus - supported) * (supported - lambda_minus)
    )
    if np.ndim(x) == 0:
        return float(density[0])
    return density.reshape(np.shape(x))
def plot_pastur(eigvals_clean, q_ratio=None, sigma=1.0):
    """Plot the eigenvalue histogram against the Marchenko-Pastur density.

    Draws, on a new figure, the normalised histogram of ``eigvals_clean``, the
    Marchenko-Pastur density and two vertical lines at the support edges.

    Args:
        eigvals_clean: Eigenvalues to histogram.
        q_ratio: Ratio q = T/N used for the density and its edges. When None, the
            notebook-level variable ``q`` is used, as in the single-argument call.
        sigma: Standard deviation passed to the density and used for the edges.
    """
    if q_ratio is None:
        # Fall back to the notebook-level ratio so plot_pastur(eigvals) keeps working.
        q_ratio = q
    # Derive the edges from the same ratio as the curve, instead of reading notebook
    # globals that other cells rebind.
    edge = np.sqrt(1 / q_ratio)
    edge_plus = sigma**2 * (1 + edge) ** 2
    edge_minus = sigma**2 * (1 - edge) ** 2

    fig1, ax1 = plt.subplots(1, 1, figsize=(6, 6))
    ax1.hist(eigvals_clean, bins=50, density=True, alpha=0.5, label="Eigenvalues Clean", color="orange")
    x = np.linspace(0.01, 15, 1000)
    # Evaluated point by point so this works with a scalar-only density function too.
    y = [marchenko_pastur_pdf(xi, q_ratio, sigma) for xi in x]
    ax1.plot(x, y, label="Marchenko-Pastur PDF", color="green")
    ax1.axvline(x=edge_plus, color='red', linestyle='--', label=r'$\lambda_{+}$')
    ax1.axvline(x=edge_minus, color='green', linestyle='--', label=r'$\lambda_{-}$')
    ax1.set_title("Eigenvalues Distribution")
    ax1.legend()


# Heatmap of the RMT-cleaned matrix on the right panel; the left panel stays empty
# while the heatmap of the raw matrix is disabled.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))
#sns.heatmap(cov_matrix_numpy, ax=ax[0])
sns.heatmap(C_clean, ax=ax[1])

# Normalised histogram of the portfolio weights.
fig2, ax2 = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))
ax2.hist(
    portfolio_weights,
    bins=20,
    density=True,
    alpha=0.5,
    label="Portfolio Weights",
    color="blue",
)
ax2.set_title("Portfolio Weights")



In [None]:
# Number of portfolio weights left by the last pipeline iteration.
len(portfolio_weights)

# Merge GeoDf and Plot

In [None]:
# Re-read the zoning polygons from disk, discarding the columns added by the pipeline loop.
cities_gdf = gpd.read_file(os.path.join(os.getcwd(),"Data","mavfa-fbk_AIxPA_tourism-delivery_2025.08.22-zoning","fbk-aixpa-turismo.shp"))
# Display the frame as the cell output.
cities_gdf


In [None]:
# Attach the portfolio weights to cities_gdf as a "portfolio" column and plot it.
# Expects `index_to_area` to be a dict {index: area_id} and `portfolio_weights` to be
# indexable by those indices; `str_area_id_presenze` names the area identifier column
# of cities_gdf (e.g., "AREA_ID").

import geopandas as gpd

if not isinstance(cities_gdf, gpd.GeoDataFrame):
    raise TypeError("cities_gdf must be a GeoDataFrame")

# Operate on an explicit copy so the assignment below never targets a view/slice
# (avoids SettingWithCopyWarning).
cities_gdf = cities_gdf.copy()

# area_id -> weight lookup, rebuilt here so the cell does not depend on earlier state.
area_to_weight = {
    area: float(portfolio_weights[idx])
    for idx, area in index_to_area.items()
}

# Areas without an entry in the lookup get NaN.
mapped = cities_gdf[str_area_id_presenze].map(area_to_weight)
cities_gdf.loc[:, "portfolio"] = mapped

# Report areas that received no weight (not present in index_to_area).
_missing = cities_gdf["portfolio"].isna()
n_missing = int(_missing.sum())
if n_missing > 0:
    _examples = cities_gdf.loc[_missing, str_area_id_presenze].head().tolist()
    print(
        f"Warning: {n_missing} areas in cities_gdf had no portfolio weight mapping.",
        "Examples:",
        _examples,
    )
    # To set missing weights to 0.0 instead of NaN:
    # cities_gdf.loc[:, "portfolio"] = cities_gdf["portfolio"].fillna(0.0)

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
cities_gdf.plot(ax=ax, column="portfolio", cmap="OrRd", legend=True)
fig.savefig("portfolio_map.png")

In [None]:
# Plot/rendering setup for VS Code notebooks
# - Force the matplotlib inline backend
# - Request "retina" figure formats when matplotlib_inline is available
# - Import display() for HTML maps (folium/geopandas.explore)
from IPython import get_ipython
# get_ipython() returns None outside an IPython session; in that case nothing is configured.
ip = get_ipython()
if ip is not None:
    ip.run_line_magic("matplotlib", "inline")
    try:
        from matplotlib_inline.backend_inline import set_matplotlib_formats
        set_matplotlib_formats("retina")
    except Exception:
        # Best effort: the retina setting is optional, so any failure is ignored on purpose.
        pass

import matplotlib.pyplot as plt
from IPython.display import display

In [None]:
# Explicitly display the interactive map (folium under the hood), coloured by the
# "portfolio" column, with the area id shown as tooltip.
_map = cities_gdf.explore(column="portfolio", cmap="viridis", tooltip=str_area_id_presenze)
display(_map)

# Show Covariance Matrix Heatmap

In [None]:
# Seaborn heatmap of the (uncleaned) matrix built by the pipeline loop.
import seaborn as sns

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cov_matrix_numpy,
    cmap='viridis',
    cbar=True,
    square=True,
    annot=False,
    fmt='.1f',
    ax=ax,
)
ax.set_title('Covariance Matrix Heatmap')
ax.set_xlabel('Area Index')
ax.set_ylabel('Area Index')
fig.tight_layout()
plt.show()

In [None]:
# Column names for risk, return and correlation-with-distance-matrix, each keyed by
# the user profile and the two time labels (user_profile, str_t, str_t1 come from earlier cells)
str_col_risk_presences, str_col_return_presences, str_col_corr_dist_matrix = (
    f"risk_{user_profile}_t_{str_t}_{str_t1}",
    f"return_{user_profile}_t_{str_t}_{str_t1}",
    f"corr_{user_profile}_t_{str_t}_{str_t1}",
)


In [None]:

# NOTE: Join the presence data onto the city geometries via the shared area-id column.
# Only the first presence row per area id is kept, so the left merge cannot
# multiply rows of cities_gdf.
df_presenze_unique = df_presenze_pd.drop_duplicates(
    subset=[str_area_id_presenze],
    keep='first',
)
cities_gdf = cities_gdf.merge(
    df_presenze_unique,
    how="left",
    on=str_area_id_presenze,
)
# Row index of the merged cities_gdf -> area code
map_idx_cities_gdf_2_area_code = dict(cities_gdf[str_area_id_presenze].items())
# NOTE: The area code is the identifier used by df_presenze and df_od (not by the
# gravity-generated flows); per the original note, the helper adds it for the origin
# and destination columns of the distance matrix.
df_distance_matrix = add_column_area_code_OD_df_distance(
    df_distance_matrix,
    map_idx_cities_gdf_2_area_code,
    str_col_origin=str_col_origin,
    str_col_destination=str_col_destination,
    str_area_code_origin_col=str_area_code_origin_col,
    str_area_code_destination_col=str_area_code_destination_col,
)
# NOTE: Time windows to analyse, given as [start_hour, end_hour] pairs.
# A start of 25 is handled specially by the loop below: no time vector is built and
# is_fluxes_hourly is set to False (presumably a marker for non-hourly fluxes; TODO confirm).
list_time_intervals = [[7,8],[8,9],[10,11],[16,17],[18,19],[25,26]]
# NOTE: Process each time window in turn
for time_interval in list_time_intervals:
    print("Time interval: ", time_interval)
    int_hour_start_window_interest = time_interval[0]                                                                                                                   # start hour of the window
    int_hour_end_window_interest = time_interval[1]                                                                                                                     # end hour of the window
    int_min_aggregation_OD = 60                                                                                                                                         # aggregation step in minutes
    # NOTE: Profile user
    # NOTE: This is used to filter the buses
    if int_hour_start_window_interest != 25:                                                                                                                   # regular window: build the time vector
        # NOTE: change the time according to the source
        # TimedeltaIndex from the start hour to the end hour in steps of int_min_aggregation_OD minutes
        time_vector_OD = pd.timedelta_range(start = f"{int_hour_start_window_interest}h",
                                            end = f"{int_hour_end_window_interest}h",
                                            freq = f"{int_min_aggregation_OD}min")
        # Same grid expressed in hours; np.linspace returns floats, so labels render like "7.0"
        str_time_vector = np.linspace(int_hour_start_window_interest,int_hour_end_window_interest,len(time_vector_OD))
        is_fluxes_hourly = True                                                                                                                         # NOTE: fluxes are generated hourly
    else:
        # NOTE(review): time_vector_OD and str_time_vector are NOT reassigned in this branch,
        # so the code below reuses the values left over from the previous iteration
        # ([18,19] with the list above). Confirm whether this is intended.
        is_fluxes_hourly = False                                                                                                                         # NOTE: fluxes are not generated hourly
    # user_profile is defined outside this cell
    print(f"User profile: {user_profile} - Time vector: {time_vector_OD}")
    # Walk over consecutive pairs (t, t+1) of the time vector
    for t_idx in range(len(time_vector_OD)-1):
        # Initialize variables that are relevant for the time analysis (time and user profile)
        str_t = str_time_vector[t_idx]                                                                                                                              # start of the step, float hours
        str_t1 = str_time_vector[t_idx + 1]                                                                                                                         # end of the step, float hours
        t_i = time_vector_OD[t_idx]                                                                                                                                 # start of the step, element of the TimedeltaIndex
        t_i1 = time_vector_OD[t_idx+1]                                                                                                                              # end of the step, element of the TimedeltaIndex
        print(f"Time: {str_t} - {str_t1}")
        # Parameters - df_presences: column names keyed by user profile and the two time labels
        str_col_risk_presences = f"risk_{user_profile}_t_{str_t}_{str_t1}"   
        str_col_return_presences = f"return_{user_profile}_t_{str_t}_{str_t1}"
        str_col_corr_dist_matrix = f"corr_{user_profile}_t_{str_t}_{str_t1}"                                                                                                  # column name for the correlation with the distance matrix
        # NOTE: Output goes to the fixed "Diffusion 3" subfolder of config[str_dir_output];
        # the path does not depend on the loop variables.
        str_dir_output_date = os.path.join(config[str_dir_output],"Diffusion 3")
        Path(str_dir_output_date).mkdir(parents=True, exist_ok=True)                                                                                                # create the directory if it does not exist
        # TODO: extract the day from the file name
