In [None]:
import pandas as pd
type = "summary_50"

type_test_df_x = pd.read_parquet(f"df/{type}_test_df_x.gzip")

type_test_df_y = pd.read_parquet(f"df/{type}_test_df_y.gzip")

joined_test_df = pd.concat([type_test_df_x, type_test_df_y], axis=1).reset_index(drop=True)

#joined = pd.concat([joined_train_df, joined_test_df], axis=0).reset_index(drop=True)

In [None]:
type_test_df_x.drop("id", axis=1, inplace=True)

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.load_model(f"models/xgb_{type}_softprob.json")

In [None]:
xgb.classes_

In [6]:
import numpy as np
import pandas as pd

def conditional_probability_predictor(model, df, conditions):
    """
    Filters the DataFrame based on a dictionary of column-value conditions,
    uses the model to predict probabilities on the filtered rows,
    and returns the average predicted probabilities.

    Args:
        model: Trained XGBoost model with predict_proba method.
        df (pd.DataFrame): The dataset to filter and predict on.
        conditions (dict): Dictionary of {column_name: value} to filter on.

    Returns:
        np.ndarray: Averaged predicted probabilities (P(Y=y | conditions))
    """
    # Apply all filtering conditions
    mask = pd.Series(True, index=df.index)
    for col, val in conditions.items():
        mask &= (df[col] == val)
    
    filtered_df = df[mask]
    
    if filtered_df.empty:
        raise ValueError("No rows match the given conditions.")
    
    # Predict probabilities
    probs = model.predict_proba(filtered_df)

    # Return average probabilities
    return np.round(np.mean(probs, axis=0),3)


In [None]:
p = conditional_probability_predictor(
    model=xgb,
    df=type_test_df_x,
    conditions={
        "topic Media Bias": True,
        "sentiment Media Bias": -1
    }
)
p

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def heatmap_conditional_probs(model, df, col1, col2, class_index=0, cmap='viridis', title=''):
    """
    Creates a heatmap of P(Y = class_index | col1 = v1, col2 = v2) for v1, v2 in {-1, 0, 1}.

    Args:
        model: Trained XGBoost model.
        df (pd.DataFrame): Dataset to filter and predict from.
        predictor_func (function): Function to compute conditional probabilities.
        col1 (str): First categorical variable (rows).
        col2 (str): Second categorical variable (columns).
        class_index (int): Class to visualize. Default = 0.
        cmap (str): Matplotlib colormap name.
    """
    values = [-1, 0, 1]
    heatmap_data = np.zeros((3, 3))  # rows = col1, cols = col2

    label_dict = {0 : "left", 1: "center", 2: "right"}

    for i, v1 in enumerate(values):
        for j, v2 in enumerate(values):
            try:
                topic1 = col1.replace("sentiment", "topic")
                topic2 = col2.replace("sentiment", "topic")
                conditions = {col1: v1, col2: v2, topic1: True, topic2: True}
                probs = conditional_probability_predictor(model, df, conditions)
                heatmap_data[i, j] = np.round(probs[class_index], 3)
            except ValueError:
                heatmap_data[i, j] = np.nan  # In case no data matches

    heatmap_df = pd.DataFrame(heatmap_data, index=values, columns=values)

    plt.figure(figsize=(6, 5))
    sns.heatmap(heatmap_df, annot=True, cmap=cmap, fmt=".3f", cbar=True)
    plt.title(f"P(Y = {label_dict[class_index]} | {col1}, {col2})")
    plt.xlabel(col2)
    plt.ylabel(col1)
    plt.tight_layout()
    plt.savefig(f"{title}_{class_index}.png")
    plt.show()


In [None]:
heatmap_conditional_probs(
    model=xgb,
    df=type_test_df_x,
    col1="sentiment Politics",
    col2="sentiment Donald Trump",
    class_index=0,
    title=f"{type}_p_vs_dt"
)
heatmap_conditional_probs(
    model=xgb,
    df=type_test_df_x,
    col1="sentiment Politics",
    col2="sentiment Donald Trump",
    class_index=1,
    title=f"{type}_p_vs_dt"
)
heatmap_conditional_probs(
    model=xgb,
    df=type_test_df_x,
    col1="sentiment Politics",
    col2="sentiment Donald Trump",
    class_index=2,
    title=f"{type}_p_vs_dt"
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def probs(model, df, col1):
    """

    Args:
        model: Trained XGBoost model.
        df (pd.DataFrame): Dataset to filter and predict from.
        predictor_func (function): Function to compute conditional probabilities.
        col1 (str): First categorical variable (rows).
        col2 (str): Second categorical variable (columns).
        class_index (int): Class to visualize. Default = 0.
        cmap (str): Matplotlib colormap name.
    """
    values = [-1, 0, 1]
    dct = {}

    for cls in [0, 1, 2]:
        for i, v1 in enumerate(values):
                dct[cls] = {}
                try:
                    topic1 = col1.replace("sentiment", "topic")
                    conditions = {col1: v1, topic1: True}
                    probs = conditional_probability_predictor(model, df, conditions)
                    dct[cls][i] = np.round(probs[class_index], 3)
                except ValueError:
                    dct[cls][i] = np.nan  # In case no data matches

    return dct

In [10]:
dct = probs(
    model = xgb,
    df = joined_test_df,
    col1 = "sentiment Politics"
)

In [11]:
dct

{0: {2: nan}, 1: {2: nan}, 2: {2: nan}}