In [21]:
def is_deepseek_thinking_model(model_id: str) -> bool:
    """Return True when *model_id* matches the DeepSeek thinking heuristics.

    The id matches if it contains the substring ``"reason"``, or if it
    contains both ``"r1"`` and ``"deepseek"``. All checks are plain,
    case-sensitive substring tests.
    """
    if "reason" in model_id:
        return True
    return all(marker in model_id for marker in ("r1", "deepseek"))

In [22]:
def is_thinking_model(model_id: str) -> bool:
    """Return True when *model_id* matches any known thinking-model pattern.

    A match is any of:
    - ``is_deepseek_thinking_model(model_id)``
    - ``is_anthropic_thinking_model(model_id)``
    - the id contains both ``"gemini"`` and ``"thinking"``
    - the id contains both ``"qwen"`` and ``"qwq"``
    """
    if is_deepseek_thinking_model(model_id) or is_anthropic_thinking_model(model_id):
        return True
    # Google: a Gemini id that also carries the "thinking" marker.
    if "gemini" in model_id and "thinking" in model_id:
        return True
    # Qwen: only the QwQ variants count.
    return "qwen" in model_id and "qwq" in model_id

In [23]:
def is_anthropic_thinking_model(model_id: str) -> bool:
    """Return True for a ``claude-3.7-sonnet`` id that contains an underscore.

    Both conditions are case-sensitive substring tests: the id must contain
    ``"claude-3.7-sonnet"`` and at least one ``"_"``.
    """
    if "claude-3.7-sonnet" not in model_id:
        return False
    return "_" in model_id

In [24]:
# Build the extra request body. Models that are not thinking models send no
# extra body; thinking models ask for reasoning output to be included.
if not is_thinking_model(model_id):
    extra_body = None
else:
    extra_body = {
        "include_reasoning": True,
        "reasoning": {},
        # Provider pinning is currently disabled:
        # "provider": {
        #     "allow_fallbacks": False,
        #     "order": [
        #         "Fireworks",
        #         "Together",
        #     ],
        # },
    }

    if is_anthropic_thinking_model(model_id):
        # The thinking budget is passed as the reasoning token limit and is
        # added on top of the generation limit.
        thinking_budget_tokens = get_budget_tokens(model_id)
        extra_body.update(reasoning={"max_tokens": thinking_budget_tokens})
        max_new_tokens = max_new_tokens + thinking_budget_tokens
        # Keep only the text before the first underscore (dropping the budget
        # suffix) and append the thinking suffix.
        model_id = f'{model_id.partition("_")[0]}:thinking'

    if "qwen" in model_id:
        # Qwen ids get 4000 additional tokens of headroom.
        max_new_tokens = max_new_tokens + 4000

In [None]:
import yaml
# Load the analysis results from the images YAML file. The file is only read,
# so it is opened read-only: "r+" would also require write permission and
# fail on a write-protected file.
with open('gemini-images-unfaithfulness_analysis.yaml', 'r', encoding='utf-8') as f:
    data_images = yaml.safe_load(f)

In [None]:
# Running total per unfaithful-metric value (0 through 8), each starting at 0.
unfaithful_metric_images_distribution = dict.fromkeys(range(9), 0)

In [None]:
# For every problem, add the fraction of its steps that received each
# unfaithful-metric value to the running totals. A problem with at least one
# step therefore contributes a total weight of 1, split across metric values.
for problem_name, problem_data in data_images['analysis_results'].items():
    steps = problem_data['steps']
    if not steps:
        # A problem without steps has nothing to contribute, and dividing by
        # its step count would raise ZeroDivisionError.
        continue
    total_steps = len(steps)

    # Per-problem tally over the metric values 0 through 8.
    unfaithful_count_distribution = dict.fromkeys(range(9), 0)
    for step_data in steps.values():
        unfaithful_metric = step_data['unfaithfulness_analysis']['unfaithful_metric']
        # A metric value outside 0-8 raises KeyError instead of being dropped.
        unfaithful_count_distribution[unfaithful_metric] += 1

    for unfaithful_metric_number, unfaithful_metric_count in unfaithful_count_distribution.items():
        unfaithful_metric_images_distribution[unfaithful_metric_number] += unfaithful_metric_count / total_steps

In [None]:
# Load the analysis results from the text YAML file. The file is only read,
# so it is opened read-only: "r+" would also require write permission and
# fail on a write-protected file.
with open('gemini-text-unfaithfulness_analysis.yaml', 'r', encoding='utf-8') as f:
    data_texts = yaml.safe_load(f)

In [None]:
# Running total per unfaithful-metric value (0 through 8), each starting at 0.
unfaithful_metric_texts_distribution = dict.fromkeys(range(9), 0)

In [None]:
# For every problem, add the fraction of its steps that received each
# unfaithful-metric value to the running totals. A problem with at least one
# step therefore contributes a total weight of 1, split across metric values.
for problem_name, problem_data in data_texts['analysis_results'].items():
    steps = problem_data['steps']
    if not steps:
        # A problem without steps has nothing to contribute, and dividing by
        # its step count would raise ZeroDivisionError.
        continue
    total_steps = len(steps)

    # Per-problem tally over the metric values 0 through 8.
    unfaithful_count_distribution = dict.fromkeys(range(9), 0)
    for step_data in steps.values():
        unfaithful_metric = step_data['unfaithfulness_analysis']['unfaithful_metric']
        # A metric value outside 0-8 raises KeyError instead of being dropped.
        unfaithful_count_distribution[unfaithful_metric] += 1

    for unfaithful_metric_number, unfaithful_metric_count in unfaithful_count_distribution.items():
        unfaithful_metric_texts_distribution[unfaithful_metric_number] += unfaithful_metric_count / total_steps

In [None]:
import plotly.graph_objects as go

# Grouped bar chart comparing the images and texts distributions per metric
# value.

# Bar positions (metric values 0..8) and the matching bar heights.
x_values = list(range(0, 9))  # [0, 1, 2, 3, 4, 5, 6, 7, 8]
y_images = [unfaithful_metric_images_distribution[i] for i in x_values]
y_texts = [unfaithful_metric_texts_distribution[i] for i in x_values]

# Labels drawn above each bar, rounded to two decimals. The totals are sums of
# non-negative fractions, so a zero total already formats as '0.00'.
text_images = [f'{val:.2f}' for val in y_images]
text_texts = [f'{val:.2f}' for val in y_texts]

# Create the grouped bar chart
fig = go.Figure()

# Add Images bars
fig.add_trace(go.Bar(
    name='Images',
    x=x_values,
    y=y_images,
    text=text_images,
    textposition='outside',
    textfont=dict(size=10, color='rgb(50, 50, 50)'),
    marker_color='rgb(55, 83, 109)',  # Professional blue
    marker_line_color='rgb(8, 48, 107)',
    marker_line_width=1.5,
    opacity=0.8,
))

# Add Texts bars
fig.add_trace(go.Bar(
    name='Texts',
    x=x_values,
    y=y_texts,
    text=text_texts,
    textposition='outside',
    textfont=dict(size=10, color='rgb(50, 50, 50)'),
    marker_color='rgb(158, 85, 156)',  # Complementary purple color
    marker_line_color='rgb(120, 50, 118)',
    marker_line_width=1.5,
    opacity=0.8,
))

# Update layout for grouped bars
# NOTE(review): the labels say "Faithful Metric" while the plotted values come
# from the 'unfaithful_metric' field -- confirm which wording is intended.
fig.update_layout(
    title={
        'text': 'Weighted count of problems vs Faithful Metric (Gemini 2.0 Flash Experimental Thinking)',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 16, 'color': 'rgb(50, 50, 50)'},
    },
    xaxis=dict(
        title='Faithful Metric',
        tickmode='linear',
        tick0=0,
        dtick=1,
        range=[-0.5, 8.5],  # half a unit of padding around the 0..8 ticks
        title_font={'size': 14},
        tickfont={'size': 12},
    ),
    yaxis=dict(
        title='Weighted Count',
        title_font={'size': 14},
        tickfont={'size': 12},
    ),
    barmode='group',  # This creates the side-by-side grouped bars
    plot_bgcolor='white',
    paper_bgcolor='white',
    width=900,
    height=550,
    margin=dict(l=80, r=40, t=100, b=80),
    font=dict(
        family="Arial, sans-serif",
        size=12,
        color="rgb(50, 50, 50)",
    ),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5,
        font=dict(size=12),
    ),
)

# Both axes share the same grid and axis-line styling.
axis_style = dict(
    showgrid=True,
    gridwidth=1,
    gridcolor='rgb(235, 235, 235)',
    showline=True,
    linewidth=1,
    linecolor='rgb(204, 204, 204)',
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Show the plot
fig.show()
