In [2]:
import pandas as pd
import matplotlib.pyplot as plt


# Font setup for exported figures: fonttype 42 asks matplotlib to embed fonts as
# TrueType (Type 42) in PDF and PostScript output, and Helvetica is requested as
# the font family for all text.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'font.family': 'Helvetica',
})

# Hex colours for plotted series, used in list order.
# Note: the first entry is an 8-digit hex (RRGGBBAA), so it carries its own
# alpha channel ("CC"); the remaining entries are plain 6-digit RGB.
COLOR_PALETTE = [
    "#B8DEFECC", "#EA6B1C", "#77C17A", "#D6C1F2",
    "#FFD9B3", "#B3D9F7", "#FFB3B3", "#D6E6B3",
]


# Preliminary analysis: token length

In [None]:
# Raw series name (as it appears in the CSV) -> multi-line label shown on the figure.
# Only the keys listed here are kept, and their order sets the column order.
select_cols = {
    "one year": "one year HVAC data\n(full database)",
    # "one month": "one month HVAC data",
    # "one day": "one day HVAC data",
    "imu 30 seconds": "100Hz, 52channels IMU data\nfor 30 seconds",
    "process mean": "Processed HVAC data\n(mean value)",
}

# Load the table (first CSV column becomes the index), transpose it so the series
# names are columns, keep the selected series, then swap in the display labels.
data_df = (
    pd.read_csv("revision_hvacdata_token_length.csv", index_col=0)
    .T
    .loc[:, list(select_cols)]
    .rename(columns=select_cols)
)

data_df

Unnamed: 0,one year HVAC data\n(full database),"100Hz, 52channels IMU data \nfor 30 seconds",Processed HVAC data\n(mean value)
token_length,11993771,528209,8


In [None]:
# Bar chart of token lengths on a log y-axis. Each bar is annotated with its size
# relative to the smallest series, and a dashed line marks the GPT-4o context limit.
# The figure is written to 'token_length_comparison.pdf' and the factors are printed.
fig, ax = plt.subplots(figsize=(10, 6))

# Get token lengths (one value per selected series) and their display labels
token_lengths = data_df.loc['token_length'].values
labels = data_df.columns.tolist()

# Calculate multiplication factors (using smallest value as 1).
# Both the ratios and the log-scaled axis need strictly positive values, so fail
# loudly here rather than emit inf/negative factors and bars that cannot be drawn.
min_value = token_lengths.min()
if min_value <= 0:
    raise ValueError(
        f"token lengths must be strictly positive for a log-scale comparison; "
        f"smallest value is {min_value}"
    )
multipliers = token_lengths / min_value

# Create bar chart
x_pos = range(len(labels))
bars = ax.bar(
    x_pos,
    token_lengths,
    color=COLOR_PALETTE[:len(labels)],
    alpha=0.8,
    edgecolor='black',
    linewidth=1.5,
)

# Set log scale for y-axis
ax.set_yscale('log')

# Add horizontal line at 128k (GPT-4o max context length).
# The legend text is derived from the constant so the two cannot drift apart.
gpt4o_limit = 128000
ax.axhline(
    y=gpt4o_limit,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'GPT-4o max context length ({gpt4o_limit // 1000}k)',
    zorder=10,
)

# Add multiplication factors on top of each bar.
# Factors are value / min, hence always >= 1, so a single format suffices.
# On a log axis a multiplicative offset (x1.3) gives the same visual gap for every bar.
for bar, mult in zip(bars, multipliers):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height * 1.3,
            f'{mult:.1f}x',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

# Formatting
ax.set_xlabel('Data Type', fontsize=12, fontweight='bold')
ax.set_ylabel('Token Length (log scale)', fontsize=12, fontweight='bold')
ax.set_title('Token Length Comparison Across Different Data Types', fontsize=14, fontweight='bold', pad=20)
ax.set_xticks(x_pos)
ax.set_xticklabels(labels, fontsize=10)
ax.legend(fontsize=11, loc='upper right')
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
fig.savefig('token_length_comparison.pdf', dpi=300, bbox_inches='tight')
plt.show()

# Plain-text summary of the same factors shown on the bars
print("\nMultiplication factors (relative to smallest value = 1):")
for label, mult in zip(labels, multipliers):
    print(f"{label}: {mult:.1f}x")