# Instruction difficulty

This is the code to compute the instruction difficulty term from the length-controlled GLM. Note that the instruction difficulty term is not a crucial feature in the GLM; in particular, the properties of the GLM hold regardless of how the instruction difficulty feature is computed (even if it is random). Having a sensible instruction difficulty slightly increases the correlation with LMSYS. More details in this [GH issue](https://github.com/tatsu-lab/alpaca_eval/issues/346).

In [1]:
# move one directory up; the relative paths used below ("src/...", "results/...") are resolved from there
cd ..

/Users/yanndubois/Desktop/GitHub/alpaca_eval


In [2]:
from dotenv import load_dotenv
# load the variables defined in the local ".env" file into the environment
load_dotenv(".env")

True

In [3]:
%matplotlib inline

# NOTE(review): the star import is presumably what brings `pd`, `sklearn`, `Path`, `constants`, ... into scope -- confirm
from notebooks.notebook_helpers import *
from alpaca_eval.metrics.glm_winrate import make_dmatrix_for_model, fit_LogisticRegressionCV
# enable scikit-learn's metadata routing globally; presumably needed by `fit_LogisticRegressionCV` -- TODO confirm
sklearn.set_config(enable_metadata_routing=True)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# leaderboard of the weighted GPT-4-turbo annotator, indexed by its first column
lb = pd.read_csv(
    "src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv",
    index_col=0,
)  # .query("index != 'gpt4_gamed'")

# discard every leaderboard entry whose annotation file is not available locally
indices_to_drop = [
    i
    for i in lb.index
    if not Path(f"results/{i}/weighted_alpaca_eval_gpt4_turbo/annotations.json").is_file()
]
lb = lb.drop(indices_to_drop)
print(f"dropped {len(indices_to_drop)} models")

# subset of the leaderboard that is compared against the Arena
lb_arena = make_lb_arena(lb)
print(f"We are comparing to {len(lb_arena)} Arena models")

# in-place helper (trailing underscore) applied to the leaderboard
process_gamed_models_(lb)

dropped 4 models
We are comparing to 37 Arena models


In [7]:
# list of all models with the baseline moved to the front; it is later passed as
# `levels=ordered_models` in the joint formula
all_models = list(lb.index)
ordered_models = [BASELINE, *(m for m in all_models if m != BASELINE)]

# load the annotations of every leaderboard model and drop the rows where `len_2` is 0
all_df_annotations = load_annotations(lb).query("len_2 != 0")

## Joint fitting to compute instruction_difficulty

In [12]:
from patsy import build_design_matrices, dmatrix
# same function as in glm_winrate.py, redefined here so that formulas can use the notebook's local variables
def make_dmatrix_for_model(df_train, df_test, formula, col_y_true="preference"):
    """Build the train and test design matrices for a patsy formula.

    The design is built from `df_train`, and the resulting design info is reused to
    encode `df_test`, so both matrices share the same columns.

    Args:
        df_train: dataframe the design is built from; must contain `col_y_true`.
        df_test: dataframe encoded with the design built from `df_train`.
        formula: patsy formula describing the features.
        col_y_true: name of the label column copied from `df_train` into the train matrix.

    Returns:
        A tuple `(train_design, test_design)` of dataframes. Only the train matrix
        carries the additional `col_y_true` label column.
    """
    # `dmatrix` has to be called from this frame: patsy looks up the names used in the
    # formula starting from its caller's environment
    train_design = dmatrix(formula, df_train, return_type="dataframe")
    (test_design,) = build_design_matrices(
        [train_design.design_info],
        df_test,
        return_type="dataframe",
    )
    # attach the label to the training features
    train_design[col_y_true] = df_train[col_y_true]
    return train_design, test_design

In [13]:
def get_index_param_from_joint_optimization(df, df_lb, formula):
    """Fit a single GLM on all the data and extract the per-instruction weights.

    Args:
        df: training dataframe; must contain the "preference" and "index" columns
            as well as every column referenced by `formula`.
        df_lb: dataframe on which the fitted model is evaluated; must contain
            "generator_2" and every column referenced by `formula`.
        formula: patsy formula. It is expected to contain a `C(index)` term, whose
            coefficients are what gets extracted.

    Returns:
        A tuple `(idx_params, model, metrics)` where `idx_params` is a dataframe
        with a single "param" column holding one weight per instruction index
        (sorted by index), `model` is the fitted classifier and `metrics` is the
        value returned by `report(..., is_return_metrics=True)`.

    Side effects:
        Adds a column named `formula` to the notebook-level `lb` dataframe.
    """
    curr_df_lb = df_lb.copy()
    df_input, df_input_lb = make_dmatrix_for_model(df, df_lb, formula=formula)
    df_input["preference"] = df["preference"]

    model = fit_LogisticRegressionCV(df_input, "preference", is_ytrue_proba=True, n_splits=5)
    curr_df_lb["preference"] = model.predict_proba(df_input_lb)[:, 1]
    # predicted win rate (in %) of every leaderboard model, stored under the formula's name
    lb[formula] = curr_df_lb.groupby("generator_2")["preference"].mean()[lb.index] * 100
    metrics = report(lb, formula, is_return_metrics=True)

    params = pd.Series(index=model.feature_names_in_, data=model.coef_[0]).to_frame()
    idx_params = params.loc[[i for i in params.index if "C(index)" in i]]
    # feature names look like "C(index)[T.<idx>]" => keep only the integer <idx>
    idx_params.index = [int(i.split("[T.")[-1].split("]")[0]) for i in idx_params.index]

    # dummy encoding drops one reference level, which effectively has weight 0. Add it
    # back under its actual index rather than assuming that the dropped level is 0.
    missing_idcs = set(df["index"].unique()) - set(idx_params.index)
    assert len(missing_idcs) == 1, f"expected exactly one reference index, got {len(missing_idcs)}"
    missing_idx = missing_idcs.pop()
    idx_params.loc[missing_idx, :] = 0

    idx_params = idx_params.sort_index()
    idx_params.columns = ["param"]
    return idx_params, model, metrics

In [None]:
# C(generator_2, levels=ordered_models): np.tanh(rand_delta_len_std_only) means that each model will have its own weight for "np.tanh(rand_delta_len_std_only)"
# C(generator_2, levels=ordered_models) means that each model will have its own bias term
# C(index) means that each example (instruction) will have its own weight. This is what will become the instruction difficulty
joint_formula = "C(generator_2, levels=ordered_models): np.tanh(rand_delta_len_std_only) + C(generator_2, levels=ordered_models) + C(index)  + not_gamed_baseline.astype(float) - 1"

# init data without instruction difficulty
df_init, df_lb_init = make_data(all_df_annotations)

# run the joint optimization to get the instruction difficulty.
# NOTE: get_index_param_from_joint_optimization only accepts (df, df_lb, formula), so no extra keyword is passed
instruction_difficulty, model, metrics = get_index_param_from_joint_optimization(df_init, df_lb_init, formula=joint_formula)


## Validate instruction_difficulty

Now let's run the optimization with the newly computed and fixed instruction difficulty, i.e., fit all parameters disjointly.

In [41]:
# length term + fixed instruction difficulty + `not_gamed_baseline` term, with "- 1" removing the intercept
formula = "np.tanh(rand_delta_len_std_only) + instruction_difficulty + not_gamed_baseline.astype(float) - 1"

# rebuild the data, this time with the jointly fitted instruction difficulty
df, df_lb = make_data(all_df_annotations, instruction_difficulty=instruction_difficulty)

# disjoint optimization: this is the only step that actually has to be run
metrics, models = disjoint_optimization_(
    lb,
    df,
    df_lb,
    formula=formula,
    regularize_to_baseline_lambda=0.2,
)
metrics



{'verbosity_gameability': 6.24064558743236,
 'conciseness_gameability': 14.433044187827408,
 'adversarial_rank_gain': 49.0,
 'adversarial_winrate_gain': 9.620112879799999,
 'corr_arena': 0.9745109730605707,
 'corr_len': 0.2513188315718244,
 'logloss': 0.2672176991151537,
 'mse': 0.049212584861330455,
 'r2': 0.42343866160090404,
 'corr': 0.6400694636182821,
 'acc': 0.8967795936782501}

In [42]:
# collapse to a Series called "instruction_difficulty", sorted by an index named "index"
instruction_difficulty = (
    instruction_difficulty.squeeze()
    .sort_index()
    .rename("instruction_difficulty")
    .rename_axis("index")
)

Now let's compare the results with the instruction_difficulty computed on fewer models in early 2024, i.e., the instruction_difficulty that we actually use in AlpacaEval LC.

In [43]:
from huggingface_hub import hf_hub_download

# fetch the published instruction difficulty file from the dataset repo, forcing a fresh download
out = hf_hub_download(
    repo_id="tatsu-lab/alpaca_eval",
    repo_type="dataset",
    filename="instruction_difficulty.csv",
    cache_dir=constants.DEFAULT_CACHE_DIR,
    force_download=True,
)

# the first CSV column becomes the index; squeeze() collapses a single remaining column to a Series
hf_instruction_difficulty = pd.read_csv(out, index_col=0).squeeze()

In [44]:
# rebuild the data using the instruction difficulty downloaded from the Hugging Face hub
df, df_lb = make_data(all_df_annotations, instruction_difficulty=hf_instruction_difficulty)

# disjoint optimization: this is the only step that actually has to be run
metrics, models = disjoint_optimization_(
    lb,
    df,
    df_lb,
    formula=formula,
    regularize_to_baseline_lambda=0.2,
)
metrics



{'verbosity_gameability': 7.454594530315204,
 'conciseness_gameability': 15.959179311867668,
 'adversarial_rank_gain': 49.0,
 'adversarial_winrate_gain': 8.450426719471494,
 'corr_arena': 0.9746295267069284,
 'corr_len': 0.2731314094063601,
 'logloss': 0.2719086188374834,
 'mse': 0.051460503983874965,
 'r2': 0.41395421815105193,
 'corr': 0.6329363823423694,
 'acc': 0.891627534425641}

In [45]:
from scipy.stats import pearsonr, spearmanr

# scipy compares the values position by position and ignores the pandas index, so first
# align both Series on their shared index labels: instruction i is then always compared
# with instruction i, whatever the row order of the downloaded file
_new_difficulty, _hf_difficulty = instruction_difficulty.align(hf_instruction_difficulty, join="inner")

s = spearmanr(_new_difficulty, _hf_difficulty).statistic
r = pearsonr(_new_difficulty, _hf_difficulty).statistic
print(f"Spearman: {s:.3f}")
print(f"Pearson: {r:.3f}")

Spearman: 0.953
Pearson: 0.958


We see that, even though the instruction_difficulty is computed on a different set of models, the results with both sets of instruction_difficulty are very similar and both instruction_difficulty are highly correlated.

We see that, despite being computed on a different set of models, the instruction difficulties are highly correlated.