In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.stats import pearsonr

In [66]:
# Folder holding the input CSVs. The default is the original author's checkout;
# set the SURP2025_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
DATA_DIR = os.environ.get("SURP2025_DATA_DIR") or "/Users/unasantos/Documents/GitHub/SURP2025/Data"

master = pd.read_csv(os.path.join(DATA_DIR, "MasterClean.csv"))
decisions = pd.read_csv(os.path.join(DATA_DIR, "DecisionSandwiches.csv"))
model_df = pd.read_csv(os.path.join(DATA_DIR, "ModelDF.csv"))
# SliderIngs.csv is read without a header row, so its first line is data (row 0).
slider_ings = pd.read_csv(os.path.join(DATA_DIR, "SliderIngs.csv"), header=None)

In [67]:
# Names of the ten slider-rating columns: Rating.1 ... Rating.10.
ratings_cols = ["Rating." + str(slot) for slot in range(1, 11)]

# Row 0 of the slider-ingredient sheet is used for vegetarian participants,
# row 1 for everyone else.
veg_ingredients, non_veg_ingredients = [
    slider_ings.iloc[row_pos].tolist() for row_pos in (0, 1)
]

# Accumulator for one record per (participant, ingredient) rating.
ingredient_ratings = list()

In [68]:
# Build a long-format table with one row per (participant, ingredient, rating).
# The list is reset here so that re-running this cell rebuilds the table instead
# of appending a second copy of every rating to the previous run's rows.
ingredient_ratings = []

for _row_index, row in model_df.iterrows():
    pid = row["Participant"]
    # Vegetarian participants are matched to the vegetarian label set.
    is_veg = str(row['Vegetarian']).strip().lower() == 'y'
    ing_labels = veg_ingredients if is_veg else non_veg_ingredients

    # Labels are paired positionally with Rating.1 ... Rating.10; zip stops at the
    # shorter of the two sequences.
    for label, rating_col in zip(ing_labels,ratings_cols):
        ingredient_ratings.append({
            "participant":pid,
            "ingredient":str(label).strip().lower(),
            "rating":row[rating_col]
        })

# Explicit column names keep the frame usable (empty but well-formed) even when
# model_df has no rows.
ingredient_ratings_df = pd.DataFrame(
    ingredient_ratings, columns=["participant", "ingredient", "rating"]
)

In [69]:
# Source column names for the eight ingredient slots of each sandwich.
sand1_cols = ["s1" + str(slot) for slot in range(1, 9)]
sand2_cols = ["s2" + str(slot) for slot in range(1, 9)]

# Keep only the ingredient columns and rename them from "s1N"/"s2N" to
# "s1_N"/"s2_N".
decisions_clean = decisions.loc[:, sand1_cols + sand2_cols].copy()
decisions_clean.columns = [
    f"{name[:2]}_{name[2:]}" for name in sand1_cols + sand2_cols
]


In [70]:
def avg_rating(ingredients,rating_lookup):
    """Return the mean rating of the given ingredients.

    Each ingredient is converted to a stripped, lower-cased string and looked
    up in ``rating_lookup``. Ingredients with no entry, or whose rating is
    missing (NaN/None), are left out of the mean. Returns NaN when no
    ingredient has a usable rating.
    """
    normalized = (str(item).strip().lower() for item in ingredients)
    looked_up = (rating_lookup.get(name) for name in normalized)
    usable = [score for score in looked_up if pd.notna(score)]
    if not usable:
        return np.nan
    return np.mean(usable)

In [None]:
# For every decision trial, compare the sandwich the participant chose against the
# sandwich their own ingredient ratings favour, then summarise per participant.
results = []

# {participant: {ingredient: rating}} lookups, filled on first use so the ratings
# table is filtered once per participant rather than once per trial row.
rating_dicts = {}

for _row_index, row in master.iterrows():
    pid = row['participant']
    choice = row['Decision.keys']
    # Only rows whose key press is 'a' or 'l' are scored; everything else is skipped.
    if choice not in ['a','l']:
        continue

    #determine if there was a sequence flip
    sequence = row['sequence'] if 'sequence' in row else 0
    trial_index = int(row['trials.thisIndex'])

    # Look the trial's sandwich pair up once instead of once per ingredient slot.
    trial_sandwiches = decisions_clean.iloc[trial_index]
    s1_ings = [trial_sandwiches[f"s1_{j}"] for j in range(1, 9)]
    s2_ings = [trial_sandwiches[f"s2_{j}"] for j in range(1, 9)]

    if pid not in rating_dicts:
        participant_ratings = ingredient_ratings_df[ingredient_ratings_df['participant'] == pid]
        rating_dicts[pid] = dict(zip(participant_ratings['ingredient'],participant_ratings['rating']))
    rating_dict = rating_dicts[pid]

    s1_avg = avg_rating(s1_ings,rating_dict) #calculate average rating based on own rating
    s2_avg = avg_rating(s2_ings,rating_dict)

    # With sequence == 1 the key mapping is flipped: 'l' selects sandwich 1.
    if sequence == 1:
        chosen = 1 if choice == 'l' else 2
    else:
        chosen = 1 if choice == 'a' else 2

    preferred = 1 if s1_avg > s2_avg else 2 if s2_avg > s1_avg else 0
    # valid is True when the participant chose the sandwich with the higher average
    # rating and False when they chose the lower one. Ties, and trials where an
    # average is NaN, give preferred == 0 and are excluded (NaN) rather than
    # counted as invalid.
    valid = (chosen == preferred) if preferred != 0 else np.nan

    results.append({
        "participant": pid,
        "trial": trial_index,
        "chosen":chosen,
        "preferred":preferred,
        "s1_avg":s1_avg,
        "s2_avg":s2_avg,
        "valid":valid
    })

# Explicit column names keep the frame well-formed even when no trial was scored.
results_df = pd.DataFrame(
    results,
    columns=["participant", "trial", "chosen", "preferred", "s1_avg", "s2_avg", "valid"],
)

# "valid" can hold a mix of booleans and NaN; aggregate a float copy (1.0 / 0.0 / NaN)
# so the mean and count are ordinary numeric reductions. results_df itself is unchanged.
summary = (
    results_df.assign(valid=results_df["valid"].astype(float))
    .groupby('participant')
    .agg(
        slider_validity=('valid','mean'), # share of scored (non-tie) trials that were valid
        n_valid_trials=('valid','count')  # number of scored (non-tie) trials
    )
    .reset_index()
)

summary.to_csv("slider_validity_by_participant.csv", index=False)

In [72]:
# Folder for saved figures. The default is the original author's checkout; set the
# SURP2025_OUTPUT_DIR environment variable to write somewhere else without editing
# this cell.
output_directory = (
    os.environ.get("SURP2025_OUTPUT_DIR")
    or "/Users/unasantos/Documents/GitHub/SURP2025/Participant-VvsSelf"
)
os.makedirs(output_directory,exist_ok=True)

In [73]:

# Correlate each participant's slider validity with their mean ingredient rating
# and save a scatter plot with a linear trendline.
validity_df = pd.read_csv("slider_validity_by_participant.csv")

# makes sure Participant ID formats match
# NOTE(review): the next lines modify model_df in place (ID column becomes str and
# a self_rating_avg column is added); kept because other cells may rely on it.
validity_df["participant"] = validity_df["participant"].astype(str)
model_df["Participant"] = model_df["Participant"].astype(str)

# calculate average ingredient rating
rating_cols = [f"Rating.{i}" for i in range(1, 11)]
model_df["self_rating_avg"] = model_df[rating_cols].mean(axis=1)

# Merge (default inner join: participants missing from either table are dropped)
merged = validity_df.merge(model_df[["Participant", "self_rating_avg"]],
                           left_on="participant", right_on="Participant")

# cleans data: both variables must be present for the correlation and the fit
merged_clean = merged.dropna(subset=["self_rating_avg", "slider_validity"])

r, p = pearsonr(merged_clean["self_rating_avg"], merged_clean["slider_validity"])

# Least-squares straight line through the same points, evaluated over the x range.
fit = np.polyfit(merged_clean["self_rating_avg"], merged_clean["slider_validity"], 1)
x_vals = np.linspace(merged_clean["self_rating_avg"].min(), merged_clean["self_rating_avg"].max(), 100)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(merged_clean["self_rating_avg"], merged_clean["slider_validity"], alpha=0.7)
ax.plot(x_vals, np.polyval(fit, x_vals), color="red", linestyle="--", label="Trendline")
ax.set_xlabel("Average Ingredient Rating (Self-Rating)")
ax.set_ylabel("Slider-Based Validity")
ax.set_title(f"Slider Validity vs Self-Rating\n(Pearson r = {r:.2f})")
# Without a legend the "Trendline" label set above is never shown.
ax.legend()
ax.grid(True)
fig.tight_layout()

filename = "ValidityvsAvgSelfRating.png"
filepath = os.path.join(output_directory,filename)
fig.savefig(filepath,dpi=300,bbox_inches="tight")
# Close the figure explicitly so it is released once the file is written.
plt.close(fig)

