In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from scipy.stats import linregress

Non-Vegetarian

In [33]:
# Project root is configurable so the notebook can run outside the original
# author's machine: set the SURP2025_ROOT environment variable to a local
# checkout. The default keeps the original hard-coded location.
project_root = os.environ.get("SURP2025_ROOT", "/Users/unasantos/Documents/GitHub/SURP2025")

# model_df supplies the "Rating.N" self-rating columns used by the loops below.
model_df = pd.read_csv(os.path.join(project_root, "Data", "ModelDF.csv"))
# final_results_nonveg supplies "Participant" and the per-ingredient "V_<ingredient>" columns.
final_results_nonveg = pd.read_csv(os.path.join(project_root, "D-vs-V", "DV_Results_NonVeg.csv"))
# header=None: the first line of SliderIngs.csv is read as data (row 0), not as column names.
slider_df = pd.read_csv(os.path.join(project_root, "Data", "SliderIngs.csv"), header=None)

In [34]:
# Folder that receives the non-vegetarian plots; created if it does not exist yet.
output_directory = "validity_vs_rating_plots_nonveg"
os.makedirs(name=output_directory, exist_ok=True)

In [35]:
# Row 1 of slider_df lists the non-vegetarian ingredients; empty cells are
# dropped and surrounding whitespace is stripped from each name.
nonveg_row = slider_df.iloc[1].dropna()
rated_nonveg_ingredients = [str(name).strip() for name in nonveg_row]
print(rated_nonveg_ingredients)

# One "Rating.N" column name per ingredient, numbered from 1 in list order.
rating_cols = [f"Rating.{n}" for n in range(1, len(rated_nonveg_ingredients) + 1)]


['Tomatoes', 'Mayonnaise', 'Turkey', 'Bacon', 'Chicken', 'Cheddar cheese', 'Swiss cheese', 'Lettuce', 'Provolone cheese', 'Spinach']


In [36]:
# For each rated non-vegetarian ingredient: regress validity (V) on the
# self-rating, print and record the correlation statistics, and save a scatter
# plot with the fitted line.
#
# NOTE: plots go to whatever `output_directory` currently holds. It is
# reassigned for the vegetarian plots further down, so re-run the
# non-vegetarian directory cell before re-running this one.
nonveg_records = []  # one summary row per ingredient; written to CSV once after the loop

for ing, rating_col in zip(rated_nonveg_ingredients, rating_cols):
    v_col = f"V_{ing}"

    # Validity columns come from final_results_nonveg, rating columns from model_df.
    if v_col not in final_results_nonveg.columns or rating_col not in model_df.columns:
        print(f"Skipping {ing} — missing column in data.")
        continue

    # NOTE(review): SelfRating is attached by row index, not by participant ID.
    # This assumes model_df and final_results_nonveg list participants in the
    # same row order — confirm, or merge on a shared participant key instead.
    df = pd.DataFrame()
    df['Participant'] = final_results_nonveg['Participant']
    df['Validity'] = final_results_nonveg[v_col]
    df['SelfRating'] = model_df[rating_col]
    df = df.dropna()

    x = df["SelfRating"]
    y = df["Validity"]

    # linregress cannot fit fewer than two points or a constant x; skip instead of crashing.
    if len(df) < 2 or x.nunique() < 2:
        print(f"Skipping {ing} — not enough data to fit a regression.")
        continue

    slope, intercept, r_value, p_value, std_error = linregress(x, y)
    regression_line = slope * x + intercept

    results = {
        "Ingredient": ing,  # plain string; a set literal here would be saved as "{'Tomatoes'}"
        "Pearson Correlation": r_value,
        "P-Value": p_value,
        "StandardError": std_error,
    }
    nonveg_records.append(results)

    print(f"Pearson Correlation - {ing}:", r_value)
    print(f"P-Value - {ing}:", p_value)
    print("Standard Error:", std_error)

    # plot
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(x, regression_line, color="black", linestyle="-", label=f"y = {slope:.2f}x + {intercept:.2f}\n$r$ = {r_value:.2f}, $p$ = {p_value:.3f}")
    ax.scatter(df["SelfRating"], df["Validity"], color='red', edgecolor='black', alpha=0.7)
    ax.set_xlabel(f'Self-Rated Preference for {ing}')
    ax.set_ylabel(f'Validity (V) for {ing}')
    ax.set_title(f'Validity vs Self-Rating: {ing}')
    ax.grid(True)
    ax.legend()  # without this the fitted-line label (equation, r, p) is never shown
    fig.tight_layout()

    filename = f"ValidityvsSelf_NonVeg_{ing}.png"
    filepath = os.path.join(output_directory, filename)
    fig.savefig(filepath, dpi=300, bbox_inches="tight")
    plt.close(fig)

# Build the full summary table and write it in one go. Overwriting (rather than
# appending per iteration) keeps the CSV free of duplicate rows when the cell
# is re-run.
correlation_results_nonveg = pd.DataFrame(
    nonveg_records,
    columns=["Ingredient", "Pearson Correlation", "P-Value", "StandardError"],
)
if nonveg_records:
    correlation_results_nonveg.to_csv("correlation_results_nonveg.csv", index=False)

Pearson Correlation - Tomatoes: -0.07334872433159517
P-Value - Tomatoes: 0.38060999904664783
Standard Error: 0.001658118120605734
Pearson Correlation - Mayonnaise: -0.04555723139363778
P-Value - Mayonnaise: 0.5916646129024216
Standard Error: 0.0020148289104322294
Pearson Correlation - Turkey: 0.012305050942140886
P-Value - Turkey: 0.8840310622548331
Standard Error: 0.0018959047297992767
Pearson Correlation - Bacon: -0.02768089647666655
P-Value - Bacon: 0.7472399129138939
Standard Error: 0.0014574164678948578
Pearson Correlation - Chicken: -0.0027516747225519974
P-Value - Chicken: 0.9740731337450654
Standard Error: 0.002651671448279216
Pearson Correlation - Cheddar cheese: -0.06732580760348023
P-Value - Cheddar cheese: 0.43099754348998387
Standard Error: 0.0016703839532726385
Pearson Correlation - Swiss cheese: 0.13030759290411648
P-Value - Swiss cheese: 0.12218487137658991
Standard Error: 0.0015601460864797803
Pearson Correlation - Lettuce: 0.017047318936265404
P-Value - Lettuce: 0.840

Vegetarian

In [37]:
# Folder that receives the vegetarian plots; created if it does not exist yet.
# This rebinds the same `output_directory` name used for the non-vegetarian plots.
output_directory = "validity_vs_rating_plots_veg"
os.makedirs(name=output_directory, exist_ok=True)

In [38]:
# Row 2 of SliderIngs.csv = vegetarian ingredients
veg_ingredients = [str(x).strip() for x in slider_df.iloc[2].dropna()]

# NOTE: this overwrites the `rating_cols` built for the non-vegetarian list.
# NOTE(review): the same "Rating.N" column names are reused for the vegetarian
# ingredients — presumably Rating.N holds the N-th slider each participant saw;
# confirm against the model_df schema.
rating_cols = [f"Rating.{i+1}" for i in range(len(veg_ingredients))]

# Project root is configurable via the SURP2025_ROOT environment variable so the
# notebook can run outside the original author's machine; the default keeps the
# original hard-coded location.
project_root = os.environ.get("SURP2025_ROOT", "/Users/unasantos/Documents/GitHub/SURP2025")
final_results_veg = pd.read_csv(os.path.join(project_root, "D-vs-V", "DV_Results_Veg.csv"))


In [39]:
# For each vegetarian ingredient: regress validity (V) on the self-rating,
# print and record the correlation statistics, and save a scatter plot with
# the fitted line into `output_directory`.
veg_records = []  # one summary row per ingredient; written to CSV once after the loop

for ing, rating_col in zip(veg_ingredients, rating_cols):
    v_col = f"V_{ing}"

    # Validity columns come from final_results_veg, rating columns from model_df.
    if v_col not in final_results_veg.columns or rating_col not in model_df.columns:
        print(f"Skipping {ing} — missing column in data.")
        continue

    # NOTE(review): SelfRating is attached by row index, not by participant ID.
    # This assumes model_df and final_results_veg list participants in the same
    # row order — confirm, or merge on a shared participant key instead.
    df = pd.DataFrame()
    df['Participant'] = final_results_veg['Participant']
    df['Validity'] = final_results_veg[v_col]
    df['SelfRating'] = model_df[rating_col]
    df = df.dropna()

    x = df["SelfRating"]
    y = df["Validity"]

    # linregress cannot fit fewer than two points or a constant x; skip instead of crashing.
    if len(df) < 2 or x.nunique() < 2:
        print(f"Skipping {ing} — not enough data to fit a regression.")
        continue

    slope, intercept, r_value, p_value, std_error = linregress(x, y)
    regression_line = slope * x + intercept

    print(f"Pearson Correlation - {ing}:", r_value)
    print(f"P-Value - {ing}:", p_value)
    print("Standard Error:", std_error)

    results = {
        "Ingredient": ing,  # plain string; a set literal here would be saved as "{'Tomatoes'}"
        "Pearson Correlation": r_value,
        "P-Value": p_value,
        "StandardError": std_error,
    }
    veg_records.append(results)

    # plot
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(x, regression_line, color="black", linestyle="-", label=f"y = {slope:.2f}x + {intercept:.2f}\n$r$ = {r_value:.2f}, $p$ = {p_value:.3f}")
    ax.scatter(df["SelfRating"], df["Validity"], color='green', edgecolor='black', alpha=0.7)
    ax.set_xlabel(f'Self-Rated Preference for {ing}')
    ax.set_ylabel(f'Validity (V) for {ing}')
    ax.set_title(f'Validity vs Self-Rating: {ing}')
    # NOTE(review): fixed ticks assume ratings on a 1-7 scale — confirm against the data.
    ax.set_xticks([1, 2, 3, 4, 5, 6, 7])
    ax.grid(True)
    ax.legend()  # without this the fitted-line label (equation, r, p) is never shown
    fig.tight_layout()
    filename = f"ValidityvsSelf_Veg_{ing}.png"
    filepath = os.path.join(output_directory, filename)
    fig.savefig(filepath, dpi=300, bbox_inches="tight")
    plt.close(fig)

# Build the full summary table and write it in one go. Overwriting (rather than
# appending per iteration) keeps the CSV free of duplicate rows when the cell
# is re-run.
correlation_results_veg = pd.DataFrame(
    veg_records,
    columns=["Ingredient", "Pearson Correlation", "P-Value", "StandardError"],
)
if veg_records:
    correlation_results_veg.to_csv("correlation_results_veg.csv", index=False)


Pearson Correlation - Tomatoes: 0.528220590574066
P-Value - Tomatoes: 0.0775016101533009
Standard Error: 0.005069037957534188
Pearson Correlation - Onions: 0.06938201444259852
P-Value - Onions: 0.8303463846304924
Standard Error: 0.007478279181109691
Pearson Correlation - Cucumbers: -0.3216337604513383
P-Value - Cucumbers: 0.3079807247937206
Standard Error: 0.01203813796876633
Pearson Correlation - Lettuce: 0.2568423853566744
P-Value - Lettuce: 0.4203240000041164
Standard Error: 0.004523498756270489
Pearson Correlation - Avocado: 0.44082561451711194
P-Value - Avocado: 0.15145888011774292
Standard Error: 0.012187941514765606
Pearson Correlation - Sprouts: 0.33676510262992165
P-Value - Sprouts: 0.2844463726311444
Standard Error: 0.0052148666880772665
Pearson Correlation - Provolone cheese: 0.037781832156935286
P-Value - Provolone cheese: 0.907198010630496
Standard Error: 0.004545576495256471
Pearson Correlation - Peppers: -0.07290616848733843
P-Value - Peppers: 0.8313075770083191
Standard