<a href="https://colab.research.google.com/github/xw3065-xpw/Lecture7-Xinping-Wang/blob/main/Edited_python_week1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Python Session 1

## We will practice cleaning some food choice task data

We are going to simulate data from 20 participants to practice our skills. In the task, participants rate foods for healthiness, tastiness and choice; the foods are drawn from a set of 50, and each of the three blocks contains 75 trials. The simulation code is below.



In [None]:
import pandas as pd
import numpy as np
import random

In [None]:
# The three rating blocks of the task, one name per line
blocks = [
    'health',
    'taste',
    'choice',
]
# A bare name on the last line of a cell displays its value
blocks

['health', 'taste', 'choice']

In [None]:
# Example: draw one number uniformly between 0.5 and 4.0 and round it to 2 decimals.
# The same expression is used in the simulation below to generate a reaction time.
round(random.uniform(0.5, 4.0),2)

3.83

In [None]:
# Simulate the food choice task: every participant completes every block, and on each
# trial one food is drawn at random and given a reaction time and a rating.

# Seed the generator so that Restart & Run All reproduces exactly the same data
SEED = 42
random.seed(SEED)

# Define blocks and trial structure
blocks = ['health', 'taste', 'choice']
trials_per_block = 75
participants = range(1, 21)  # participant IDs 1..20

# The 50 unique foods that can appear on a trial
base_foods = [
    'apple', 'banana', 'burger', 'carrot', 'donut', 'eggs', 'fries', 'grapes', 'ice cream', 'kale',
    'pizza', 'yogurt', 'spinach', 'steak', 'candy', 'popcorn', 'mango', 'nuts', 'cheese', 'chicken',
    'broccoli', 'chocolate', 'granola', 'lettuce', 'pasta', 'salmon', 'tofu', 'soda', 'rice', 'beans',
    'cucumber', 'peach', 'bacon', 'cereal', 'toast', 'avocado', 'beef', 'peanut butter', 'cake', 'milk',
    'watermelon', 'pear', 'turkey', 'onion rings', 'oatmeal', 'cranberries', 'syrup', 'waffles', 'cookie', 'shrimp'
]
assert len(base_foods) == 50
assert len(set(base_foods)) == len(base_foods), "food names must be unique"

# Assign fat and sugar levels randomly; each food keeps the same levels on every trial
food_properties = {}
for food in base_foods:
    fat = random.choices(['high', 'low'], weights=[0.4, 0.6])[0]
    sugar = random.choices(['high', 'low'], weights=[0.5, 0.5])[0]
    food_properties[food] = {'fat': fat, 'sugar': sugar}

# Generate trials: one dict per trial, collected in a list and converted to a DataFrame once
all_trials = []

for participant in participants:
    for block in blocks:
        for trial_num in range(1, trials_per_block + 1):
            # Foods are sampled with replacement, so a food can repeat within a block
            food = random.choice(base_foods)
            rt_missing = random.random() < 0.02  # 2% chance of missing RT
            # RT is drawn from uniform(0.5, 4.0) — presumably seconds; confirm intended units
            reaction_time = None if rt_missing else round(random.uniform(0.5, 4.0), 2)
            # A trial without a response has no rating either
            rating = None if reaction_time is None else random.randint(1, 10)

            fat = food_properties[food]['fat']
            sugar = food_properties[food]['sugar']

            trial = {
                'participant': participant,
                'block': block,
                'trial_number': trial_num,
                'food': food,
                'reaction_time': reaction_time,
                'rating': rating,
                'fat': fat,
                'sugar': sugar
            }
            all_trials.append(trial)

# Create DataFrame
df = pd.DataFrame(all_trials)

# Validate logic: a rating is missing exactly when the RT is missing
assert (df['rating'].isna() == df['reaction_time'].isna()).all()
# Validate size: one row per participant x block x trial
assert len(df) == len(participants) * len(blocks) * trials_per_block

The data are stored in a DataFrame object, which we have called `df`.
To access anything in the DataFrame, we start by typing `df`.

In [None]:
# To see the data, type the DataFrame's name as the last line of the cell
df

Unnamed: 0,participant,block,trial_number,food,reaction_time,rating,fat,sugar
0,1,health,1,avocado,0.85,2.0,high,high
1,1,health,2,beef,2.34,1.0,high,low
2,1,health,3,kale,1.60,5.0,low,high
3,1,health,4,candy,1.13,6.0,low,high
4,1,health,5,shrimp,3.80,9.0,high,high
...,...,...,...,...,...,...,...,...
4495,20,choice,71,banana,2.32,9.0,low,low
4496,20,choice,72,banana,,,low,low
4497,20,choice,73,yogurt,,,low,low
4498,20,choice,74,chicken,1.75,3.0,high,high


In [None]:
# Everything stored in the DataFrame is reached through `df`, so we reference df first;
# `.columns` lists the column names
df.columns

Index(['participant', 'block', 'trial_number', 'food', 'reaction_time',
       'rating', 'fat', 'sugar'],
      dtype='object')

In [None]:
# Select a single column by name with square brackets
df['reaction_time']

Unnamed: 0,reaction_time
0,850.0
1,2340.0
2,1600.0
3,1130.0
4,3800.0
...,...
4495,2320.0
4496,
4497,
4498,1750.0


In [None]:
# We can also look at the values of columns
# The first two lines both access the whole food column (attribute vs. bracket syntax);
# the third uses positional indexing to pull one value: row 0, column 3 (the food column).
# Only the last expression in a cell is displayed, so the output is that single value.
df.food
df['food']
df.iloc[0,3]


'avocado'

In [None]:
# Try here with RT

In [None]:
# Number of rows (trials) in the DataFrame: first entry of its (rows, columns) shape
df.shape[0]

4500

In [None]:
# Count the trials whose reaction time is missing
len(df.loc[df['reaction_time'].isna()])

87

In [None]:
# Before analyzing the data we need to set aside trials with a missing reaction time.
# Two ways to keep only the non-missing values:
#   1) whole rows that have an RT (not displayed, because it is not the last line)
df.loc[df['reaction_time'].notna()]
#   2) just the RT column with missing values dropped (displayed)
df['reaction_time'].dropna()

Unnamed: 0,reaction_time
0,850.0
1,2340.0
2,1600.0
3,1130.0
4,3800.0
...,...
4493,2290.0
4494,2860.0
4495,2320.0
4498,1750.0


In [None]:
# What would we change to see RTs < 2 only?
# NOTE(review): the simulation cell draws reaction_time from uniform(0.5, 4.0), so on a
# fresh run a cutoff of 2000 keeps every non-missing trial — confirm the intended units.
df.loc[df['reaction_time'] < 2000]

Unnamed: 0,participant,block,trial_number,food,reaction_time,rating,fat,sugar
0,1,health,1,avocado,850.0,2.0,high,high
2,1,health,3,kale,1600.0,5.0,low,high
3,1,health,4,candy,1130.0,6.0,low,high
9,1,health,10,peanut butter,1660.0,9.0,low,high
13,1,health,14,tofu,540.0,7.0,low,low
...,...,...,...,...,...,...,...,...
4483,20,choice,59,fries,1920.0,1.0,low,low
4485,20,choice,61,pasta,1010.0,7.0,high,high
4487,20,choice,63,avocado,1730.0,8.0,high,high
4498,20,choice,74,chicken,1750.0,3.0,high,high


In [None]:
# Make a new data frame with no missing values: keep only the trials that have a
# reaction time (the original filter kept only the *missing* trials instead)
df1 = df.dropna(subset=['reaction_time'])

# Sanity check: nothing missing is left in the cleaned frame
assert df1['reaction_time'].notna().all()

In [None]:
# Next we run some calculations on the data set, starting with a summary of
# one person's ratings in the health block.

participant_id = 1

# Build one boolean mask per condition, then combine them to select the rows
is_this_participant = df['participant'].eq(participant_id)
is_health_block = df['block'].eq('health')
health_block = df.loc[is_this_participant & is_health_block]

# Ratings are missing on trials where the RT was missing, so drop those first
valid_ratings = health_block['rating'].dropna()

# Mean of the remaining ratings
average_health_rating = valid_ratings.mean()

print(f"Participant {participant_id}'s average health rating: {average_health_rating:.2f}")


Participant 1's average health rating: 5.76


In [None]:
# Display the health-block rows selected for this participant in the previous cell
health_block

Unnamed: 0,participant,block,trial_number,food,reaction_time,rating,fat,sugar
0,1,health,1,avocado,850.0,2.0,high,high
1,1,health,2,beef,2340.0,1.0,high,low
2,1,health,3,kale,1600.0,5.0,low,high
3,1,health,4,candy,1130.0,6.0,low,high
4,1,health,5,shrimp,3800.0,9.0,high,high
...,...,...,...,...,...,...,...,...
70,1,health,71,cranberries,710.0,1.0,high,high
71,1,health,72,beef,900.0,1.0,high,low
72,1,health,73,waffles,750.0,7.0,high,low
73,1,health,74,yogurt,1580.0,2.0,low,low


In [None]:
# Average health rating per participant, separately for low-fat and high-fat foods
for participant_id in range(1, 21):
    # Rows for this participant's health block; reused for both fat levels
    in_health_block = (df['participant'] == participant_id) & (df['block'] == 'health')

    for x in ['low', 'high']:
        # Narrow the selection to foods with the current fat level
        high_block = df.loc[in_health_block & (df['fat'] == x)]

        # Skip trials without a rating, then average what is left
        valid_ratings = high_block['rating'].dropna()
        average_health_rating = valid_ratings.mean()

        print(f"Participant {participant_id}'s average health rating: {average_health_rating:.2f} for {x} foods")




Participant 1's average health rating: 5.79 for low foods
Participant 1's average health rating: 5.67 for high foods
Participant 2's average health rating: 5.70 for low foods
Participant 2's average health rating: 5.50 for high foods
Participant 3's average health rating: 5.32 for low foods
Participant 3's average health rating: 5.75 for high foods
Participant 4's average health rating: 5.93 for low foods
Participant 4's average health rating: 5.22 for high foods
Participant 5's average health rating: 5.62 for low foods
Participant 5's average health rating: 5.59 for high foods
Participant 6's average health rating: 5.47 for low foods
Participant 6's average health rating: 5.62 for high foods
Participant 7's average health rating: 5.04 for low foods
Participant 7's average health rating: 6.06 for high foods
Participant 8's average health rating: 6.30 for low foods
Participant 8's average health rating: 5.41 for high foods
Participant 9's average health rating: 5.04 for low foods
Partic

In [None]:
## Now try it with a for loop





In [None]:
# Build a new DataFrame holding each person's average RT and rating for high- and
# low-fat foods within every block.

# Only trials with both a rating and a reaction time contribute to the averages
complete_trials = df.dropna(subset=['rating', 'reaction_time'])

# One group per participant x block x fat level
grouped_trials = complete_trials.groupby(['participant', 'block', 'fat'])

summary_df = grouped_trials.agg(
    average_rating=('rating', 'mean'),
    average_reaction_time=('reaction_time', 'mean'),
    trial_count=('rating', 'count'),  # number of valid trials behind each average
)
# Turn the group keys back into ordinary columns
summary_df = summary_df.reset_index()

print(summary_df.head())


   participant   block   fat  average_rating  average_reaction_time  \
0            1  choice  high        5.840000            2472.400000   
1            1  choice   low        5.780000            2309.000000   
2            1  health  high        5.666667            1996.190476   
3            1  health   low        5.792453            2410.943396   
4            1   taste  high        5.884615            2208.076923   

   trial_count  
0           25  
1           50  
2           21  
3           53  
4           26  


In [None]:
# Reshape from long to wide: one row per participant, and one column for every
# statistic x block x fat-level combination
wide_df = pd.pivot_table(
    summary_df,
    index='participant',
    columns=['block', 'fat'],
    values=['average_rating', 'average_reaction_time'],
)




In [None]:
# Inspect the column labels produced by the pivot
wide_df.columns

MultiIndex([(       'average_rating', 'choice', 'high'),
            (       'average_rating', 'choice',  'low'),
            (       'average_rating', 'health', 'high'),
            (       'average_rating', 'health',  'low'),
            (       'average_rating',  'taste', 'high'),
            (       'average_rating',  'taste',  'low'),
            ('average_reaction_time', 'choice', 'high'),
            ('average_reaction_time', 'choice',  'low'),
            ('average_reaction_time', 'health', 'high'),
            ('average_reaction_time', 'health',  'low'),
            ('average_reaction_time',  'taste', 'high'),
            ('average_reaction_time',  'taste',  'low')],
           names=[None, 'block', 'fat'])

In [None]:
# Flatten the column names: join each (statistic, block, fat) label into one string
# such as 'average_rating_choice_high', then turn the participant index into a column.
# The guard makes the cell safe to re-run: once the columns are flat strings there is
# nothing left to unpack, and flattening again would raise an error.
if isinstance(wide_df.columns, pd.MultiIndex):
    wide_df.columns = [f'{stat}_{block}_{fat}' for stat, block, fat in wide_df.columns]
    wide_df = wide_df.reset_index()



   participant  average_rating_choice_high  average_rating_choice_low  \
0            1                    5.840000                   5.780000   
1            2                    4.111111                   5.345455   
2            3                    5.277778                   5.196429   
3            4                    3.727273                   5.492063   
4            5                    6.105263                   5.296296   

   average_rating_health_high  average_rating_health_low  \
0                    5.666667                   5.792453   
1                    5.500000                   5.696429   
2                    5.750000                   5.322034   
3                    5.222222                   5.927273   
4                    5.588235                   5.625000   

   average_rating_taste_high  average_rating_taste_low  \
0                   5.884615                  6.088889   
1                   6.608696                  5.313725   
2                   4.8571

In [None]:
# Inspect the column labels again after the flattening step
wide_df.columns

Index(['participant', 'average_rating_choice_high',
       'average_rating_choice_low', 'average_rating_health_high',
       'average_rating_health_low', 'average_rating_taste_high',
       'average_rating_taste_low', 'average_reaction_time_choice_high',
       'average_reaction_time_choice_low', 'average_reaction_time_health_high',
       'average_reaction_time_health_low', 'average_reaction_time_taste_high',
       'average_reaction_time_taste_low'],
      dtype='object')

In [None]:
# Here try and simulate a different dataset - a monetary choice task where the participant
# selects between an immediate vs delayed reward. Compare the RT between when the participant
# chooses the immediate vs delayed option

In [None]:
# Read the example delay-discounting CSV directly from its URL
# (no local directory navigation is involved)
data = pd.read_csv(
    "https://raw.githubusercontent.com/CaitlinLloyd/Psychology_Programming2025/refs/heads/main/Data/DelayDisc_example.csv"
)

In [None]:
# Display the loaded data
data

Unnamed: 0,onset,rt,choice,money_left,delay_left,money_right,delay_right,participant
0,17.009420,4.48,,23.21,131,10.99,51,1
1,28.013655,3.16,1.0,16.43,32,9.99,19,1
2,43.017407,4.14,1.0,38.44,33,32.02,12,1
3,56.021820,4.47,1.0,38.66,100,26.57,24,1
4,71.024792,3.63,1.0,29.54,142,27.76,6,1
...,...,...,...,...,...,...,...,...
115,344.112863,5.42,1.0,30.54,38,37.25,132,2
116,357.117812,4.18,2.0,23.07,82,29.27,140,2
117,368.121754,3.32,1.0,33.18,165,27.61,104,2
118,383.125799,6.00,1.0,6.15,51,14.76,177,2


In [None]:
# Add a new column filled with the placeholder string "none" in every row;
# assigning a single value to a column broadcasts it to all rows
data['delayed_opt']= "none"
# Display the data to check the new column
data

Unnamed: 0,onset,rt,choice,money_left,delay_left,money_right,delay_right,participant,delayed_opt
0,17.009420,4.48,,23.21,131,10.99,51,1,none
1,28.013655,3.16,1.0,16.43,32,9.99,19,1,none
2,43.017407,4.14,1.0,38.44,33,32.02,12,1,none
3,56.021820,4.47,1.0,38.66,100,26.57,24,1,none
4,71.024792,3.63,1.0,29.54,142,27.76,6,1,none
...,...,...,...,...,...,...,...,...,...
115,344.112863,5.42,1.0,30.54,38,37.25,132,2,none
116,357.117812,4.18,2.0,23.07,82,29.27,140,2,none
117,368.121754,3.32,1.0,33.18,165,27.61,104,2,none
118,383.125799,6.00,1.0,6.15,51,14.76,177,2,none


In [None]:
# Work out which side offers the delayed (longer-delay) option: 1 = left, 2 = right.
# Start from NaN rather than a text placeholder so the column stays numeric; mixing
# strings and numbers in one column is something newer pandas warns about or rejects.
# Trials where both delays are equal have no delayed option and remain NaN.
data['delayed_opt'] = np.nan
data.loc[data['delay_left'] < data['delay_right'], 'delayed_opt'] = 2
data.loc[data['delay_left'] > data['delay_right'], 'delayed_opt'] = 1

In [None]:
# Record whether the participant chose the delayed option: 1 = chose delayed,
# 0 = chose sooner, NaN = cannot be scored.
# A trial can only be scored when a choice was recorded AND one side really is the
# delayed option. Without this check, missed trials (no choice) were coded 0 and
# were wrongly counted as "chose sooner".
data['delayed_opt_chose'] = np.nan

has_delayed_option = data['delayed_opt'].isin([1, 2])
scorable = data['choice'].notna() & has_delayed_option
picked_delayed = data['delayed_opt'] == data['choice']

data.loc[scorable & picked_delayed, 'delayed_opt_chose'] = 1
data.loc[scorable & ~picked_delayed, 'delayed_opt_chose'] = 0

In [None]:
# Rows for participant 1 where delayed_opt_chose equals 0
data.loc[data['participant'].eq(1) & data['delayed_opt_chose'].eq(0)]

Unnamed: 0,onset,rt,choice,money_left,delay_left,money_right,delay_right,participant,delayed_opt,delayed_opt_chose
0,17.00942,4.48,,23.21,131,10.99,51,1,1,0
9,134.04283,5.86,2.0,19.62,150,19.25,27,1,1,0
10,145.047487,4.71,1.0,12.26,37,13.64,75,1,2,0
12,173.055158,5.7,1.0,13.91,11,14.19,94,1,2,0
13,188.061036,3.27,2.0,13.72,66,13.51,9,1,1,0
15,218.071854,3.52,2.0,16.31,132,12.65,21,1,1,0
16,231.077068,4.48,1.0,21.56,16,27.53,154,1,2,0
20,283.094411,4.96,1.0,19.91,84,20.26,159,1,2,0
21,298.098435,5.22,1.0,10.25,42,10.4,178,1,2,0
22,309.101121,4.67,1.0,21.87,89,23.26,168,1,2,0


In [None]:
# Now summarize the RT for each person when they chose the delayed vs the sooner reward

# Missed trials (no recorded choice) are neither a delayed nor a sooner choice,
# so they must not contribute to either average
answered = data['choice'].notna()

for participant in range(1, 3):
    for x in [1, 0]:  # 1 = chose delayed, 0 = chose sooner

        # This participant's answered trials with the current choice type
        choice_trials = data[
            answered
            & (data['participant'] == participant)
            & (data['delayed_opt_chose'] == x)
        ]

        # Drop any missing RTs before averaging
        valid_rts = choice_trials['rt'].dropna()

        # Calculate the average RT
        average_RT = valid_rts.mean()
        print(f"Participant {participant}'s average RT: {average_RT:.2f} for {x} trials")




Participant 1's average RT: 4.42 for 1 trials
Participant 1's average RT: 4.25 for 0 trials
Participant 2's average RT: 4.66 for 1 trials
Participant 2's average RT: 4.91 for 0 trials


In [None]:
## Here calculate the average earnings per person and the number of times they chose delayed vs sooner

# Earnings on a trial are the amount attached to the chosen side (1 = left, 2 = right);
# trials without a recorded choice keep NaN earnings
chose_left = data['choice'] == 1
chose_right = data['choice'] == 2
data['earnings'] = np.nan
data.loc[chose_left, 'earnings'] = data.loc[chose_left, 'money_left']
data.loc[chose_right, 'earnings'] = data.loc[chose_right, 'money_right']

# now loop
for participant in range(1, 3):

    # get this participant's data
    person_data = data[data['participant'] == participant]

    # drop missing earnings
    valid_earnings = person_data['earnings'].dropna()

    # calculate average earnings
    average_earnings = valid_earnings.mean()

    # count delayed vs sooner, using answered trials only: a missed trial
    # is not a "sooner" choice and must not be counted as one
    answered = person_data['choice'].notna()
    delayed_chose = (answered & (person_data['delayed_opt_chose'] == 1)).sum()
    sooner_chose = (answered & (person_data['delayed_opt_chose'] == 0)).sum()

    print(f"Participant {participant}'s average earnings: {average_earnings:.2f}")
    print(f"  chose delayed: {delayed_chose} times")
    print(f"  chose sooner : {sooner_chose} times")
## Upload your solution to GitHub

## Extra
## These are hard exercises - not homework, for extra practice

In [None]:
# Hard

# Here simulate your own Delay Discounting Task and calculate some average metrics

In [None]:
# Very hard
# One outcome of interest is the discount rate, k, which denotes the extent to which someone
# discounts the value of delayed rewards (higher values = less patient)

# Here you can use ChatGPT to get the formula for k - see whether you can calculate it for
# each person in your dataset