In [4]:
import boto3
import pandas as pd

# Initialize boto3 S3 client
s3 = boto3.client('s3')

# Define bucket and file names
destination_bucket = 'aggregated-exp-data-parquet'
aggregated_file = 'aggregated_exp_data.parquet'
new_file = 'potential_future_aphan_participants.parquet'

# Local paths for temporary storage
local_aggregated_path = '/tmp/' + aggregated_file
local_new_file_path = '/tmp/' + new_file

# Download the aggregated Parquet file from S3
def download_aggregated_data():
    s3.download_file(destination_bucket, aggregated_file, local_aggregated_path)
    return pd.read_parquet(local_aggregated_path)

# Upload the new Parquet file to S3
def upload_new_file(df):
    df.to_parquet(local_new_file_path, index=False)
    s3.upload_file(local_new_file_path, destination_bucket, new_file)

def upload_filtered_data(df):
    df.to_parquet(local_aggregated_path, index=False)
    s3.upload_file(local_aggregated_path, destination_bucket, aggregated_file)


In [5]:
def filter_potential_future_aphan_participants(df, cutoff):
    filtered_df = df[(df['FutureContactConsent'] == 1) & (df['VVIQ_Score'] <= cutoff)]
    print("Filtered rows for potential future aphantasic participants:")
    print(filtered_df['EmailAddress'].tolist())
    return filtered_df

# Set the AphantasicCutoff value
AphantasicCutoff = 25

# Download, filter, and upload the new file
df = download_aggregated_data()
filtered_df = filter_potential_future_aphan_participants(df, AphantasicCutoff)
upload_new_file(filtered_df)


Filtered rows for potential future aphantasic participants:
['0yofabs7se@gmail.com', '0z710d02cc@gmail.com', '4l8eow9pc5@gmail.com', '9tmhqldp29@gmail.com', 'afdd56ach1@gmail.com', 'cxu7luypuf@gmail.com', 'd0j667ohs4@gmail.com', 'dwlgyfg2jl@gmail.com', 'erwvftxrc5@gmail.com', 'gf8igyszdt@gmail.com', 'h3pusayhz6@gmail.com', 'h7i1gd26hd@gmail.com', 'h9e681stj9@gmail.com', 'lno53dakx5@gmail.com', 'm8hdba3nfe@gmail.com', 'rj2mjtih4z@gmail.com', 't51zn0n1na@gmail.com', 'v9kgvi16a3@gmail.com', 'xntinjn90u@gmail.com', 'xrvunj7ivi@gmail.com', 'yi6684t409@gmail.com']


In [6]:
def check_file_exists(bucket_name, file_key):
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
        print(f"File '{file_key}' successfully created in bucket '{bucket_name}'.")
        return True
    except:
        print(f"File '{file_key}' does not exist in bucket '{bucket_name}'.")
        return False

# Verify the new file
check_file_exists(destination_bucket, new_file)


File 'potential_future_aphan_participants.parquet' successfully created in bucket 'aggregated-exp-data-parquet'.


True

In [7]:
def remove_rows_with_missing_values(df):
    missing_value_rows = df[df.isnull().any(axis=1)]
    if not missing_value_rows.empty:
        print("Rows removed due to missing values:")
        print(missing_value_rows['EmailAddress'].tolist())
    df = df.dropna()
    return df

# Download the data, apply the filter, and upload the result
df = download_aggregated_data()
df = remove_rows_with_missing_values(df)
upload_filtered_data(df)


In [8]:
def remove_rows_with_incorrect_attention_check(df, correct_answer):
    incorrect_attention_rows = df[df['AttentionCheckTask'] != correct_answer]
    if not incorrect_attention_rows.empty:
        print("Rows removed due to incorrect attention check answers:")
        print(incorrect_attention_rows['EmailAddress'].tolist())
    df = df[df['AttentionCheckTask'] == correct_answer]
    return df

# Set the correct attention check answer
AttentionCheckAnswer = "Numbers" 

# Apply the filter
df = download_aggregated_data()
df = remove_rows_with_incorrect_attention_check(df, AttentionCheckAnswer)
upload_filtered_data(df)


Rows removed due to incorrect attention check answers:
['07e9dl4p4m@gmail.com', '0tff6mmdws@gmail.com', '0yofabs7se@gmail.com', '19wspycaqb@gmail.com', '1czlbmtpxs@gmail.com', '1i4dgd97m3@gmail.com', '2gcy5cm42i@gmail.com', '2jth6vvh0d@gmail.com', '2syoniq5tq@gmail.com', '2z8cq3s5rd@gmail.com', '38ml5hf0dx@gmail.com', '3e32vhjzy1@gmail.com', '3eu43sem6r@gmail.com', '3exkl8jxmf@gmail.com', '3gnt8z97ub@gmail.com', '3mvrbcwto0@gmail.com', '3smyws11s4@gmail.com', '3wx2etrpt6@gmail.com', '3x1ri3r8kg@gmail.com', '3xet56l298@gmail.com', '3xonogz2k9@gmail.com', '3y8d6aejaj@gmail.com', '48mq5t4v6e@gmail.com', '4enptdoyoa@gmail.com', '4l8eow9pc5@gmail.com', '4o5opcpx2b@gmail.com', '53yt5qrkcw@gmail.com', '5gq2fahesc@gmail.com', '5gx7uw6y1m@gmail.com', '5zzfim1vwr@gmail.com', '64uy9byg3i@gmail.com', '6acj8jzbwd@gmail.com', '6i01pra377@gmail.com', '6j7tb8t04r@gmail.com', '6jpcr837v4@gmail.com', '70z5w73hrx@gmail.com', '726tb97mzg@gmail.com', '7dvv3ss6ny@gmail.com', '7eylttq5nq@gmail.com', '7rd31d5

In [9]:
def remove_rows_with_unideal_participation(df):
    incorrect_participation_rows = df[df['IdealParticipation'] != 1]
    if not incorrect_participation_rows.empty:
        print("Rows removed due to not participating under ideal conditions:")
        print(incorrect_participation_rows['EmailAddress'].tolist())
    df = df[df['IdealParticipation'] == 1]
    return df

# Apply the filter
df = download_aggregated_data()
df = remove_rows_with_unideal_participation(df)
upload_filtered_data(df)


Rows removed due to not participating under ideal conditions:
['0ew2g7jfla@gmail.com', '0z710d02cc@gmail.com', '2amk5u3cgd@gmail.com', '2c1z3zfvrj@gmail.com', '2tmd1xetkv@gmail.com', '396vvsn8hn@gmail.com', '3r2l39udlp@gmail.com', '3ws86w85le@gmail.com', '4auxgl5oja@gmail.com', '4nar8btra5@gmail.com', '5qnht75xjy@gmail.com', '6o8t8q4b9j@gmail.com', '711o35o9p4@gmail.com', '7p5niy1gtu@gmail.com', '7yd3rs0y65@gmail.com', 'afdd56ach1@gmail.com', 'aqj21awgbf@gmail.com', 'bu0g3hrz1n@gmail.com', 'd0j667ohs4@gmail.com', 'dluorwikdk@gmail.com', 'dw0hrzr11b@gmail.com', 'eecxg1q25q@gmail.com', 'fdzgdgbx1e@gmail.com', 'fezqwgrthi@gmail.com', 'gf8igyszdt@gmail.com', 'h7i1gd26hd@gmail.com', 'hz3m1e0tac@gmail.com', 'i29ui2ssd9@gmail.com', 'ilmelxb2gl@gmail.com', 'in0ygjwyzi@gmail.com', 'l27x4ll7nz@gmail.com', 'la2jrbcf0o@gmail.com', 'lg1b3ebgy4@gmail.com', 'muzom0tm6x@gmail.com', 'qril0pv7t1@gmail.com', 'rlanxfd9h8@gmail.com', 'ryjoedcwup@gmail.com', 'sarluoizuz@gmail.com', 'sxrl60bne2@gmail.com', '

In [10]:
def remove_rows_with_out_of_range_age(df, min_age, max_age):
    out_of_range_age_rows = df[(df['Age'] < min_age) | (df['Age'] > max_age)]
    if not out_of_range_age_rows.empty:
        print("Rows removed due to age not meeting the criteria:")
        print(out_of_range_age_rows['EmailAddress'].tolist())
    df = df[(df['Age'] >= min_age) & (df['Age'] <= max_age)]
    return df

# Set the age criteria
MinAge = 18
MaxAge = 70

# Apply the filter
df = download_aggregated_data()
df = remove_rows_with_out_of_range_age(df, MinAge, MaxAge)
upload_filtered_data(df)


Rows removed due to age not meeting the criteria:
['0qa175qzmn@gmail.com', '4q052hx31s@gmail.com', '7ckrljnwo3@gmail.com', '7vum8xytf9@gmail.com', 'b0iokv2vul@gmail.com', 'i517p0qqoy@gmail.com', 'i6zk0q6jaf@gmail.com', 'opgab563e6@gmail.com', 'rqqnpjq57k@gmail.com', 'wiotnlup7t@gmail.com', 'xdq687wdte@gmail.com', 'ygechsi2jr@gmail.com', 'z5s0rjsm4p@gmail.com']


In [11]:
import boto3
import pandas as pd
from io import BytesIO

# Set parameters
destination_bucket = 'aggregated-exp-data-parquet'
aggregated_file = 'aggregated_exp_data.parquet'
AphantasicCutoff = 25
HyperphantasicCutoff = 55

# Create S3 client
s3 = boto3.client('s3')

# Download the aggregated Parquet file from S3
obj = s3.get_object(Bucket=destination_bucket, Key=aggregated_file)
data = obj['Body'].read()

# Read the Parquet file content into a DataFrame
df = pd.read_parquet(BytesIO(data))

# Filter data and create new files
aphantasic_participants = df[df['VVIQ_Score'] <= AphantasicCutoff]
typical_imagers = df[(df['VVIQ_Score'] > AphantasicCutoff) & (df['VVIQ_Score'] < HyperphantasicCutoff)]
hyperphantasic_participants = df[df['VVIQ_Score'] >= HyperphantasicCutoff]

# Print the EmailAddress values for each filtered group
print("Aphantasic Participants' Email Addresses:")
print(aphantasic_participants['EmailAddress'].values)

print("Typical Imagers' Email Addresses:")
print(typical_imagers['EmailAddress'].values)

print("Hyperphantasic Participants' Email Addresses:")
print(hyperphantasic_participants['EmailAddress'].values)

# Convert filtered DataFrames to Parquet bytes
aphantasic_parquet = BytesIO()
typical_imagers_parquet = BytesIO()
hyperphantasic_parquet = BytesIO()

aphantasic_participants.to_parquet(aphantasic_parquet, index=False)
typical_imagers.to_parquet(typical_imagers_parquet, index=False)
hyperphantasic_participants.to_parquet(hyperphantasic_parquet, index=False)

# Reset buffer positions to the beginning
aphantasic_parquet.seek(0)
typical_imagers_parquet.seek(0)
hyperphantasic_parquet.seek(0)

# Upload the new files to S3
s3.put_object(Bucket=destination_bucket, Key='aphantasic_participants.parquet', Body=aphantasic_parquet.getvalue())
s3.put_object(Bucket=destination_bucket, Key='typical_imagers.parquet', Body=typical_imagers_parquet.getvalue())
s3.put_object(Bucket=destination_bucket, Key='hyperphantasic_participants.parquet', Body=hyperphantasic_parquet.getvalue())


Aphantasic Participants' Email Addresses:
['7f8gvj6hhc@gmail.com' 'erwvftxrc5@gmail.com' 'm8hdba3nfe@gmail.com'
 'u61oxdvxxa@gmail.com' 'xntinjn90u@gmail.com' 'xrvunj7ivi@gmail.com'
 'ypirj6oxha@gmail.com']
Typical Imagers' Email Addresses:
['18um8nliha@gmail.com' '1hscz1e9gu@gmail.com' '2ue4j4z15b@gmail.com'
 '9x8r3yxn9y@gmail.com' 'amxepijc5p@gmail.com' 'bikk7gl5to@gmail.com'
 'climvbet61@gmail.com' 'fuqip725e4@gmail.com' 'gb1vz651g8@gmail.com'
 'hifdltf66y@gmail.com' 'k15thshzyp@gmail.com' 'r0v6p2gfz5@gmail.com'
 's0thu77s6g@gmail.com' 'sq5n5jbqjl@gmail.com' 'ssb27wwroc@gmail.com'
 'ul1ixkkn2i@gmail.com' 'wi1pqcssne@gmail.com' 'yrrdo2idps@gmail.com'
 'z7eef6qj8u@gmail.com']
Hyperphantasic Participants' Email Addresses:
['0jiwb7eyeq@gmail.com' '487paxn911@gmail.com' '6u22yscwh1@gmail.com'
 'gqy3mtffco@gmail.com' 'hate3kpzc8@gmail.com' 'sbu3uy8wre@gmail.com']


{'ResponseMetadata': {'RequestId': 'AGD2AFR0PNY0GYB8',
  'HostId': 'JQOtpKWvAFrfRGzFzTIMQf7WRCLT3PABmNWiFwUGS9UdlvWYqqZWF5XWC9/WSJsB68iK8XhaQMM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'JQOtpKWvAFrfRGzFzTIMQf7WRCLT3PABmNWiFwUGS9UdlvWYqqZWF5XWC9/WSJsB68iK8XhaQMM=',
   'x-amz-request-id': 'AGD2AFR0PNY0GYB8',
   'date': 'Wed, 22 May 2024 23:19:20 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"699bd7431de639f1059a9c0e9ebaa12d"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"699bd7431de639f1059a9c0e9ebaa12d"',
 'ServerSideEncryption': 'AES256'}