In [7]:
import pandas as pd
import random
import string

# Function to generate random email
def generate_email():
    """Build a random Gmail address.

    Returns:
        tuple[str, str]: ``(address, local_part)`` where ``local_part`` is 10
        random characters drawn from lowercase ASCII letters and digits, and
        ``address`` is ``local_part`` followed by ``@gmail.com``.
    """
    alphabet = string.ascii_lowercase + string.digits
    local_part = ''.join(random.choices(alphabet, k=10))
    address = f"{local_part}@gmail.com"
    return address, local_part

# Function to generate random consent (0 or 1)
def generate_consent():
    """Return a randomly chosen consent flag, either 0 or 1."""
    flags = (0, 1)
    return random.choice(flags)

# Function to generate random age (14-99)
def generate_age():
    """Return a random integer age between 14 and 99, both ends included."""
    youngest, oldest = 14, 99
    return random.randint(youngest, oldest)

# Function to generate random sex
def generate_sex():
    """Return one of the three sex categories, picked uniformly at random."""
    categories = ('Female', 'Male', 'Other/No Response')
    return random.choice(categories)

# Function to generate random ethnicity
def generate_ethnicity():
    """Return one of the three ethnicity categories, picked uniformly at random."""
    categories = ('Hispanic/Latino', 'Not Hispanic/Latino', 'No Response')
    return random.choice(categories)

# Function to generate random race
def generate_race():
    """Return one of the seven race categories, picked uniformly at random."""
    categories = (
        'American Indian/Alaska Native',
        'Asian',
        'Native Hawaiian/Pacific Islander',
        'Black/African American',
        'White/Caucasian',
        'More than one race',
        'No Response',
    )
    return random.choice(categories)

# Function to generate random VVIQ responses (1-4)
def generate_vviq_responses():
    """Return a list of 16 random VVIQ item responses.

    Each response is an independent random integer from 1 to 4 inclusive.
    """
    responses = []
    for _ in range(16):
        responses.append(random.randint(1, 4))
    return responses

# Function to generate random ideal participation (0 or 1)
def generate_ideal_participation():
    """Return a randomly chosen ideal-participation flag, either 0 or 1."""
    flags = (0, 1)
    return random.choice(flags)

# Function to generate random attention check task
def generate_attention_check_task():
    """Return one of the five attention-check task labels, picked uniformly at random."""
    task_labels = ('Pictures', 'Abstract words', 'Concrete words', 'Numbers', 'Symbols')
    return random.choice(task_labels)

# Main function to generate the Parquet files
def generate_parquet_files(num_files, output_dir='./data'):
    """Write ``num_files`` single-row Parquet files of random survey data.

    Every file holds one randomly generated participant record (email,
    consent flag, age, sex, ethnicity, race, 16 VVIQ responses, ideal
    participation flag and attention-check task). The file is named after
    the local part of the participant's generated email address.

    Args:
        num_files: Number of files to write. ``0`` writes nothing but still
            ensures the output directory exists.
        output_dir: Directory that receives the files. It is created
            (including parents) when missing. Defaults to ``'./data'``,
            the location the original hard-coded path used.
    """
    # Local import: in this notebook `os` is only imported by a later cell.
    import os

    # Make sure the target directory exists so the first write on a fresh
    # checkout (no ./data folder yet) does not fail.
    os.makedirs(output_dir, exist_ok=True)

    # The column layout is identical for every file, so build it once.
    columns = ['EmailAddress', 'FutureContactConsent', 'Age', 'Sex', 'Ethnicity', 'Race'] + \
              [f'VVIQ_Q{i+1}' for i in range(16)] + ['IdealParticipation', 'AttentionCheckTask']

    for _ in range(num_files):
        # Field generators are called in a fixed order so that a seeded
        # `random` module reproduces the same records.
        email, filename = generate_email()
        consent = generate_consent()
        age = generate_age()
        sex = generate_sex()
        ethnicity = generate_ethnicity()
        race = generate_race()
        vviq_responses = generate_vviq_responses()
        ideal_participation = generate_ideal_participation()
        attention_check_task = generate_attention_check_task()

        # Values in the same order as `columns`
        row = [email, consent, age, sex, ethnicity, race] + vviq_responses + \
              [ideal_participation, attention_check_task]

        # One-row frame -> one Parquet file per participant
        df = pd.DataFrame([row], columns=columns)
        df.to_parquet(os.path.join(output_dir, f'{filename}.parquet'), index=False)


In [8]:
# Generate 200 synthetic participant records; generate_parquet_files writes
# each one as its own single-row Parquet file under ./data/.
generate_parquet_files(200)

In [9]:
import boto3

# S3 API client pinned to the us-east-1 region
s3 = boto3.client('s3', region_name='us-east-1')

# Bucket that will receive the generated Parquet files
bucket_name = 'exp-data-parquet'

# Try to create the bucket and report the outcome instead of raising
try:
    s3.create_bucket(Bucket=bucket_name)
except s3.exceptions.BucketAlreadyOwnedByYou:
    print(f'Bucket {bucket_name} already exists and is owned by you.')
except Exception as err:
    print(f'Error creating bucket: {err}')
else:
    print(f'Bucket {bucket_name} created successfully.')

Bucket exp-data-parquet created successfully.


In [10]:
import os
import subprocess

# Function to upload a file to S3 using AWS CLI
def upload_file_to_s3_cli(file_name, bucket):
    """Upload one local file to the root of an S3 bucket with ``aws s3 cp``.

    The AWS CLI is invoked with an argument list and no shell, so spaces or
    shell metacharacters in ``file_name`` or ``bucket`` are passed through
    literally instead of being interpreted.

    Args:
        file_name: Path of the local file to upload.
        bucket: Name of the destination bucket (without the ``s3://`` prefix).

    Returns:
        bool: ``True`` when the CLI exits with status 0, ``False`` when it
        exits non-zero or the ``aws`` executable cannot be found. The CLI's
        stdout (on success) or the error text (on failure) is printed.
    """
    command = ["aws", "s3", "cp", file_name, f"s3://{bucket}/"]
    try:
        result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print(f"Error uploading {file_name}: {e.stderr.decode('utf-8')}")
        return False
    except FileNotFoundError as e:
        # Without a shell, a missing `aws` binary raises here instead of
        # producing a non-zero exit status; report it the same way.
        print(f"Error uploading {file_name}: {e}")
        return False
    print(result.stdout.decode('utf-8'))
    return True

# Local folder holding the generated Parquet files
directory = './data/'

# Destination bucket on S3
bucket_name = 'exp-data-parquet'

# Walk the folder and push every .parquet file to the bucket, reporting each result
for filename in os.listdir(directory):
    if not filename.endswith(".parquet"):
        continue  # ignore anything that is not a Parquet file
    file_path = os.path.join(directory, filename)
    success = upload_file_to_s3_cli(file_path, bucket_name)
    print(
        f"Successfully uploaded {filename} to {bucket_name}"
        if success
        else f"Failed to upload {filename}"
    )


upload: data/in0ygjwyzi.parquet to s3://exp-data-parquet/in0ygjwyzi.parquet

Successfully uploaded in0ygjwyzi.parquet to exp-data-parquet
upload: data/70z5w73hrx.parquet to s3://exp-data-parquet/70z5w73hrx.parquet

Successfully uploaded 70z5w73hrx.parquet to exp-data-parquet
upload: data/gu4zap65kq.parquet to s3://exp-data-parquet/gu4zap65kq.parquet

Successfully uploaded gu4zap65kq.parquet to exp-data-parquet
upload: data/0yofabs7se.parquet to s3://exp-data-parquet/0yofabs7se.parquet

Successfully uploaded 0yofabs7se.parquet to exp-data-parquet
upload: data/48mq5t4v6e.parquet to s3://exp-data-parquet/48mq5t4v6e.parquet

Successfully uploaded 48mq5t4v6e.parquet to exp-data-parquet
upload: data/sbu3uy8wre.parquet to s3://exp-data-parquet/sbu3uy8wre.parquet

Successfully uploaded sbu3uy8wre.parquet to exp-data-parquet
upload: data/zixlh6hbh5.parquet to s3://exp-data-parquet/zixlh6hbh5.parquet

Successfully uploaded zixlh6hbh5.parquet to exp-data-parquet
upload: data/2tmd1xetkv.parquet to