In [None]:
import asyncio
import aioboto3
from botocore.exceptions import NoCredentialsError, ClientError
import pandas as pd

In [None]:
import sys
from pathlib import Path

# Add the parent directory (repo root) to Python path so that `src` is importable.
# NOTE(review): this assumes the kernel's working directory sits directly below
# the repo root (e.g. a notebooks/ folder) — confirm before running elsewhere.
repo_root = Path.cwd().parent
sys.path.insert(0, str(repo_root))

# Force reload the module to get the latest version: a plain `import` is a no-op
# once the module is cached in this kernel, so edits made to the file on disk
# would otherwise not be picked up until the kernel is restarted.
import importlib
import src.utils.data_processing

importlib.reload(src.utils.data_processing)

# Imported after the reload so these names are bound to the reloaded functions.
from src.utils.data_processing import (
    process_file,
    prepare_data_for_analysis,
    process_multiple_files,
)

In [None]:
async def list_s3_fit_files(bucket_name: str = "project-traco-benchmarking",
                           prefix: str = "fit_files/",
                           profile_name: str = "project-traco") -> list:
    """
    List all .fit files under an S3 prefix using aioboto3.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix the listing is restricted to.
        profile_name: AWS profile used to create the aioboto3 session.

    Returns:
        A list of dicts sorted by ``key``, one per object whose key ends in
        ``.fit`` (case-insensitive). Each dict has the entries ``key``,
        ``filename`` (last ``/``-separated component of the key), ``size_mb``
        (object size in MiB, rounded to 2 decimals), ``last_modified`` and
        ``etag`` (surrounding double quotes removed).

        This function never raises: on any error a message is printed and an
        empty list is returned.
    """
    fit_files = []

    try:
        # Build the session inside the try block: session-level failures (for
        # example a profile name that is not configured) must follow the same
        # "print and return []" contract as every other error below.
        session = aioboto3.Session(profile_name=profile_name)

        async with session.client('s3') as s3:
            paginator = s3.get_paginator('list_objects_v2')

            async for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
                # A page without matching objects has no 'Contents' entry.
                for obj in page.get('Contents', []):
                    key = obj['Key']
                    if not key.lower().endswith('.fit'):
                        continue

                    fit_files.append({
                        'key': key,
                        'filename': key.split('/')[-1],
                        'size_mb': round(obj['Size'] / (1024 * 1024), 2),
                        'last_modified': obj['LastModified'],
                        # S3 wraps the ETag in double quotes. It is only an MD5
                        # of the content for some objects (not e.g. multipart
                        # uploads), so treat it as an opaque version identifier.
                        'etag': obj.get('ETag', '').strip('"'),
                    })

    except NoCredentialsError:
        print("Error: AWS credentials not found. Make sure your profile is configured correctly.")
        return []
    except ClientError as e:
        print(f"Error accessing S3: {e}")
        return []
    except Exception as e:
        # Deliberate best-effort: the notebook keeps running with an empty listing.
        print(f"Unexpected error: {e}")
        return []

    return sorted(fit_files, key=lambda x: x['key'])

# Run the async function and list all .fit files
fit_files = await list_s3_fit_files()

print(f"Found {len(fit_files)} .fit files in s3://project-traco-benchmarking/fit_files/")
print("=" * 80)

for i, file_info in enumerate(fit_files, 1):
    print(f"{i:3d}. {file_info['filename']}")
    print(f"     Path: {file_info['key']}")
    print(f"     Size: {file_info['size_mb']} MB")
    print(f"     etag: {file_info['etag']}")
    print(f"     Modified: {file_info['last_modified']}")
    print()

if fit_files:
    total_size = sum(f['size_mb'] for f in fit_files)
    print("=" * 80)
    print(f"Total: {len(fit_files)} files, {total_size:.2f} MB")

In [None]:
# Smoke-test the processing pipeline on one S3 object: the first entry of the listing
if not fit_files:
    print("No files available to test")
else:
    first_file = fit_files[0]
    s3_url = f"s3://project-traco-benchmarking/{first_file['key']}"

    print(f"Testing S3 file processing with: {s3_url}")
    print("This will download the file temporarily and process it...")

    try:
        df_dict = process_file(s3_url, aws_profile="project-traco")

        # Report what came back: the two table shapes and the record columns
        print(f"\nSuccess! Processed file: {first_file['filename']}")
        print(f"Session data shape: {df_dict['session'].shape}")
        print(f"Record data shape: {df_dict['records'].shape}")
        print(f"Available columns: {list(df_dict['records'].columns)}")

        # Peek at the first few rows
        print("\nFirst 3 records:")
        print(df_dict['records'].head(3))

    except Exception as exc:
        print(f"Error: {exc}")

In [None]:
# Address every listed object by its full s3:// URL, then process them in one call
fit_file_urls = [
    f"s3://project-traco-benchmarking/{entry['key']}" for entry in fit_files
]
all_fit_data = process_multiple_files(fit_file_urls, aws_profile="project-traco")

len(all_fit_data)

In [None]:
# Inspect the 'file_id' entry of the first processed file
all_fit_data[0]['file_id']

In [None]:
def _is_polar_pacer(fit_data):
    """Tell whether the first product_name in the file_id entry is "Polar Pacer"."""
    file_id = fit_data["file_id"]
    if "product_name" not in file_id:
        return False
    return file_id["product_name"][0] == "Polar Pacer"


# Keep only the processed files recorded with the product_name "Polar Pacer"
polar_pacer_data = [fit_data for fit_data in all_fit_data if _is_polar_pacer(fit_data)]

len(polar_pacer_data)

In [None]:
def _tags_from_key(key):
    """Return the tag list for an S3 key: the folder directly below the prefix.

    The last '/'-separated component is the filename, so a sub-folder only
    exists when the key has at least three components. Keys without one get
    ['unknown']; the result is always a list so the column has a single type.
    """
    parts = key.split("/")
    return [parts[1]] if len(parts) > 2 else ["unknown"]


# Build the catalogue from the S3 listing. Naming the columns explicitly keeps
# this cell working when the listing is empty (the frame is then empty but
# still has the expected columns instead of raising KeyError).
catalogue_df = (
    pd.DataFrame(fit_files, columns=['key', 'filename', 'size_mb', 'last_modified', 'etag'])
    .assign(
        # Files whose name contains 'pacer' are the device under test, all others are references.
        # NOTE(review): this match is case-sensitive, although the listing accepts
        # '.FIT' in any case — confirm that 'Pacer' should not count as a test file.
        device_type=lambda df: df['filename'].apply(lambda name: 'test' if 'pacer' in name else 'ref'),
        tags=lambda df: df['key'].apply(_tags_from_key),
    )
    .loc[:, ['etag', 'key', 'size_mb', 'device_type', 'tags']]
)

catalogue_df