## Ingestion: Kaggle → S3
Pulls raw datasets from Kaggle and writes them to the S3 raw layer.

Sources: asaniczka/mountain-climbing-accidents-dataset, siddharth0935/himalayan-expeditions

Destination: s3://secret/raw/

In [0]:
# ── INSTALL ──────────────────────────────────────────
%pip install kagglehub[pandas-datasets]

In [0]:
%run /Workspace/Repos/nikum.vedansh@gmail.com/himalayan-expeditions-project/0_scripts/configs/credentials

In [0]:
%run /Workspace/Repos/nikum.vedansh@gmail.com/himalayan-expeditions-project/0_scripts/configs/config

In [0]:
# ── IMPORTS ──────────────────────────────────────────
import kagglehub
from kagglehub import KaggleDatasetAdapter

import boto3
from io import StringIO
import os

In [0]:
# Shared S3 client used by the cells below. Credentials and region are read
# from environment variables; a missing variable raises KeyError right here.
s3_client = boto3.client(
    service_name="s3",
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    region_name=os.environ["AWS_DEFAULT_REGION"],
)

In [0]:
def file_exists_in_s3(bucket, key):
    """Return True if an object exists at ``key`` in ``bucket``.

    Issues a HEAD request through the module-level ``s3_client``.

    Args:
        bucket: Name of the S3 bucket.
        key: Object key inside the bucket.

    Returns:
        True when the HEAD request succeeds, False when S3 reports that the
        object is missing.

    Raises:
        ClientError: For any other S3 error response (for example a 403),
            so that a permissions or configuration problem is not mistaken
            for "file not present". Non-S3 errors (network, credentials)
            also propagate unchanged.
    """
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
    except s3_client.exceptions.ClientError as err:
        # A HEAD response has no body, so a missing key is reported with the
        # bare status code "404"; the named variants are accepted as well.
        error_code = err.response.get("Error", {}).get("Code")
        if error_code in ("404", "NoSuchKey", "NotFound"):
            return False
        raise
    return True

In [0]:
# ── INGEST ───────────────────────────────────────────
# For each entry in DATASETS: skip it when the target object is already in S3,
# otherwise load the file from Kaggle into pandas and upload it as a CSV.
# Outcomes are collected in `uploaded`, `skipped` and `failed`.
print("Starting ingestion...\n")
uploaded = []
skipped = []
failed = []


def _redecode_latin1_as_utf8(value):
    """Reinterpret a latin1-decoded string as UTF-8; pass non-strings through.

    Working per value (instead of through the ``.str`` accessor) keeps
    numbers, booleans and NaN in mixed object columns intact.
    """
    if not isinstance(value, str):
        return value
    # NOTE(review): errors="ignore" silently drops bytes that are not valid
    # UTF-8, so genuine latin1 characters are lost — confirm this is intended.
    return value.encode("latin1").decode("utf-8", errors="ignore")


for dataset in DATASETS:
    try:
        # Object key = s3_path with the "s3://<bucket>/" prefix removed + file name
        # (assumes s3_path ends with "/" — TODO confirm against the config)
        key = dataset["s3_path"].replace(f"s3://{S3_BUCKET}/", "") + dataset["file"]

        if file_exists_in_s3(S3_BUCKET, key):
            skipped.append(dataset["file"])
            continue

        pandas_df = kagglehub.dataset_load(
            KaggleDatasetAdapter.PANDAS,
            dataset["kaggle_id"],
            dataset["file"],
            pandas_kwargs={"encoding": dataset["encoding"]}
        )

        if dataset["encoding"] == "latin1":
            # Only object columns can hold text; other dtypes are left untouched
            pandas_df = pandas_df.apply(
                lambda col: col.map(_redecode_latin1_as_utf8)
                if col.dtype == 'object' else col
            )

        # Serialize to text, then encode explicitly so the uploaded bytes are
        # UTF-8 regardless of how the client would treat a str body
        csv_text = pandas_df.to_csv(index=False)

        s3_client.put_object(
            Bucket=S3_BUCKET,
            Key=key,
            Body=csv_text.encode("utf-8")
        )

        uploaded.append(dataset["file"])

    except Exception as e:
        # Best-effort: record the failure and continue with the next dataset
        failed.append(dataset["file"])
        print(f"❌ Failed {dataset['file']}: {str(e)}")

def print_summary(label, files):
    """Print a labelled, bulleted list of file names followed by a blank line.

    Args:
        label: Heading text; the number of files is appended in parentheses.
        files: Sized collection of file names. When it is empty a single
            "none" bullet is printed instead.
    """
    heading = f"{label} ({len(files)}):"
    bullets = [f"   - {name}" for name in files] if files else ["   - none"]
    print("\n".join([heading, *bullets]))
    print()

# Report every outcome bucket in a fixed order, then mark the end of the run
for heading, file_names in (
    ("✅ Uploaded", uploaded),
    ("⏩ Skipped", skipped),
    ("❌ Failed", failed),
):
    print_summary(heading, file_names)
print("✅ Ingestion complete")