In [70]:
!pip install polars



In [71]:
!pip install s3fs



In [72]:
!pip install awswrangler



In [73]:
import polars as pl
from polars import DataFrame
import s3fs
import boto3
import awswrangler as wr 
from awswrangler.typing import GlueTableSettings

In [74]:
# Maps each Glue table name to the S3 URI of the CSV file it is loaded from.
# Insertion order is the order in which the tables are (re)created.
datasets = {
    "service_history": "s3://query-gen-hackathon/sample_data_employee/service_history_dataset.csv",
    "employees": "s3://query-gen-hackathon/sample_data_employee/employee.csv",
    "departments": "s3://query-gen-hackathon/sample_data_employee/department.csv",
    "enrollments": "s3://query-gen-hackathon/sample_data_employee/enrollment.csv",
    "services": "s3://query-gen-hackathon/sample_data_employee/services.csv",
}

In [75]:
def create_table(
    bucket_path,
    table,
    database="finetune_llm_querygen",
    catalog_id="529088288102",
    region_name="us-east-1",
    output_root="s3://query-gen-hackathon/sample_data_employee/output",
):
    """Load one CSV from S3 and publish it as a Parquet-backed Glue table.

    The CSV is read with polars, any existing Glue table of the same name is
    dropped, and the data is rewritten as snappy-compressed Parquet under
    ``{output_root}/{table}/`` while being registered in the Glue catalog.

    Args:
        bucket_path: S3 URI of the source CSV file.
        table: Name of the Glue table to (re)create; also used as the
            output sub-folder and in the table description.
        database: Glue database that holds the table.
        catalog_id: Glue catalog id passed to awswrangler.
        region_name: AWS region used for the boto3 session.
        output_root: S3 prefix under which the Parquet dataset is written.

    Returns:
        None.
    """
    # Only the CSV parse needs the S3 file handle; release it before the
    # slower Glue / Parquet calls run.
    with s3fs.S3FileSystem().open(bucket_path, mode="rb") as csv_file:
        frame = pl.read_csv(csv_file, infer_schema_length=1000)

    session = boto3.Session(region_name=region_name)

    # Drop the previous definition so the table is rebuilt from this CSV.
    wr.catalog.delete_table_if_exists(
        database=database,
        table=table,
        catalog_id=catalog_id,
        boto3_session=session,
    )
    wr.s3.to_parquet(
        df=frame.to_pandas(),
        path=f"{output_root}/{table}/",
        index=False,
        compression="snappy",
        use_threads=True,
        boto3_session=session,
        dataset=True,
        mode="overwrite_partitions",
        schema_evolution=True,
        database=database,
        table=table,
        catalog_id=catalog_id,
        glue_table_settings=GlueTableSettings(
            table_type="EXTERNAL_TABLE",
            # Previously hard-coded to "department table" for every table.
            description=f"{table} table",
        ),
    )
        
        

In [76]:
# Register every dataset as a Glue table, echoing each name/URI pair as progress.
for table in datasets:
    bucket_path = datasets[table]
    print(table, bucket_path)
    create_table(bucket_path=bucket_path, table=table)

service_history s3://query-gen-hackathon/sample_data_employee/service_history_dataset.csv
employees s3://query-gen-hackathon/sample_data_employee/employee.csv
departments s3://query-gen-hackathon/sample_data_employee/department.csv
enrollments s3://query-gen-hackathon/sample_data_employee/enrollment.csv
services s3://query-gen-hackathon/sample_data_employee/services.csv
