In [1]:
# Load IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import duckdb
import boto3
import os
import pandas as pd
import requests
from pathlib import Path

import cmf.locations as loc

# Fetch the temporary AWS credentials exposed to this container. The relative
# URI comes from the environment and is appended to the fixed link-local host.
# The timeout stops the kernel hanging forever if the endpoint is unreachable,
# and raise_for_status() surfaces an HTTP error here rather than as a confusing
# KeyError later, when the credential fields are read.
r = requests.get(
    'http://169.254.170.2' +
    os.environ['AWS_CONTAINER_CREDENTIALS_RELATIVE_URI'],
    timeout=10,
)
r.raise_for_status()

# Parsed JSON payload; the DuckDB setup cell reads 'AccessKeyId',
# 'SecretAccessKey' and 'Token' from it. These are secrets: never display or
# print this object in a shared notebook.
AWS_CREDS = r.json()
# Path of the DuckDB httpfs extension file under the project's scratch folder.
HTTPFS_PATH = loc.PROJECT_DIR / 'scratch' / 'httpfs.duckdb_extension'

## Read from team S3: `boto3`

In [3]:
# Download the dummy CSV from the team's S3 area with boto3 and load it into
# pandas. The bucket name is read from S3_BUCKET, exactly as the DuckDB cells
# do, instead of being hard-coded in this one cell.
client = boto3.client('s3', region_name=os.environ['S3_REGION'])
response = client.get_object(
    Bucket=os.environ['S3_BUCKET'],
    # The prefix is concatenated directly, so it is expected to end with '/'.
    Key=os.environ['S3_PREFIX_TEAM_DDAT_DATA_SCIENCE'] + '.tmp/dummy.csv',
)
# response['Body'] is handed straight to read_csv as a file-like stream.
df = pd.read_csv(response['Body'])
df

Unnamed: 0,foo,bar
0,a,1
1,b,2
2,c,3


## Read/write from team S3: `duckdb`

In [4]:
# Open an in-memory DuckDB connection (no database file is created); the
# remaining cells run their SQL through this handle.
con = duckdb.connect(database=':memory:')

In [5]:
# Install and load the httpfs extension from the local extension file, then
# give DuckDB the region and the temporary credentials it needs to reach S3.
# Reference: https://duckdb.org/docs/extensions/httpfs.html
httpfs_extension = HTTPFS_PATH.resolve()  # resolved once, used twice below

con.query(f"""
    install '{httpfs_extension}';
    load '{httpfs_extension}';
    set s3_region='{os.environ['S3_REGION']}';
    set s3_access_key_id='{AWS_CREDS['AccessKeyId']}';
    set s3_secret_access_key='{AWS_CREDS['SecretAccessKey']}';
    set s3_session_token='{AWS_CREDS['Token']}';
""")

In [6]:
# Build the S3 URI of the dummy CSV in the team's '.tmp' folder.
# rstrip('/') removes the prefix's trailing slash before joining; unlike a
# blind [:-1] slice it cannot chop a real character if the prefix is ever
# supplied without a trailing slash.
dummy_csv_uri = 's3://' + '/'.join([
    os.environ['S3_BUCKET'],
    os.environ['S3_PREFIX_TEAM_DDAT_DATA_SCIENCE'].rstrip('/'),
    '.tmp',
    'dummy.csv',
])

# Query the CSV in place through the connection configured for S3 access.
con.query(f"""
    select *
    from '{dummy_csv_uri}';
""")

┌─────────┬───────┐
│   foo   │  bar  │
│ varchar │ int64 │
├─────────┼───────┤
│ a       │     1 │
│ b       │     2 │
│ c       │     3 │
└─────────┴───────┘

In [7]:
# Small two-column frame used as the payload for the S3 write test.
df = pd.DataFrame(
    {
        'col1': ['alpha', 'beta'],
        'col2': [3.14, 2.72],
    }
)

In [8]:
# Destination of the write test: a Parquet file in the team's '.tmp' folder.
# rstrip('/') removes the prefix's trailing slash before joining; unlike a
# blind [:-1] slice it cannot chop a real character if the prefix is ever
# supplied without a trailing slash.
dummy_out_uri = 's3://' + '/'.join([
    os.environ['S3_BUCKET'],
    os.environ['S3_PREFIX_TEAM_DDAT_DATA_SCIENCE'].rstrip('/'),
    '.tmp',
    'dummy_out.parquet',
])

# `df` inside the SQL refers to the pandas DataFrame of that name in the
# notebook namespace; its rows are what gets copied to S3.
con.query(f"""
    copy df
    to '{dummy_out_uri}'
""")

In [9]:
# Read the Parquet file back from the team's '.tmp' folder to confirm the
# write round-trips.
# rstrip('/') removes the prefix's trailing slash before joining; unlike a
# blind [:-1] slice it cannot chop a real character if the prefix is ever
# supplied without a trailing slash.
written_parquet_uri = 's3://' + '/'.join([
    os.environ['S3_BUCKET'],
    os.environ['S3_PREFIX_TEAM_DDAT_DATA_SCIENCE'].rstrip('/'),
    '.tmp',
    'dummy_out.parquet',
])

con.query(f"""
    select *
    from '{written_parquet_uri}';
""")

┌─────────┬────────┐
│  col1   │  col2  │
│ varchar │ double │
├─────────┼────────┤
│ alpha   │   3.14 │
│ beta    │   2.72 │
└─────────┴────────┘

## Use team S3 as temporary `duckdb` storage

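Inconclusive: DuckDB accepts the S3 path as `temp_directory` (see the output below), but I couldn't force it to actually spill to that location from this notebook. To be retried in production.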

In [10]:
# Point DuckDB's temp_directory at the team's '.tmp' folder on S3, then echo
# the setting back to confirm it was accepted.
# rstrip('/') removes the prefix's trailing slash before joining; unlike a
# blind [:-1] slice it cannot chop a real character if the prefix is ever
# supplied without a trailing slash.
s3_temp_dir = 's3://' + '/'.join([
    os.environ['S3_BUCKET'],
    os.environ['S3_PREFIX_TEAM_DDAT_DATA_SCIENCE'].rstrip('/'),
    '.tmp',
])

con.query(f"""
    set temp_directory='{s3_temp_dir}';
    select current_setting('temp_directory');
""")

┌─────────────────────────────────────────────────────────────────────┐
│                  current_setting('temp_directory')                  │
│                               varchar                               │
├─────────────────────────────────────────────────────────────────────┤
│ s3://jupyter.notebook.uktrade.io/teams/_team_ddat_data_science/.tmp │
└─────────────────────────────────────────────────────────────────────┘

In [11]:
# Report DuckDB's current memory limit.
# Optional toggles kept for manual experiments (presumably to lower the limit
# and try to force a spill to temp_directory — confirm before relying on it):
#   set memory_limit='0.01GB';
#   reset memory_limit;
memory_limit_sql = """
    select current_setting('memory_limit');   
"""
con.query(memory_limit_sql)

┌─────────────────────────────────┐
│ current_setting('memory_limit') │
│             varchar             │
├─────────────────────────────────┤
│ 26.4GB                          │
└─────────────────────────────────┘

In [14]:
# Peek at the first five rows of the processed HMRC exporters Parquet file.
# The path segments are joined with '/', so loc.DATA_SUBDIR['processed'] has
# to be a string here.
exporters_parquet = '/'.join([
    loc.DATA_SUBDIR['processed'],
    'company-matching__full',
    'hmrc_trade__exporters.parquet',
])

con.query(f"""
    select
        *
    from
        '{exporters_parquet}'
    limit 5;
""")

┌───────────────┬───────────┬────────────────┬───┬────────────┬─────────────────┬────────────────┐
│ postcode_area │ unique_id │ comp_num_clean │ … │  name_sig  │ name_sig_first5 │ name_sig_last5 │
│    varchar    │  varchar  │     double     │   │  varchar   │     varchar     │    varchar     │
├───────────────┼───────────┼────────────────┼───┼────────────┼─────────────────┼────────────────┤
│ NE            │ 1         │           NULL │ … │ clzbldjmmg │ clzbl           │ djmmg          │
│ SG            │ 2         │           NULL │ … │ cluuck     │ cluuc           │ luuck          │
│ GU            │ 3         │           NULL │ … │ mdclg      │ mdclg           │ mdclg          │
│ SE            │ 4         │           NULL │ … │ cplddf     │ cpldd           │ plddf          │
│ NP            │ 5         │           NULL │ … │ fvpduc     │ fvpdu           │ vpduc          │
├───────────────┴───────────┴────────────────┴───┴────────────┴─────────────────┴────────────────┤
│ 5 rows  