# Accessing data with the `dask` framework

## Libraries

In [None]:
import os
import re
import boto3
import json
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress

## Credentials and dask client

In [None]:
# Name of the object-storage bucket that the rest of the notebook reads from.
BUCKET = 'miba-projects-21-22-sites'

# Load the S3 credentials from a local JSON file. The notebook later reads the
# 'aws_access_key_id' and 'aws_secret_access_key' entries from this mapping.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale encoding.
with open('access.json', encoding='utf-8') as file:
    access_data = json.load(file)

In [None]:
# Start a local Dask cluster: 4 workers, one thread each, 8GB memory limit
# per worker, and connect a client to it.
client = Client(n_workers=4, threads_per_worker=1, memory_limit='8GB')

# Build the dashboard link that goes through the JupyterHub proxy: the hub
# service prefix comes from the environment, the dashboard entry from the
# scheduler's own service listing.
print(
    'Dask dashboard available at:',
    'https://jhas01.gsom.spbu.ru{}proxy/{}/status'.format(
        os.environ['JUPYTERHUB_SERVICE_PREFIX'],
        client.scheduler_info()['services']['dashboard'],
    ),
)

# Last expression of the cell: displays the client summary in the notebook.
client

## List files with `boto3`

In [None]:
# Low-level S3 client pointed at the Yandex Cloud object-storage endpoint,
# authenticated with the keys loaded from access.json.
session = boto3.session.Session()
s3 = session.client(
    service_name='s3',
    aws_access_key_id=access_data['aws_access_key_id'],
    aws_secret_access_key=access_data['aws_secret_access_key'],
    # Use TLS so that authenticated requests and the returned data are
    # encrypted in transit (the plain-HTTP endpoint sends them in the clear).
    endpoint_url='https://storage.yandexcloud.net'
)

In [None]:
# Collect every object key in the bucket.
# A single list_objects call returns at most 1000 keys, so walk all result
# pages with a paginator. Pages of an empty bucket carry no 'Contents' entry,
# hence the .get() with an empty default instead of direct indexing.
all_files = [
    obj['Key']
    for page in s3.get_paginator('list_objects').paginate(Bucket=BUCKET)
    for obj in page.get('Contents', [])
]
print('first files:', all_files[:10])

## Single file

In [None]:
# Build the s3:// URL of the second listed object and open it lazily with Dask.
file_path = f's3://{BUCKET}/{all_files[1]}'
print('file path to load:', file_path)
ddf = dd.read_csv(
    file_path,
    # Credentials and endpoint are forwarded to the S3 filesystem backend.
    storage_options={
        'key': access_data['aws_access_key_id'],
        'secret': access_data['aws_secret_access_key'],
        'client_kwargs': {
            # Use TLS so that authenticated requests and the file contents
            # are encrypted in transit.
            'endpoint_url': 'https://storage.yandexcloud.net',
        }
    },
    # Treat integer-looking columns as floats so missing values in later
    # partitions do not break dtype inference.
    assume_missing=True,
    # Space-separated file without a header row: columns are named 0, 1, 2, ...
    sep=' ',
    header=None
)
print('partitions:', ddf.npartitions)
# Last expression of the cell: a lazy summary-statistics frame (nothing is
# computed until .compute() is called on it).
ddf.describe()

In [None]:
# Preview the first five rows of the frame.
ddf.head(n=5)

In [None]:
%%time
ddf[0].count().compute()

In [None]:
%%time
ddf[0].unique().compute()