# Local processing

In [1]:
import dask.dataframe as dd

# Lazily load the tab-separated edge list, naming its two columns explicitly
edges_df = dd.read_csv(
    'data/edges.tsv',
    sep='\t',
    names=['source', 'target'],
)

# Build the incidence list: one entry per source node holding the list of
# all target nodes it points to. `meta` declares the output name and dtype
# up front so Dask does not have to infer them.
targets_by_source = edges_df.groupby('source')['target']
graph_incidence = targets_by_source.apply(
    lambda targets: list(targets),
    meta=('target', 'object'),
).compute()  # materialise the lazy result in local memory

print("First few entries of the graph incidence list:")
print(graph_incidence.head())

First few entries of the graph incidence list:
source
1              [762]
2    [578, 282, 845]
5         [233, 405]
8               [81]
9          [650, 17]
Name: target, dtype: object


# Distributed processing

## Setting up Dask on Google Cloud Platform

To run Dask on GCP, we need:
1. Google Cloud SDK installed and configured
2. A GCP project with the required APIs enabled:
   - Compute Engine API
   - Cloud Resource Manager API
3. Authentication set up (Application Default Credentials)

The following cells authenticate with GCP and then set up a Dask cluster on it.

In [None]:
# Authenticate and create ADC file
!gcloud auth application-default login

# Set the project ID
!gcloud config set project $GCP_PROJECT_ID

# Set the environment variable for Google Application Credentials
!export  GOOGLE_APPLICATION_CREDENTIALS="$HOME/.config/gcloud/application_default_credentials.json"




In [None]:
from dask_cloudprovider.gcp import GCPCluster
from dask.distributed import Client
import os

# Configure your GCP project and zone from environment variables
project = os.environ.get('GCP_PROJECT_ID')
zone = os.environ.get('GCP_ZONE', 'us-east1-b')      # Default to us-east1-b if not set

# Fail fast when no project is configured.
# NOTE(review): the message points at a .env file, but nothing visible in this
# notebook loads one — confirm the variable is exported before the kernel starts.
if not project:
    raise ValueError("GCP_PROJECT_ID environment variable not set. Please set it in your .env file.")

# Create a GCP cluster
cluster = GCPCluster(
    projectid=project,
    zone=zone,
    n_workers=0,  # Start with no workers; one is requested in a later cell
    machine_type="n1-standard-1",  # Machine type for workers
    # filesystem_size=50,           # Disk size in GB
    preemptible=False,
    # worker_class="dask_cloudprovider.gcp.GCPWorker",
    debug=True,
    docker_image="daskdev/dask:latest-py3.11",
    # Extra packages handed to the container through EXTRA_PIP_PACKAGES
    docker_args='-e EXTRA_PIP_PACKAGES="dask-cloudprovider httplib2 cryptography google-api-python-client"',
)

# Create a Dask client connected to the cluster
client = Client(cluster)

# Print cluster dashboard link
print(f"Dask dashboard available at: {client.dashboard_link}")

In [2]:
# Show the logs reported by the cluster, to check that it started correctly
cluster.get_logs()

In [None]:
# Request one worker; the cluster above was created with n_workers=0.
# NOTE(review): `cluster.scale(n)` is the documented method name — confirm
# that `scale_up` is still supported by the installed version.
cluster.scale_up(1)

## Test with simple task

In [3]:
import dask.array as da

# Smoke test: average a 1000x1000 random array stored as 100x100 chunks.
# The result is expected to be close to 0.5.
arr = da.random.random((1000, 1000), chunks=(100, 100))
arr_mean = arr.mean()  # still lazy; nothing has been computed yet
arr_mean.compute()

np.float64(0.49965013822858817)

## Test with realistic task (small dataset)

In [None]:
import dask.dataframe as dd

# Lazily load the tab-separated edge list from the public bucket URL,
# naming its two columns explicitly
edges_df = dd.read_csv(
    'https://storage.googleapis.com/labofone-notebooks/edges.tsv',
    sep='\t',
    names=['source', 'target'],
)

# Build the incidence list: one entry per source node holding the list of
# all target nodes it points to. `meta` declares the output name and dtype
# up front so Dask does not have to infer them.
targets_by_source = edges_df.groupby('source')['target']
graph_incidence = targets_by_source.apply(
    lambda targets: list(targets),
    meta=('target', 'object'),
).compute()  # pull the computed result back into local memory

print("First few entries of the graph incidence list:")
print(graph_incidence.head())

In [4]:
# Shut the cluster down so its cloud instances do not keep running
cluster.close()

Closing Instance: dask-c79cbc2e-scheduler


## Match environments

In [None]:
# Pin local package versions — presumably to match the versions installed in
# the scheduler/worker Docker image (TODO confirm against the image).
!uv add "lz4==4.3.3" "toolz==0.12.0" "tornado==6.5.1"


## Next steps:

- environment matching failed
- try running in development container
- create a new Docker image for the scheduler/workers
- try the [`CondaInstall` plugin](https://distributed.dask.org/en/stable/plugins.html#built-in-scheduler-plugins)
- add [data-specific dependencies](https://docs.dask.org/en/latest/deploying-cloud.html)
- fix [data reading](https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html#google-cloud-storage) config
- test with setup using [ssh](https://docs.dask.org/en/latest/deploying-ssh.html)
- customize and extend [docker images](https://docs.dask.org/en/latest/deploying-docker.html)

In [None]:
#