# Loading And Storing Data From And Into S3 Running Open-MPI Job

This test generates a chunk of data, uploads it to S3 and then processes it in a `job` and in an `mpijob`. It then verifies that:

* The data was handled properly and results were equal.
* The stored dataset artifact in S3 is loadable and equal.
* The mpijob run was faster (only possible on big data).

## General Configurations

In [None]:
!pip install ipywidgets tqdm

In [None]:
import os, random
import shutil
import sys

sys.path.append(os.path.abspath("../"))

from utils import S3Client

# AWS credentials (must be provided through the environment variables of the same names):
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
assert AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY, (
    "Both the 'AWS_ACCESS_KEY_ID' and 'AWS_SECRET_ACCESS_KEY' environment variables must be set."
)
# Write the validated values back to the process environment:
os.environ["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID
os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_ACCESS_KEY

# Path to store the generated data:
LOCAL_DATA_PATH = "./data"
S3_BUCKET = os.environ.get("S3_BUCKET", "testbucket-igz-temp")
# Random suffixes keep the names of separate test runs apart:
S3_PROJECT_DIRECTORY = f"test-mpijob-s3-{random.randint(0, 10000)}"
DATA_PATH = f"data_{random.randint(0, 10000)}"
# S3 keys are always "/" separated, so the key is not built with the OS dependent `os.path.join`:
S3_DATA_PATH = f"{S3_PROJECT_DIRECTORY}/{DATA_PATH}"

# Number of samples of generated data (number of rows in the data table):
N_SAMPLES = 10_000

# Number of features of the generated data (number of columns in the data table):
N_FEATURES = 10

# The amount of ranks to deploy for the open mpi job (used for parquet partitions of the generated data):
N_RANKS = 4

## 1. Generate Data:

1. Generate random data.
2. Turn the data into a `pandas.DataFrame`, naming the columns `feature_{i}` and adding the partitioning column (`year`).

In [None]:
import numpy as np
import pandas as pd


def generate_data(
    output_path: str,
    n_samples: int,
    n_features: int,
    n_partitions: int,
):
    """
    Generate a table of random values and write it as a parquet dataset partitioned by a `year` column.

    :param output_path:  The path to write the parquet dataset to.
    :param n_samples:    Number of rows in the generated table.
    :param n_features:   Number of random value columns, named `feature_0`, `feature_1` and so on.
    :param n_partitions: Number of distinct years to draw the `year` column from, starting at 2000.
    """
    # The random feature values, one column per feature:
    feature_names = [f"feature_{index}" for index in range(n_features)]
    values = np.random.random(size=(n_samples, n_features))
    table = pd.DataFrame(data=values, columns=feature_names)

    # The partitioning column - a random year per row:
    # NOTE(review): years are drawn at random, so presumably a partition is only written for years that
    # actually occur - fine for a large `n_samples`, confirm before using a very small one.
    table["year"] = np.random.randint(2000, 2000 + n_partitions, size=n_samples)

    # Write the table, partitioned by the year:
    table.to_parquet(output_path, partition_cols=["year"])

Generate the data (requires write permissions to the local directory and, of course, to S3).

In [None]:
# Start clean - remove data left behind by a previous (possibly failed) run:
local_data_directory = os.path.abspath(LOCAL_DATA_PATH)
if os.path.exists(local_data_directory):
    shutil.rmtree(local_data_directory)

# Write a fresh dataset, one parquet partition per rank of the open mpi job:
generate_data(
    n_partitions=N_RANKS,
    n_features=N_FEATURES,
    n_samples=N_SAMPLES,
    output_path=LOCAL_DATA_PATH,
)

In [None]:
# The S3 client, created with the AWS credentials:
s3_client = S3Client(
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
)

# Remove what a previous (possibly failed) run left in the project's directory in S3. A missing directory
# simply means there is nothing to remove:
try:
    s3_client.delete(s3_path=S3_PROJECT_DIRECTORY, bucket=S3_BUCKET)
except FileNotFoundError:
    pass

# Upload the generated local data to S3:
s3_client.upload(
    local_path=LOCAL_DATA_PATH,
    bucket=S3_BUCKET,
    s3_path=S3_DATA_PATH,
    replace=False,
)

In [None]:
# The local copy is not needed anymore (the runs load the data from S3):
shutil.rmtree(path=os.path.abspath(LOCAL_DATA_PATH))

## 2. Data Processing Code

1. Read the data into a pandas (dask) `DataFrame` using MLRun's `DataItem.as_df` method.
2. Do some calculations.

The calculations are accumulated into a single value that is logged as a result, alongside a single column of data (an integer range whose length is derived from that value) that is stored in S3 as a dataset artifact.

In [None]:
# mlrun: start-code

In [None]:
import os
import time
import numpy as np
import mlrun


def process_data(context: mlrun.MLClientCtx, data_path: mlrun.DataItem):
    """
    Sum all the feature values of a parquet dataset, running either as a `job` or as an `mpijob`.

    In an `mpijob`, each rank reads only its own `year=<2000 + rank>` partition of the data and the partial sums
    are reduced (summed) into the root rank (#0). Only the root rank returns values, every other rank returns
    `None`.

    :param context:   The MLRun context. Its "kind" label tells whether the run is an `mpijob`.
    :param data_path: The data item of the parquet dataset to process.

    :returns: From the root rank only, a tuple of:

              * The elapsed time in seconds.
              * The accumulated sum.
              * An integer range array of length `100 + int(result) % 1000`.
    """
    # Start the timer:
    started_at = time.time()

    # In a 'mpijob', find this process's rank and narrow the input down to the rank's partition:
    comm = None
    rank = 0
    is_mpijob = context.labels.get("kind", "") == "mpijob"
    if is_mpijob:
        from mpi4py import MPI

        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        data_path._url = os.path.join(data_path._url, f"year={2000 + rank}")

    # Get the data:
    data = data_path.as_df(format="parquet")

    # Accumulate the values one by one (the partitioning column "year" is not a feature):
    result = 0
    feature_columns = [name for name in data.columns if name != "year"]
    for name in feature_columns:
        for _, value in data[name].items():
            result += value

    if is_mpijob:
        print(f"Rank #{rank} result: {result}")
        # Sum the partial results of all the ranks into the root rank (#0):
        result = comm.reduce(result, op=MPI.SUM, root=0)

    # Only the root rank (#0) has values to return:
    if rank != 0:
        return None

    array = np.arange(100 + int(result) % 1000)
    elapsed_time = time.time() - started_at
    return elapsed_time, result, array

In [None]:
# mlrun: end-code

## 3. Create a Project

1. Create the MLRun project.
2. Create an MLRun function of the processing code.

In [None]:
import mlrun

In [None]:
# Get (or create) the MLRun project of the test:
project = mlrun.get_or_create_project(
    name=S3_PROJECT_DIRECTORY,
    context="./",
    user_project=False,
)

# Set the S3 credentials as the project's secrets:
project.set_secrets(
    secrets=dict(
        AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID,
        AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY,
    )
)

In [None]:
# The processing code as a function of kind 'job':
job_function = project.set_function(
    name="process_data_job",
    kind="job",
    image="mlrun/mlrun",
    handler="process_data",
)

In [None]:
# The same processing code as a function of kind 'mpijob':
mpijob_function = project.set_function(
    name="process_data_mpijob",
    kind="mpijob",
    image="mlrun/mlrun",
    handler="process_data",
)
# Deploy as many replicas as there are ranks (data partitions):
mpijob_function.spec.replicas = N_RANKS

## 4. Run As A Job

Run the processing as a `job` and store the results.

In [None]:
# Run the processing as a job, reading the data from S3 and writing the artifacts to S3:
job_run = job_function.run(
    name="process_data_job",
    inputs={"data_path": f"s3://{S3_BUCKET}/{S3_PROJECT_DIRECTORY}/{DATA_PATH}/"},
    artifact_path=f"s3://{S3_BUCKET}/{S3_PROJECT_DIRECTORY}",
    returns=["time", "result", "array:dataset"],
)

# Keep the results for the comparison (the array is read back from its stored dataset artifact):
job_time = job_run.status.results["time"]
job_result = job_run.status.results["result"]
job_array = np.array(job_run.artifact("array").as_df()["0"])

## 5. Run As a MPIJob

Run the processing as an `mpijob` and store the results.

In [None]:
# Run the processing as an open mpi job, reading the data from S3 and writing the artifacts to S3:
mpijob_run = mpijob_function.run(
    name="process_data_mpijob",
    inputs={"data_path": f"s3://{S3_BUCKET}/{S3_PROJECT_DIRECTORY}/{DATA_PATH}/"},
    artifact_path=f"s3://{S3_BUCKET}/{S3_PROJECT_DIRECTORY}",
    returns=["time", "result", "array:dataset"],
)

# Keep the results for the comparison (the array is read back from its stored dataset artifact):
mpijob_time = mpijob_run.status.results["time"]
mpijob_result = mpijob_run.status.results["result"]
mpijob_array = np.array(mpijob_run.artifact("array").as_df()["0"])

## 6. Compare Runtimes

1. Print a summary message.
2. Verify that:
  * The mpijob run took less time (only on stronger machines).
  * The result value is equal between the runs.
  * The array values are equal between the runs.

In [None]:
# Clean up S3 - remove the project's directory, including the data:
s3_client.delete(s3_path=S3_PROJECT_DIRECTORY, bucket=S3_BUCKET)

# Clean up MLRun - delete the project together with everything that belongs to it:
mlrun.get_run_db().delete_project(
    name=project.name,
    deletion_strategy="cascading",
)

In [None]:
# Print the test's collected results:
print(
    "Job:\n"
    f"\t{job_time:.2f} Seconds\n"
    f"\tResult: {job_result}"
)
print(
    "Open MPI Job:\n"
    f"\t{mpijob_time:.2f} Seconds\n"
    f"\tResult: {mpijob_result}\n"
)

# Verification:
# assert mpijob_time < job_time  # Only possible to test on a stronger machine as the test requires big data.
assert np.isclose(job_result, mpijob_result), (
    f"The results differ between the runs: job = {job_result}, mpijob = {mpijob_result}"
)
# `np.array_equal` compares the shapes as well, so arrays of different lengths fail this assertion cleanly
# (an element-wise `==` of such arrays does not give `all` a per-element result to iterate over):
assert np.array_equal(job_array, mpijob_array), "The stored arrays differ between the runs."