# Example - Scientific Computing - Multiple Players

In [13]:
import argparse
import logging
import unittest

import numpy as np
import pandas as pd

from pymoose import edsl
from pymoose.logger import get_logger
from pymoose.testing import LocalMooseRuntime


np.random.seed(1234)
FIXED = edsl.fixed(14, 23)


**Use case:**
- Researchers wants to measure the correlation between alcohol consumption and student grades but the data is siloed between the department of public health and the department of education:

**Datasets**:

Department of Public Health:
  - Alcohol consumption

Department of Education
  - Students grades

In [14]:
def generate_correlated_data(n_samples, column_names):
    mu = np.array([10, 0])
    r = np.array([
            [  3.40, -2.75],
            [ -2.75,  5.50],
        ])
    rng = np.random.default_rng(12)
    x = rng.multivariate_normal(mu, r, size=n_samples)
    df0 = pd.DataFrame(data=x[:, 0], columns=[column_names[0]])
    df1 = pd.DataFrame(data=x[:, 1], columns=[column_names[1]])
    return df0, df1

In [15]:
alcohol_consumption, grades = generate_correlated_data(100, ["alcohol_consumption", "grades"])

alcohol_consumption.head()

Unnamed: 0,alcohol_consumption
0,11.068034
1,9.588196
2,6.284987
3,9.631837
4,11.175781


In [16]:
grades.head()

Unnamed: 0,grades
0,0.712905
1,2.164735
2,2.786134
3,-2.323364
4,0.4539


**Computation:**
- We want to compute the correlation between alcohol consumption and students grades currently siloed between the Department of Public Health and the Department of Education.

In [17]:
def correlation_computation():
    pub_health_dpt = edsl.host_placement(name="pub_health_dpt")
    education_dpt = edsl.host_placement(name="education_dpt")
    data_scientist = edsl.host_placement(name="data_scientist")
    encrypted_governement = edsl.replicated_placement(name="encrypted_governement", players=[pub_health_dpt, education_dpt, data_scientist])

    def corr(x, y):
        x_mean = edsl.mean(x, 0)
        y_mean = edsl.mean(y, 0)
        stdv_x = edsl.sum(edsl.square(edsl.sub(x, x_mean)))
        stdv_y = edsl.sum(edsl.square(edsl.sub(y, y_mean)))
        corr_num = edsl.sum(edsl.mul(edsl.sub(x, x_mean), edsl.sub(y, y_mean)))
        corr_denom = edsl.mul(stdv_x, stdv_y)
        return corr_num, corr_denom

    # TODO use AES encrytion (optional)
    # TODO Implement square root

    @edsl.computation
    def my_comp(alcohol: edsl.Argument(pub_health_dpt, vtype=edsl.TensorType(edsl.float64)),
                grades: edsl.Argument(education_dpt, vtype=edsl.TensorType(edsl.float64))):
                
        with pub_health_dpt:
            alcohol = edsl.cast(alcohol, dtype=FIXED)

        with education_dpt:
            grades = edsl.cast(grades, dtype=FIXED)
            
        with encrypted_governement:
            corr_num, corr_denom = corr(alcohol, grades)
            corr_denom = edsl.sqrt(corr_denom)
            corr = edsl.div(corr_num, corr_denom)

        with data_scientist:
            correlation = edsl.cast(correlation, dtype=edsl.float64)
            correlation = edsl.save("correlation", correlation)

        return correlation

    return my_comp


In [18]:
def execute_computation(input, computation):
    executors_storage = {
                "pub_health_dpt": {},
                "education_dpt": {},
                "data_scientist": {},
            }
    runtime = LocalMooseRuntime(storage_mapping=executors_storage)

    logical_comp = edsl.trace(computation)
    runtime.evaluate_computation(
        computation=logical_comp,
        role_assignment={
            "pub_health_dpt": "pub_health_dpt",
            "education_dpt": "education_dpt",
            "data_scientist": "data_scientist",
        },
        arguments={"alcohol": alcohol_consumption.to_numpy(), "grades": grades.to_numpy()},
    )

    return runtime

In [19]:
computation = correlation_computation()
results = execute_computation(input, computation)
correlation = results.read_value_from_storage("data_scientist", "correlation")

UnboundLocalError: local variable 'correlation' referenced before assignment

**Results:**
- The correlation between alcohol consumption and grades is:

In [11]:
pd.DataFrame(correlation.reshape(1, 1), columns=["correlation"])

Unnamed: 0,correlation
0,-0.548101


We can validate the computation on encrypted data matches the computation on plaintext data.

In [12]:
np.corrcoef(np.squeeze(alcohol_consumption.to_numpy()), np.squeeze(grades.to_numpy()))[1, 0]

-0.5481005967856092