# Using the Session Builder to Answer Private Join Queries

## Import Libraries

In [None]:
from pyspark.sql import SparkSession

from tmlt.analytics.privacy_budget import RhoZCDPBudget
from tmlt.analytics.session import Session
from tmlt.analytics.query_builder import QueryBuilder
from tmlt.analytics.truncation_strategy import TruncationStrategy

## Build Session

In [None]:
# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Left table with columns (X, Y).
# Join-key multiplicities in Y: "A" x1, "B" x2, "C" x3.
left_df = spark.createDataFrame(
    [
        (1, "A"),
        (1, "B"),
        (2, "B"),
        (1, "C"),
        (2, "C"),
        (3, "C"),
    ],
    schema=["X", "Y"],
)

# Right table with columns (Y, Z); Y is the only column shared with left_df.
# Join-key multiplicities in Y: "A" x2, "B" x1, "C" x2.
right_df = spark.createDataFrame(
    [
        ("A", 4),
        ("A", 5),
        ("B", 4),
        ("C", 4),
        ("C", 5),
    ],
    schema=["Y", "Z"],
)

In [None]:
# Build a session holding both DataFrames as private tables, addressable in
# queries by the ids "left" and "right". The session-wide budget is infinite.
session = (
    Session.Builder()
    .with_privacy_budget(privacy_budget=RhoZCDPBudget(float("inf")))
    .with_private_dataframe(dataframe=left_df, source_id="left", stability=1)
    .with_private_dataframe(dataframe=right_df, source_id="right", stability=1)
    .build()
)

## Private Join Queries
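Each query below is evaluated with an infinite privacy budget (`RhoZCDPBudget(float("inf"))`) so that the results illustrate the effect of the truncation strategies themselves.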

Truncate both tables by dropping all records with duplicate join keys.

In [None]:
# Count the rows of the private join of "left" with "right", applying
# DropNonUnique truncation to both sides.
# The join key is named explicitly, as in the later queries in this notebook.
# "Y" is the only column the two tables share, so this should be equivalent to
# omitting join_columns (NOTE(review): confirm the default is "all common
# columns" for the installed tmlt.analytics version).
query1 = (
    QueryBuilder("left")
    .join_private(
        right_operand=QueryBuilder("right"),
        truncation_strategy_left=TruncationStrategy.DropNonUnique(),
        truncation_strategy_right=TruncationStrategy.DropNonUnique(),
        join_columns=["Y"],
    )
    .count()
)

# Evaluate with an infinite budget and display the resulting DataFrame.
answer = session.evaluate(query1, RhoZCDPBudget(float("inf")))
answer.show()

Truncate both tables with threshold = 1, dropping records above that threshold.

In [None]:
# Count the rows of the private join of "left" with "right", applying
# DropExcess(1) truncation to both sides.
# The join key is named explicitly, as in the later queries in this notebook.
# "Y" is the only column the two tables share, so this should be equivalent to
# omitting join_columns (NOTE(review): confirm the default is "all common
# columns" for the installed tmlt.analytics version).
query2 = (
    QueryBuilder("left")
    .join_private(
        right_operand=QueryBuilder("right"),
        truncation_strategy_left=TruncationStrategy.DropExcess(1),
        truncation_strategy_right=TruncationStrategy.DropExcess(1),
        join_columns=["Y"],
    )
    .count()
)

# Evaluate with an infinite budget and display the resulting DataFrame.
answer = session.evaluate(query2, RhoZCDPBudget(float("inf")))
answer.show()

Truncate both tables with a left threshold of 2 and a right threshold of 1, dropping records above the respective threshold.

In [None]:
# Count the rows of the private join of "left" with "right" on "Y", with
# asymmetric truncation: DropExcess(2) on the left, DropExcess(1) on the right.
query4 = (
    QueryBuilder("left")
    .join_private(
        right_operand=QueryBuilder("right"),
        join_columns=["Y"],
        truncation_strategy_left=TruncationStrategy.DropExcess(2),
        truncation_strategy_right=TruncationStrategy.DropExcess(1),
    )
    .count()
)

# Evaluate with an infinite budget and display the resulting DataFrame.
answer = session.evaluate(query4, RhoZCDPBudget(float("inf")))
answer.show()

Truncate both tables: in the left table, use threshold = 2 and drop records above that threshold; in the right table, drop all records with duplicate join keys.

In [None]:
# Count the rows of the private join of "left" with "right" on "Y", mixing
# strategies: DropExcess(2) on the left, DropNonUnique on the right.
query5 = (
    QueryBuilder("left")
    .join_private(
        right_operand=QueryBuilder("right"),
        join_columns=["Y"],
        truncation_strategy_left=TruncationStrategy.DropExcess(2),
        truncation_strategy_right=TruncationStrategy.DropNonUnique(),
    )
    .count()
)

# Evaluate with an infinite budget and display the resulting DataFrame.
answer = session.evaluate(query5, RhoZCDPBudget(float("inf")))
answer.show()