# Exercises

Let's explore Smallpond's functionality.

## Imports

In [None]:
import ray
import pandas as pd
import pyarrow.parquet as pq
from smallpond import init as smallpond_init

## Initialize everything (if running locally)

In [None]:
# Read the sample parquet file from the bucket into a PyArrow table
a_table = pq.read_table("s3://my-data/data/sample.parquet")

# Initialize Ray against the head at ray://localhost:10001
ray.init(address="ray://localhost:10001")

# Initialize Smallpond, pointing it at the same Ray address
sp = smallpond_init(
    job_name="ProcessingTest",
    ray_address="ray://localhost:10001",
    data_root="data",
    num_executors=2,
    bind_numa_node=False,
    executor_resources={
        "CPU": 4,
        "memory": 8,
        "object_store_memory": 2 * 1024,
    },
)


# Pandas equivalents

Read a Parquet file

In [None]:
# Pandas: load the parquet file into a DataFrame
df = pd.read_parquet("sample.parquet")
# Bare expression on the last line so the notebook displays the DataFrame
df

In [None]:
# Smallpond: wrap the PyArrow table loaded earlier (a_table)
sdf = sp.from_arrow(a_table)
# Preview the data with take(10)
print(sdf.take(10))

Convert a Smallpond DataFrame (here built from a PyArrow table, but it could equally be the result of a Smallpond query) to pandas

In [None]:
# Smallpond: convert sdf to pandas; the bare expression is shown as the cell output
sdf.to_pandas()

Save a DataFrame to a Parquet file

In [None]:
# Pandas
# The destination is the first parameter (`path`) of DataFrame.to_parquet;
# there is no `output_path` keyword, so the path is passed positionally.
df.to_parquet("new.parquet")

In [None]:
# Smallpond: write sdf out in parquet format under "new_sp.parquet"
# NOTE(review): the repartitioning cell passes a directory name to
# write_parquet — confirm whether this path is treated as a file or a directory.
sdf.write_parquet("new_sp.parquet")

Filter conditions

In [None]:
# Pandas: boolean-mask filter keeping rows whose "model" equals "ST4000DM000"
filtered_df = df[df["model"] == "ST4000DM000"]
# Show the first rows of the filtered frame
filtered_df.head()

In [None]:
# Smallpond: the same filter, expressed as a SQL condition string

filtered_sdf = sdf.filter("model = 'ST4000DM000'")
# Preview the filtered data with take(10)
print(filtered_sdf.take(10))

Mapped execution / Apply

In [None]:
# Pandas: add a "total_1_3" column as the element-wise sum of two existing columns
df["total_1_3"] = df["smart_1_normalized"] + df["smart_3_normalized"]
# Display the new column
df["total_1_3"]

In [None]:
# Smallpond
# map() takes a SQL select list. Selecting `*` alongside the new expression
# keeps the existing columns (mirroring the pandas cell, which adds a column),
# so later cells can still reference `datacenter` and `smart_1_normalized`
# after `sdf` is reassigned.
sdf = sdf.map("*, smart_1_normalized + smart_3_normalized as total_1_3")
print(sdf.to_pandas()["total_1_3"])

# Smallpond Specifics

Partial SQL query: executes the query over the distributed data

In [None]:
# Sum smart_1_normalized per datacenter; {0} is the placeholder for the
# DataFrame passed as the second argument (sdf).
# NOTE(review): presumably the query runs on each partition separately, so a
# datacenter spread over several partitions may yield several rows — confirm
# against the partial_sql documentation.
results_df = sp.partial_sql("SELECT datacenter, SUM(smart_1_normalized) FROM {0} GROUP BY datacenter", sdf)
# Bare expression: the notebook displays the result of take(10)
results_df.take(10)

Repartitioning output

In [None]:
# Number of partitions requested from repartition()
num_partitions = 5
# Repartition sdf, hashing on the "datacenter" column
sdf = sdf.repartition(num_partitions, hash_by="datacenter")
# Write the repartitioned data under "new_sp_directory"
sdf.write_parquet("new_sp_directory")

## Shut down Ray

In [None]:
# Shut down the Ray connection opened in the initialization cell
ray.shutdown()