### 1. Grouped Map
- 1. Split the data into groups using `DataFrame.groupBy`.
- 2. Apply a function to each group. The function's input and output are both `pandas.DataFrame` objects.
- 3. Combine the results into a new Spark DataFrame.

In [0]:
import pyspark.pandas as ps
# Sample data: a grouping key (`id`) and a numeric value column (`v`).
df = ps.DataFrame(
    dict(
        id=[1, 1, 2, 2, 2],
        v=[1.0, 2.0, 3.0, 5.0, 10.0],
    )
)

# Convert the pandas-on-Spark frame to a Spark DataFrame and preview it.
sdf = df.to_spark()
sdf.show()

In [0]:
# Display the schema of the Spark DataFrame created from `df`.
sdf.schema

In [0]:
def subtract_mean(pdf):
    """Return a copy of ``pdf`` whose ``v`` column has its mean subtracted.

    ``pdf`` is a pandas DataFrame with a numeric ``v`` column. The input
    frame is not modified; all other columns are passed through unchanged.
    """
    centered = pdf["v"] - pdf["v"].mean()
    return pdf.assign(v=centered)

In [0]:
# Run subtract_mean once per `id` group and show the combined result.
(
    sdf.groupBy("id")
    .applyInPandas(subtract_mean, schema="id long, v double")
    .show()
)

### 2. Map

In [0]:
# Sample data for the map example: one row per person.
df = ps.DataFrame(dict(id=[1, 2], age=[21, 30]))

# Convert to a Spark DataFrame and preview it.
sdf = df.to_spark()
sdf.show()

In [0]:
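# Define a custom batch-mapping function. Despite its name, it does not filter rows:
# each batch is emitted together with a copy whose `id` is incremented by 1.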
def filter_func(iterator):
    """Yield each input batch followed by a copy whose ``id`` is incremented by 1.

    Written for ``DataFrame.mapInPandas``: ``iterator`` yields pandas
    DataFrames, and this generator yields one pandas DataFrame per input
    batch with the same columns. Despite the name, no rows are dropped;
    every output batch has twice as many rows as its input batch.
    """
    # Imported locally: the notebook's `import pandas as pd` lives in a later
    # cell, so on a fresh kernel `pd` would otherwise be undefined when this
    # function is first executed.
    import pandas as pd

    for pdf in iterator:
        # assign() returns a new frame, so the incoming batch is left untouched.
        shifted = pdf.assign(id=pdf["id"] + 1)
        yield pd.concat([pdf, shifted])

In [0]:
# Apply filter_func to every batch of `sdf`; the output reuses the input schema.
(
    sdf
    .mapInPandas(filter_func, schema=sdf.schema)
    .show()
)

### 3. Cogrouped Map

In [0]:
import pandas as pd
# Left side of the as-of join: one numeric observation per (time, id).
df1 = ps.DataFrame(
    dict(
        time=[20000101, 20000101, 20000102, 20000102],
        id=[1, 2, 1, 2],
        v1=[1.0, 2.0, 3.0, 4.0],
    )
)

# Right side: a string label per (time, id), covering only the first date.
df2 = ps.DataFrame(
    dict(
        time=[20000101, 20000101],
        id=[1, 2],
        v2=["x", "y"],
    )
)

# Spark DataFrames used by the cogrouped-map example.
sdf1 = df1.to_spark()
sdf2 = df2.to_spark()

In [0]:
# Preview the left-hand Spark DataFrame (columns time, id, v1).
sdf1.show()

In [0]:
# Preview the right-hand Spark DataFrame (columns time, id, v2).
sdf2.show()

In [0]:
def asof_join(l, r):
    """As-of join the left frame ``l`` with the right frame ``r``.

    Both arguments are pandas DataFrames containing ``time`` and ``id``
    columns. Rows are matched on ``time`` with ``pd.merge_asof`` and
    restricted to equal ``id`` values. The result is ordered by ``time``.
    """
    # merge_asof raises ValueError unless both sides are sorted by the `on`
    # key, and the rows handed to this function are not guaranteed to arrive
    # in time order. A stable sort leaves already-sorted input unchanged.
    left_sorted = l.sort_values("time", kind="mergesort")
    right_sorted = r.sort_values("time", kind="mergesort")
    return pd.merge_asof(left_sorted, right_sorted, on="time", by="id")

In [0]:
# Cogroup both frames by `id` and as-of join each pair of groups on `time`.
(
    sdf1.groupby("id")
    .cogroup(sdf2.groupby("id"))
    .applyInPandas(asof_join, schema="time int, id int, v1 double, v2 string")
    .show()
)