### 1. Grouped Map
- 1. Split the data into groups using `DataFrame.groupBy`.
- 2. Apply a function to each group. The function's input and output are both `pandas.DataFrame` objects.
- 3. Combine the results into a new Spark `DataFrame`.

In [9]:
import pyspark.pandas as ps
# Toy input for the grouped-map example: two groups (id 1 and id 2) of float values.
df = ps.DataFrame({"id": [1, 1, 2, 2, 2], "v": [1.0, 2.0, 3.0, 5.0, 10.0]})

# Convert the pandas-on-Spark frame to a Spark DataFrame and print it.
sdf = df.to_spark()
sdf.show()



+---+----+
| id|   v|
+---+----+
|  1| 1.0|
|  1| 2.0|
|  2| 3.0|
|  2| 5.0|
|  2|10.0|
+---+----+



In [12]:
# Show the Spark schema that to_spark() produced for this frame.
sdf.schema

StructType([StructField('id', LongType(), False), StructField('v', DoubleType(), False)])

In [10]:
def subtract_mean(pdf):
    """Centre the ``v`` column of one group around its mean.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Rows of a single group; must contain a numeric ``v`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``pdf`` in which ``v`` is replaced by ``v - mean(v)``;
        the input frame itself is not modified.
    """
    values = pdf["v"]
    centred = values - values.mean()
    return pdf.assign(v=centred)

In [11]:
# Run subtract_mean once per "id" group; the schema string describes the
# columns of the frames that subtract_mean returns.
(
    sdf.groupBy("id")
    .applyInPandas(subtract_mean, schema="id long, v double")
    .show()
)

[Stage 8:>                                                          (0 + 1) / 1]

+---+----+
| id|   v|
+---+----+
|  1|-0.5|
|  1| 0.5|
|  2|-3.0|
|  2|-1.0|
|  2| 4.0|
+---+----+



                                                                                

### 2. Map

In [13]:
# Toy input for the map example: two rows with an id and an age.
df = ps.DataFrame({"id": [1, 2], "age": [21, 30]})

# Convert to a Spark DataFrame and print it.
sdf = df.to_spark()
sdf.show()



+---+---+
| id|age|
+---+---+
|  1| 21|
|  2| 30|
+---+---+



In [14]:
# Defining a Custom Filtering Function
def filter_func(iterator):
    """Yield, for each incoming pandas frame, only the rows whose ``id`` is 1.

    Parameters
    ----------
    iterator : iterable of pandas.DataFrame
        Frames to filter; each must contain an ``id`` column.

    Yields
    ------
    pandas.DataFrame
        One filtered frame per input frame (possibly empty).
    """
    for batch in iterator:
        keep = batch.id == 1
        yield batch[keep]

In [15]:
# Apply filter_func to the frame; the output schema is the input's own schema.
(
    sdf
    .mapInPandas(filter_func, schema=sdf.schema)
    .show()
)

+---+---+
| id|age|
+---+---+
|  1| 21|
+---+---+



### 3. Cogrouped Map

In [20]:
import pandas as pd
# Left-hand input: one value (v1) per (time, id) pair.
df1 = ps.DataFrame(
    {
        "time": [20000101, 20000101, 20000102, 20000102],
        "id": [1, 2, 1, 2],
        "v1": [1.0, 2.0, 3.0, 4.0],
    }
)

# Right-hand input: one label (v2) per id, on the first date only.
df2 = ps.DataFrame(
    {
        "time": [20000101, 20000101],
        "id": [1, 2],
        "v2": ["x", "y"],
    }
)

# Convert both pandas-on-Spark frames to Spark DataFrames.
sdf1 = df1.to_spark()
sdf2 = df2.to_spark()



In [21]:
# Print the contents of sdf1 (columns time, id, v1).
sdf1.show()

+--------+---+---+
|    time| id| v1|
+--------+---+---+
|20000101|  1|1.0|
|20000101|  2|2.0|
|20000102|  1|3.0|
|20000102|  2|4.0|
+--------+---+---+



In [25]:
# Print the contents of sdf2 (columns time, id, v2).
sdf2.show()

+--------+---+---+
|    time| id| v2|
+--------+---+---+
|20000101|  1|  x|
|20000101|  2|  y|
+--------+---+---+



In [30]:
def asof_join(l, r):
    """As-of join the left and right pandas frames of one cogroup.

    Parameters
    ----------
    l : pandas.DataFrame
        Left frame; must contain ``time`` and ``id`` columns.
    r : pandas.DataFrame
        Right frame; must contain ``time`` and ``id`` columns.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.merge_asof(..., on="time", by="id")``: every left row,
        ordered by ``time``, paired with the latest right row of the same
        ``id`` whose ``time`` is not later than the left row's.
    """
    # merge_asof raises ValueError when the `on` key is not sorted, and the
    # frames handed to this function are not known to arrive ordered by time,
    # so sort both sides here. A stable sort keeps the incoming order of rows
    # that share the same time.
    left_sorted = l.sort_values("time", kind="stable")
    right_sorted = r.sort_values("time", kind="stable")
    return pd.merge_asof(left_sorted, right_sorted, on="time", by="id")

In [31]:
# Pair up the "id" groups of both frames and run asof_join on each pair;
# the schema string describes the columns of the joined result.
(
    sdf1.groupby("id")
    .cogroup(sdf2.groupby("id"))
    .applyInPandas(asof_join, schema="time int, id int, v1 double, v2 string")
    .show()
)

+--------+---+---+---+
|    time| id| v1| v2|
+--------+---+---+---+
|20000101|  1|1.0|  x|
|20000102|  1|3.0|  x|
|20000101|  2|2.0|  y|
|20000102|  2|4.0|  y|
+--------+---+---+---+

