In [0]:
import pandas as pd
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

In [0]:
# pandas series methods
def multiply_func(a: pd.Series, b: pd.Series) -> pd.Series:
    """Return the element-wise product of two pandas Series.

    Args:
        a: left-hand operand.
        b: right-hand operand.

    Returns:
        The result of ``a * b``.
    """
    product = a * b
    return product

# Quick local check of multiply_func: multiply a small Series by itself and print it.
x = pd.Series(data=[1, 2, 3])
print(multiply_func(x, x))

In [0]:
import pyspark.pandas as ps
# Build a pandas-on-Spark DataFrame (pyspark.pandas) with a single column "x"
# from the pandas Series `x`.
sdf = ps.DataFrame(data=x, columns=["x"])
sdf

### Iterator of Series UDF

In [0]:
import pandas as pd
from typing import Iterator
from pyspark.sql.functions import col, pandas_udf, struct

In [0]:
# Plain pandas frame with a single integer column "x".
pdf = pd.DataFrame({"x": [1, 2, 3]})
# Wrap it as a pandas-on-Spark frame, then convert that to a Spark DataFrame.
sdf = ps.DataFrame(pdf).to_spark()
sdf

In [0]:
@pandas_udf("long")
def plus_one(batch_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Yield every Series pulled from `batch_iter` with 1 added to each element.

    Args:
        batch_iter: iterator of pandas Series.

    Yields:
        One Series per input Series, equal to that Series plus 1.
    """
    for batch in batch_iter:
        incremented = batch + 1
        yield incremented

In [0]:
# Apply plus_one to column "x" and print the resulting table.
(
    sdf
    .select(plus_one(col("x")))
    .show()
)

### Iterator of Multiple Series

from typing import Iterator,Tuple

@pandas_udf("long")
def multiply_two_cols(iterator : Iterator[Tuple[pd.Series,pd.Series]]) -> Iterator[pd.Series]:
    """Yield the element-wise product of each pair of Series from `iterator`.

    Each item pulled from `iterator` is unpacked into two Series, `a` and `b`,
    and `a * b` is yielded.

    NOTE(review): an identical `multiply_two_cols` definition appears again
    right after this one in the notebook; consider keeping only one copy.
    """
    for a,b in iterator:
        yield a * b

In [0]:
from typing import Iterator,Tuple

@pandas_udf("long")
def multiply_two_cols(iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
    """Yield the element-wise product of each pair of Series batches.

    Args:
        iterator: yields 2-tuples of pandas Series.

    Yields:
        For every pair, the first Series multiplied by the second.
    """
    for pair in iterator:
        first, second = pair
        yield first * second


In [0]:
# Multiply column "x" by itself through the two-column UDF and print the result.
(
    sdf
    .select(multiply_two_cols("x", "x"))
    .show()
)

### Series to scalar UDF

from pyspark.sql import Window

In [0]:
# Toy data: rows keyed by "id" (values 1 and 2) with a float value "v" per row.
spdf = ps.DataFrame(
    {"id": [1, 1, 1, 2, 2], "v": [1.0, 2.0, 3.0, 5.0, 10.0]}
)

# Convert the pandas-on-Spark frame to a Spark DataFrame and print it.
sdf = spdf.to_spark()
sdf.show()

In [0]:
@pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    """Return the mean of the values in `v`.

    Args:
        v: pandas Series to average.

    Returns:
        The result of ``v.mean()``.
    """
    average = v.mean()
    return average

In [0]:
# Apply mean_udf to the "v" column without any grouping and print the result.
(
    sdf
    .select(mean_udf(sdf["v"]))
    .show()
)

In [0]:
# Group by "id" and aggregate "v" with mean_udf, then print the result.
(
    sdf
    .groupby("id")
    .agg(mean_udf(sdf["v"]))
    .show()
)

In [0]:
# The only `from pyspark.sql import Window` visible in this notebook sits outside
# a code cell, so import it here to keep this cell runnable on a fresh kernel
# (Restart Kernel -> Run All). Re-importing is harmless if it is already loaded.
from pyspark.sql import Window

# Window partitioned by "id" whose frame runs from the first to the last row
# of each partition (unbounded preceding .. unbounded following).
w = Window.partitionBy("id").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

# Evaluate mean_udf over that window and attach the result as column "mean_v".
sdf.withColumn("mean_v", mean_udf(sdf["v"]).over(w)).show()