In [0]:
import numpy as np
import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession

In [0]:
# Plain pandas Series; the NaN entry makes the dtype float64.
pser = pd.Series(data=[1, 2, 3, 4, 5, np.nan])

In [0]:
print(pser)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    NaN
dtype: float64


In [0]:
type(pser)

pandas.core.series.Series

In [0]:
# Create a pandas-on-Spark Series directly from a Python list.
# NOTE: despite the `_pd` suffix, this is a pyspark.pandas Series, not a pandas one.
pser_pd = ps.Series(data=[1, 2, 3, np.nan])

In [0]:
print(pser_pd)

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64


In [0]:
type(pser_pd)

pyspark.pandas.series.Series

In [0]:
# Convert the existing pandas Series `pser` into a pandas-on-Spark Series.
pser_ps = ps.from_pandas(pser)

In [0]:
print(pser_ps)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    NaN
dtype: float64


In [0]:
type(pser_ps)

pyspark.pandas.series.Series

In [0]:
# Draw the sample columns from a seeded generator so that every
# "Restart & Run All" produces the same values; the unseeded global
# np.random state gave different data on each run.
SEED = 42
rng = np.random.default_rng(SEED)
dictionary = {"A": rng.random(5), "B": rng.random(5)}

In [0]:
print(dictionary)

{'A': array([0.67778049, 0.82118051, 0.06217695, 0.49607286, 0.52133782]), 'B': array([0.1741511 , 0.88020921, 0.22750544, 0.90217351, 0.60728249])}


In [0]:
pdf = pd.DataFrame.from_dict(dictionary)

In [0]:
print(pdf)

          A         B
0  0.677780  0.174151
1  0.821181  0.880209
2  0.062177  0.227505
3  0.496073  0.902174
4  0.521338  0.607282


In [0]:
# Build a pandas-on-Spark DataFrame from the same dict that was used for the
# pandas DataFrame `pdf`.
psdf = ps.DataFrame.from_dict(dictionary)

In [0]:
psdf.describe()

Unnamed: 0,A,B
count,5.0,5.0
mean,0.51571,0.558264
std,0.285346,0.346868
min,0.062177,0.174151
25%,0.496073,0.227505
50%,0.521338,0.607282
75%,0.67778,0.880209
max,0.821181,0.902174


In [0]:
psdf.sort_values(by='B')

Unnamed: 0,A,B
0,0.67778,0.174151
2,0.062177,0.227505
4,0.521338,0.607282
1,0.821181,0.880209
3,0.496073,0.902174


In [0]:
psdf

Unnamed: 0,A,B
0,0.67778,0.174151
1,0.821181,0.880209
2,0.062177,0.227505
3,0.496073,0.902174
4,0.521338,0.607282


In [0]:
psdf.transpose()

Unnamed: 0,0,1,2,3,4
A,0.67778,0.821181,0.062177,0.496073,0.521338
B,0.174151,0.880209,0.227505,0.902174,0.607282


In [0]:
ps.get_option('compute.max_rows')

1000

In [0]:
psdf

Unnamed: 0,A,B
0,0.67778,0.174151
1,0.821181,0.880209
2,0.062177,0.227505
3,0.496073,0.902174
4,0.521338,0.607282


In [0]:
psdf.A

0    0.677780
1    0.821181
2    0.062177
3    0.496073
4    0.521338
Name: A, dtype: float64

In [0]:
# Label-based slice: .loc includes both endpoints, so labels 0 through 4 are returned.
psdf.loc[0:4]

Unnamed: 0,A,B
0,0.67778,0.174151
1,0.821181,0.880209
2,0.062177,0.227505
3,0.496073,0.902174
4,0.521338,0.607282


In [0]:
# Position-based slice: .iloc excludes the stop position, so only rows 2 and 3 are returned.
psdf.iloc[2:4]

Unnamed: 0,A,B
2,0.062177,0.227505
3,0.496073,0.902174


In [0]:
# axis=1 applies np.cumsum across each row, so column B becomes A + B
# while column A is unchanged.
psdf.apply(np.cumsum,axis=1)

Unnamed: 0,A,B
0,0.67778,0.851932
1,0.821181,1.70139
2,0.062177,0.289682
3,0.496073,1.398246
4,0.521338,1.12862


In [0]:

def squared(x) -> ps.Series[np.float64]:
    """Return the element-wise square of ``x``.

    The return annotation declares the result as a float64
    pandas-on-Spark Series.
    """
    result = x ** 2
    return result

In [0]:
psdf.apply(squared)

Unnamed: 0,A,B
0,0.459386,0.030329
1,0.674337,0.774768
2,0.003866,0.051759
3,0.246088,0.813917
4,0.271793,0.368792


### Grouping data

In [0]:
# Small integer frame used by the groupby examples.
# NOTE: this rebinds `psdf`, replacing the random-valued frame built earlier.
psdf = ps.DataFrame({
    "A": [1, 2, 3, 4, 5],
    "B": [10, 20, 30, 40, 50],
    "C": [5, 7, 8, 9, 10],
})

In [0]:
# Sum B and C for each distinct value of A. The displayed rows are not sorted
# by key (5 appears before 4); chain .sort_index() if a stable order is needed.
psdf.groupby("A").sum()

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10,5
2,20,7
3,30,8
5,50,10
4,40,9


In [0]:
# Group on the (A, B) pair and sum the remaining column C. As in the displayed
# output, the result rows are not sorted by key.
psdf.groupby(["A","B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C
A,B,Unnamed: 2_level_1
1,10,5
2,20,7
3,30,8
5,50,10
4,40,9


### User-defined functions


In [0]:
import pandas as pd


In [0]:
def multiply(a: pd.Series, b: pd.Series) -> pd.Series:
    """Return the element-wise product of two pandas Series.

    Args:
        a: Left-hand operand.
        b: Right-hand operand.

    Returns:
        The result of ``a * b``.
    """
    product = a * b
    return product
    

In [0]:
# Two small integer Series used as sample operands for multiply().
x = pd.Series(data=[2, 3, 4])
y = pd.Series(data=[5, 6, 7])

In [0]:
multiply(x,y)

0    10
1    18
2    28
dtype: int64

### Create a pandas UDF that computes the product of two columns

In [0]:
import pandas as pd
from pyspark.sql.functions import col,pandas_udf
from pyspark.sql.types import LongType

In [0]:
def multiply_func(a: pd.Series, b: pd.Series) -> pd.Series:
    """Return the element-wise product of two pandas Series.

    This is the plain-Python function that is passed to ``pandas_udf``.

    Args:
        a: Left-hand operand.
        b: Right-hand operand.

    Returns:
        The result of ``a * b``.
    """
    product = a * b
    return product

In [0]:
# Wrap multiply_func as a pandas UDF whose declared return type is LongType.
# NOTE: this rebinds `multiply`, which an earlier cell defined as a plain
# pandas function; from here on `multiply` refers to the UDF.
multiply = pandas_udf(multiply_func, returnType=LongType())

In [0]:
multiply_func(x,x)

0     4
1     9
2    16
dtype: int64

In [0]:
# Obtain the active SparkSession (or start one) so this cell also runs on a
# fresh kernel where the platform has not injected a `spark` global.
spark = SparkSession.builder.getOrCreate()

# to_frame names the column explicitly. pd.DataFrame(x, columns=["x"]) only
# works while the Series is unnamed; with any other name it would select by
# that name and yield a column of missing values.
df = spark.createDataFrame(x.to_frame(name="x"))

In [0]:
df.show()

+---+
|  x|
+---+
|  2|
|  3|
|  4|
+---+



In [0]:
df.select(multiply(df.x,df.x)).show()

+-------------------+
|multiply_func(x, x)|
+-------------------+
|                  4|
|                  9|
|                 16|
+-------------------+

