In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Build the session through getOrCreate(), using the application name
# "chapter-31-pandas". Parentheses replace backslash line continuations.
spark = (
    SparkSession.builder
    .appName("chapter-31-pandas")
    .getOrCreate()
)

# Data location is read from the environment (presumably the root folder of
# the book's sample data - confirm). A missing variable raises KeyError here.
import os
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

In [9]:
import pandas as pd
from IPython.display import display

In [5]:
# pandas builds a DataFrame from columnar data: every dict key names a column
# and its value supplies that column's entries.
# A Spark DataFrame, by contrast, is created from row data.
column_data = {
    "first": range(10),       # 0 .. 9
    "second": range(5, 15),   # 5 .. 14
}
df = pd.DataFrame(column_data)
df.head()

Unnamed: 0,first,second
0,0,5
1,1,6
2,2,7
3,3,8
4,4,9


### convert Pandas DataFrame to Spark DataFrame

`sparkDF = spark.createDataFrame(df)`

In [6]:
# Hand the pandas frame to Spark; the resulting Spark DataFrame carries the
# same column names. Then print just the first five rows.
sparkDF = spark.createDataFrame(data=df)
sparkDF.show(n=5)

+-----+------+
|first|second|
+-----+------+
|    0|     5|
|    1|     6|
|    2|     7|
|    3|     8|
|    4|     9|
+-----+------+
only showing top 5 rows



### convert Pandas DataFrame to Python objects

`obj = df.to_dict()`

`obj = df.to_numpy().tolist()`

In [7]:
# Export the frame as plain nested dicts. orient="dict" is the pandas default
# layout: {column name -> {index label -> value}}.
obj = df.to_dict(orient="dict")

In [8]:
# Display the nested {column: {index: value}} dictionary held in `obj`.
obj

{'first': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9},
 'second': {0: 5, 1: 6, 2: 7, 3: 8, 4: 9, 5: 10, 6: 11, 7: 12, 8: 13, 9: 14}}

#### Why xarray

https://xarray.pydata.org/en/stable/why-xarray.html

In [17]:
!pip install xarray

Collecting xarray
  Downloading xarray-0.17.0-py3-none-any.whl (759 kB)
[K     |████████████████████████████████| 759 kB 2.2 MB/s eta 0:00:01
Installing collected packages: xarray
Successfully installed xarray-0.17.0


In [19]:
# Convert the pandas frame to its xarray representation and display it.
# NOTE: this rebinds `obj`, which previously held the dict from df.to_dict().
obj = df.to_xarray()
obj

In [21]:
# Select the "first" variable by key and show its first five entries.
obj["first"][:5]

In [22]:
# Export the frame's values as a NumPy array (one array row per frame row).
narray = df.to_numpy()

In [24]:
# Preview the first five rows of the array.
narray[:5]

array([[0, 5],
       [1, 6],
       [2, 7],
       [3, 8],
       [4, 9]])

### convert Spark DataFrame to Pandas DataFrame

`pandasDF = sparkDF.toPandas()`

In [10]:
# Bring the Spark DataFrame back into pandas, then preview its first five rows.
pandasDF = sparkDF.toPandas()
pandasDF.head(n=5)

Unnamed: 0,first,second
0,0,5
1,1,6
2,2,7
3,3,8
4,4,9


In [11]:
# Render the whole frame through IPython's display() helper.
display(pandasDF)

Unnamed: 0,first,second
0,0,5
1,1,6
2,2,7
3,3,8
4,4,9
5,5,10
6,6,11
7,7,12
8,8,13
9,9,14


### convert Spark DataFrame to Python objects

`dfobj = sparkDF.collect()`

In [12]:
# collect() returns the Spark DataFrame's rows as a Python list of Row
# objects; slice the list to show only the first five.
dfobj = sparkDF.collect()
dfobj[:5]

[Row(first=0, second=5),
 Row(first=1, second=6),
 Row(first=2, second=7),
 Row(first=3, second=8),
 Row(first=4, second=9)]