In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Obtain the Spark session used by the Spark cells below; getOrCreate()
# returns an already-running session if one exists instead of starting another.
spark = (
    SparkSession.builder
    .appName("chapter-31-pandas")
    .getOrCreate()
)

# Read the data path from the environment. Indexing os.environ directly
# raises KeyError when SPARK_BOOK_DATA_PATH is not set.
import os
SPARK_BOOK_DATA_PATH = os.environ["SPARK_BOOK_DATA_PATH"]

In [2]:
import pandas as pd

In [3]:
# Small demo frame with 200 rows: "first" counts 0..199 and "second" counts 50..249.
df = pd.DataFrame(
    {
        "first": range(200),
        "second": range(50, 250),
    }
)
# Preview the first five rows.
df.head()

Unnamed: 0,first,second
0,0,50
1,1,51
2,2,52
3,3,53
4,4,54


### convert Pandas DataFrame to Spark DataFrame

`sparkDF = spark.createDataFrame(df)`

In [4]:
# Build a Spark DataFrame from the pandas frame; the column names carry over.
sparkDF = spark.createDataFrame(df)
# Print the first five rows as a text table.
sparkDF.show(n=5)

+-----+------+
|first|second|
+-----+------+
|    0|    50|
|    1|    51|
|    2|    52|
|    3|    53|
|    4|    54|
+-----+------+
only showing top 5 rows



### convert Pandas DataFrame to Python

`obj = df.to_dict()`

`obj = df.to_numpy().tolist()`

In [10]:
# Convert the frame to a nested dict of the form {column: {row index: value}}.
obj = df.to_dict(orient="dict")

In [14]:
# Echo the dict produced by df.to_dict(): one entry per column, each mapping
# row index -> value. With 200 rows per column the printed output is long.
obj

{'first': {0: 0,
  1: 1,
  2: 2,
  3: 3,
  4: 4,
  5: 5,
  6: 6,
  7: 7,
  8: 8,
  9: 9,
  10: 10,
  11: 11,
  12: 12,
  13: 13,
  14: 14,
  15: 15,
  16: 16,
  17: 17,
  18: 18,
  19: 19,
  20: 20,
  21: 21,
  22: 22,
  23: 23,
  24: 24,
  25: 25,
  26: 26,
  27: 27,
  28: 28,
  29: 29,
  30: 30,
  31: 31,
  32: 32,
  33: 33,
  34: 34,
  35: 35,
  36: 36,
  37: 37,
  38: 38,
  39: 39,
  40: 40,
  41: 41,
  42: 42,
  43: 43,
  44: 44,
  45: 45,
  46: 46,
  47: 47,
  48: 48,
  49: 49,
  50: 50,
  51: 51,
  52: 52,
  53: 53,
  54: 54,
  55: 55,
  56: 56,
  57: 57,
  58: 58,
  59: 59,
  60: 60,
  61: 61,
  62: 62,
  63: 63,
  64: 64,
  65: 65,
  66: 66,
  67: 67,
  68: 68,
  69: 69,
  70: 70,
  71: 71,
  72: 72,
  73: 73,
  74: 74,
  75: 75,
  76: 76,
  77: 77,
  78: 78,
  79: 79,
  80: 80,
  81: 81,
  82: 82,
  83: 83,
  84: 84,
  85: 85,
  86: 86,
  87: 87,
  88: 88,
  89: 89,
  90: 90,
  91: 91,
  92: 92,
  93: 93,
  94: 94,
  95: 95,
  96: 96,
  97: 97,
  98: 98,
  99: 99,
  100: 100,

#### Why xarray

https://xarray.pydata.org/en/stable/why-xarray.html

In [17]:
!pip install xarray

Collecting xarray
  Downloading xarray-0.17.0-py3-none-any.whl (759 kB)
[K     |████████████████████████████████| 759 kB 2.2 MB/s eta 0:00:01
Installing collected packages: xarray
Successfully installed xarray-0.17.0


In [19]:
# Convert the pandas frame to an xarray Dataset (needs the xarray package installed above).
# NOTE: this rebinds `obj`, which previously held the df.to_dict() result.
obj = df.to_xarray()
obj

In [21]:
# Look up the "first" variable by key and show its first five values.
obj["first"][:5]

In [22]:
# Export the frame's values as a 2-D NumPy array (one array row per DataFrame row).
narray = df.to_numpy()

In [24]:
# Show only the first five rows of the array rather than all 200.
narray[:5]

array([[ 0, 50],
       [ 1, 51],
       [ 2, 52],
       [ 3, 53],
       [ 4, 54]])

### convert Spark DataFrame to Pandas DataFrame

`pandasDF = sparkDF.toPandas()`

In [5]:
# Bring the Spark DataFrame back as a pandas DataFrame.
pandasDF = sparkDF.toPandas()
# Preview the first five rows.
pandasDF.head(n=5)

Unnamed: 0,first,second
0,0,50
1,1,51
2,2,52
3,3,53
4,4,54


### convert Spark DataFrame to Python

`dfobj = sparkDF.collect()`

In [8]:
# collect() returns every row of the Spark DataFrame as a Python list of Row objects.
dfobj = sparkDF.collect()
# Slice the list so only the first five Row objects are displayed.
dfobj[:5]

[Row(first=0, second=50),
 Row(first=1, second=51),
 Row(first=2, second=52),
 Row(first=3, second=53),
 Row(first=4, second=54)]