In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local").appName("PandastoSpark").getOrCreate()
sc = spark.sparkContext

In [23]:
# sql adaptive query execution adaptive.coalescePartitions.enabled will makr paritions dynamic
sc.setLogLevel("Error")
spark.conf.set("spark.sql.shuffle.partitions",3)
spark.conf.get("spark.sql.shuffle.partitions")
spark.conf.set("spark.sql.adaptive.enabled","false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled","false")
spark.conf.set("spark.sql.execution.arrow.enabled",True)
spark.conf.set("spark.sql.execution.arrow.fallback.enabled",True)

"""
Pandas DF to spark DF
"""

#WithoutInferSchema
#headerTrue will read first row and assign column names but type is String for all
#spark job created to read first column
filepath = "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"
df = spark.read.option("header",True) \
                .option("delimiter",",") \
                .csv(filepath + "IntPandasSpark.csv",inferSchema=True)
df.show()

+----------+------------+-------------------+-------+--------+------+
|CustomerId|CustomerName|           dateTime| Amount|discount|Member|
+----------+------------+-------------------+-------+--------+------+
|      1001|        Arun|2020-07-15 01:01:53|2465.22|    10 %|  true|
|      1005|      Barath|2020-07-13 12:15:33|8399.34|     5 %|  true|
|      1003|      Charle|2020-07-18 20:10:45|1234.88|     3 %| false|
|      1004|       Gokul|2020-07-15 11:11:36| 1690.0|     1 %|  true|
|      1005|        NULL|2020-07-18 15:11:43|  160.0|     3 %|  true|
|      1006|      Gerold|2020-07-08 14:16:53| 2546.0|     1 %|  true|
|      1007|      Parker|2020-07-04 17:13:33| 3456.0|     2 %| false|
|      1008|        Thor|2020-07-10 03:30:43| 8745.0|     5 %|  true|
|      1009|       Steve|2020-07-22 12:10:43|  143.0|     2 %|  true|
|      1010|        Mani|2020-07-27 19:40:23| 1865.0|     3 %|  true|
|      1011|      Cooper|2020-07-13 18:10:33| 1200.0|     1 %|  true|
|      1012|       P

In [24]:
import pandas as pd
pandasdf = pd.read_csv(filepath + "IntPandasSpark.csv")
pandasdf.head()

Unnamed: 0,CustomerId,CustomerName,dateTime,Amount,discount,Member
0,1001,Arun,2020-07-15 01:01:53,2465.22,10 %,True
1,1005,Barath,2020-07-13 12:15:33,8399.34,5 %,True
2,1003,Charle,2020-07-18 20:10:45,1234.88,3 %,False
3,1004,Gokul,2020-07-15 11:11:36,1690.0,1 %,True
4,1005,,2020-07-18 15:11:43,160.0,3 %,True


In [25]:
pandasdf.dtypes

CustomerId        int64
CustomerName     object
dateTime         object
Amount          float64
discount         object
Member             bool
dtype: object

In [14]:
#ApacheArrow is a incolumnar data format used to transfer data between jvm and python processes
#topandas (from spark to pandas) will use driver memory to collect in to a single a driver - overhead. 
#should work on sample database conversion not whole spark dataframe conversion

#Optimize conversion using ApacheArrow
spark.conf.get("spark.sql.execution.arrow.enabled")

'true'

In [15]:
spark.conf.get("spark.sql.execution.arrow.fallback.enabled")

'true'

In [16]:
spark.conf.set("spark.sql.execution.arrow.enabled",True)
spark.conf.set("spark.sql.execution.arrow.fallback.enabled",True)

In [32]:
#conversion Issues
#cannot merge StringType() to DoubleType()
sparkdf = spark.createDataFrame(pandasdf)
sparkdf.printSchema()

root
 |-- CustomerId: long (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- dateTime: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- discount: string (nullable = true)
 |-- Member: boolean (nullable = true)



In [27]:
sparkdf.show()

+----------+------------+-------------------+-------+--------+------+
|CustomerId|CustomerName|           dateTime| Amount|discount|Member|
+----------+------------+-------------------+-------+--------+------+
|      1001|        Arun|2020-07-15 01:01:53|2465.22|    10 %|  true|
|      1005|      Barath|2020-07-13 12:15:33|8399.34|     5 %|  true|
|      1003|      Charle|2020-07-18 20:10:45|1234.88|     3 %| false|
|      1004|       Gokul|2020-07-15 11:11:36| 1690.0|     1 %|  true|
|      1005|        NULL|2020-07-18 15:11:43|  160.0|     3 %|  true|
|      1006|      Gerold|2020-07-08 14:16:53| 2546.0|     1 %|  true|
|      1007|      Parker|2020-07-04 17:13:33| 3456.0|     2 %| false|
|      1008|        Thor|2020-07-10 03:30:43| 8745.0|     5 %|  true|
|      1009|       Steve|2020-07-22 12:10:43|  143.0|     2 %|  true|
|      1010|        Mani|2020-07-27 19:40:23| 1865.0|     3 %|  true|
|      1011|      Cooper|2020-07-13 18:10:33| 1200.0|     1 %|  true|
|      1012|       P

In [34]:
#Method1 - During conversion type astype(Str)
df1=spark.createDataFrame(pandasdf.astype("str"))
df1.printSchema()

root
 |-- CustomerId: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- dateTime: string (nullable = true)
 |-- Amount: string (nullable = true)
 |-- discount: string (nullable = true)
 |-- Member: string (nullable = true)



In [40]:
#Method2 - Define StructType
schema = StructType([
    StructField("CustomerId",IntegerType(),True),
    StructField("CustomerName",StringType(),True),
    StructField("Datetime",StringType(),True),
    StructField("Amount",FloatType(),True),
    StructField("Discount",StringType(),True),
    StructField("Member",BooleanType(),True)
])
df2 = spark.createDataFrame(pandasdf,schema=schema)
df2.printSchema()

root
 |-- CustomerId: integer (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- Datetime: string (nullable = true)
 |-- Amount: float (nullable = true)
 |-- Discount: string (nullable = true)
 |-- Member: boolean (nullable = true)



In [48]:
#convert spark dataframe to pandas dataframe
pandasdf2 = df2.toPandas()
pandasdf2.head()

Unnamed: 0,CustomerId,CustomerName,Datetime,Amount,Discount,Member
0,1001,Arun,2020-07-15 01:01:53,2465.219971,10 %,True
1,1005,Barath,2020-07-13 12:15:33,8399.339844,5 %,True
2,1003,Charle,2020-07-18 20:10:45,1234.880005,3 %,False
3,1004,Gokul,2020-07-15 11:11:36,1690.0,1 %,True
4,1005,,2020-07-18 15:11:43,160.0,3 %,True


In [49]:
pandasdf2.dtypes

CustomerId        int32
CustomerName     object
Datetime         object
Amount          float32
Discount         object
Member             bool
dtype: object

In [50]:
type(pandasdf2)

pandas.core.frame.DataFrame