<a href="https://colab.research.google.com/github/vaniamv/dataprocessing/blob/main/spark/examples/02-dataframes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/02-dataframes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DataFrame

# Setting up PySpark

In [1]:
# Install PySpark into the environment of the running kernel (%pip targets the kernel, unlike !pip).
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) a single-node local session called 'Spark Course',
# with the Spark UI bound to port 4050.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# SparkContext of the session; used below for the RDD API (sc.parallelize).
sc = spark.sparkContext

# DataFrames
- Untyped Datasets
- Similar to tables in relational databases
- DataFrames are just Datasets of Rows in the Scala and Java APIs. Their operations are also referred to as “untyped transformations”, in contrast to the “typed transformations” that come with strongly typed Scala/Java Datasets.

In [3]:
# Creating from RDDs
# .toDF()

# Sample rows: ("c1", "v1"), ("c2", "v2"), ("c3", "v3")
lst = [(f"c{i}", f"v{i}") for i in range(1, 4)]

# Distribute the rows as an RDD, then name its columns to obtain a DataFrame.
rdd = sc.parallelize(lst)
df = rdd.toDF(["col", "value"])
df.show()



+---+-----+
|col|value|
+---+-----+
| c1|   v1|
| c2|   v2|
| c3|   v3|
+---+-----+



In [4]:
# Using "createDataFrame" + StructTypes

# Import only the type classes this cell needs (instead of `import *`),
# so it is clear where each name comes from.
from pyspark.sql.types import StringType, StructField, StructType

# Rows to load: one (col, value) tuple per record.
data = [("c1", "v1"), ("c2", "v2"), ("c3", "v3")]

# Explicit schema: two string columns, both nullable (third argument True).
schema = StructType([
    StructField("col", StringType(), True),
    StructField("value", StringType(), True)
])

df = spark.createDataFrame(data, schema=schema)
df.show()


+---+-----+
|col|value|
+---+-----+
| c1|   v1|
| c2|   v2|
| c3|   v3|
+---+-----+



In [None]:
# Read data from data sources
# Reference (kept as a comment: a bare URL in a code cell is a SyntaxError):
# https://spark.apache.org/docs/3.5.2/sql-data-sources.html


In [6]:
# Create the directory the sample files below are written to (-p: no error if it already exists).
!mkdir -p /content/files/samples/

In [7]:
# from csv
import csv

# creating csv file: a header line followed by three ';'-separated rows
csv_path = '/content/files/samples/file.csv'
sample_rows = [{'col': f'c{i}', 'value': f'v{i}'} for i in range(1, 4)]

with open(csv_path, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['col', 'value'], delimiter=";")
    writer.writeheader()
    writer.writerows(sample_rows)

# read csv file, using the same separator and taking column names from the header line
csv_reader = spark.read.format("csv")
df = csv_reader.load(csv_path, sep=";", header=True)
df.show()

+---+-----+
|col|value|
+---+-----+
| c1|   v1|
| c2|   v2|
| c3|   v3|
+---+-----+



In [8]:

# from json

# JSON payload: a single-line array holding one object per row.
# Named json_payload so it does not shadow the standard-library `json` module.
json_payload = """[{"col": "c1", "value": "v1"}, {"col": "c2", "value": "v2"}, {"col": "c3", "value": "v3"}]"""

# The context manager closes the file even if the write raises.
with open("/content/files/samples/file.json", "w") as text_file:
    text_file.write(json_payload)

# read from json
df = spark.read.json("/content/files/samples/file.json")
df.show()

+---+-----+
|col|value|
+---+-----+
| c1|   v1|
| c2|   v2|
| c3|   v3|
+---+-----+



In [9]:
# from parquet

# Generating parquet
# Sample rows: ("c1", "v1"), ("c2", "v2"), ("c3", "v3")
lst = [(f"c{i}", f"v{i}") for i in range(1, 4)]
rdd = sc.parallelize(lst)
df = rdd.toDF(["col", "value"])

# "overwrite" replaces any earlier output at this path, so the cell can be re-run.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .save("/content/files/samples/parquet")
)


In [10]:
# read from parquet
# Load the directory written by the previous cell into a separate DataFrame.
parquet_reader = spark.read.format("parquet")
df2 = parquet_reader.load("/content/files/samples/parquet")
df2.show()

+---+-----+
|col|value|
+---+-----+
| c1|   v1|
| c2|   v2|
| c3|   v3|
+---+-----+



In [None]:
# Check schema
# Tree-style listing of the columns first ...
df.printSchema()

# ... then the StructType object and the plain list of column names.
for schema_view in (df.schema, df.columns):
    print(schema_view)

root
 |-- col: string (nullable = true)
 |-- value: string (nullable = true)

StructType([StructField('col', StringType(), True), StructField('value', StringType(), True)])
['col', 'value']


In [None]:
# counting items in the dataframe
row_count = df.count()
first_rows = df.take(5)   # at most 5 rows, as a list of Row objects

print(row_count)
print(first_rows)

# Last expression of the cell, so its value is rendered as the cell output.
df.head(5)

3
[Row(col='c1', value='v1'), Row(col='c2', value='v2'), Row(col='c3', value='v3')]


[Row(col='c1', value='v1'),
 Row(col='c2', value='v2'),
 Row(col='c3', value='v3')]

In [None]:
# check explain plan
# A string passed positionally is interpreted as the mode; spell it out as a keyword.
df.explain(mode="cost")

== Physical Plan ==
*(1) Scan ExistingRDD[col#612,value#613]




In [None]:
# Serialize each row as a JSON string and display the first one.
json_rows = df.toJSON()
json_rows.first()

'{"col":"c1","value":"v1"}'

In [None]:
# Convert the Spark DataFrame to a pandas DataFrame and display it as the cell output.
pandas_df = df.toPandas()
pandas_df

Unnamed: 0,col,value
0,c1,v1
1,c2,v2
2,c3,v3


In [None]:
# createOrReplaceTempView
# Register df under a name so it can be queried with SQL from this session.
df.createOrReplaceTempView("my_table")
spark.sql("select * from my_table").show()

# createOrReplaceGlobalTempView
# Global temporary views are kept in the reserved `global_temp` database, so the
# view name must be qualified with it. The spelling is exactly `global_temp`:
# the previous `globaL_temp` only resolved because identifiers are
# case-insensitive by default.
df.createOrReplaceGlobalTempView("my_table2")
spark.sql("select * from global_temp.my_table2").show()


+---+-----+
|col|value|
+---+-----+
| c1|   v1|
| c2|   v2|
| c3|   v3|
+---+-----+



## write operations

- df.write.format("parquet").save(path)
- df.write.format("json").save(path)
- df.write.format("csv").save(path)

### if using delta.io
- df.write.format("delta").saveAsTable(table_name)

