# Belajar PySpark - Membaca File CSV

Dalam notebook ini kita akan belajar cara membaca file CSV ke dalam DataFrame, beserta penggunaan beberapa parameternya.

In [None]:
# Install the pyspark package, which the imports in the next cell require.
%pip install pyspark

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [None]:
# Build the SparkSession (or reuse an existing one) that every read below goes through.
spark = (
    SparkSession.builder
    .appName("Belajar PySpark - Membaca file csv")
    .getOrCreate()
)

## Membaca File CSV Tanpa Header

In [None]:
# Download the sample file mhs.csv (a CSV without a header row) into the working directory.
!wget https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/mhs.csv

File csv tanpa header

In [None]:
# Print the raw file so its layout (no header line) can be inspected before loading it.
!cat mhs.csv

Secara default, `spark.read.csv` menganggap file tidak memiliki header, sehingga baris pertama dibaca sebagai data.

In [None]:
# Read the CSV with no options, so the reader's defaults apply
# (the file is treated as having no header row).
df = spark.read.csv("mhs.csv")
# Print the column names and types of the resulting DataFrame.
df.printSchema()

In [None]:
# Print the DataFrame's rows as a table.
df.show()

In [None]:
# Alternative spelling of the default read: name the format explicitly,
# then load the file by path.
df = (
    spark.read
    .format("csv")
    .load("mhs.csv")
)
df.show()

## Membaca File CSV Dengan Header

In [None]:
# Download mhs_header.csv, the sample file used for the header examples below.
!wget https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/mhs_header.csv

In [None]:
# header=True: use the first line of the file as the column names.
df1 = spark.read.csv("mhs_header.csv", header=True)
# Print the column names and types of the resulting DataFrame.
df1.printSchema()

In [None]:
# Print the DataFrame's rows as a table.
df1.show()

In [None]:
# Same header read, with the setting supplied through .option()
# instead of a keyword argument to csv().
df1 = (
    spark.read
    .option("header", True)
    .csv("mhs_header.csv")
)
df1.printSchema()

In [None]:
# Same header read once more, using the generic format(...)/load(...) pair
# with the header setting passed through .option().
df1 = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("mhs_header.csv")
)
df1.printSchema()

## Menentukan Tipe Kolom Secara Otomatis

In [None]:
# header=True takes column names from the first line;
# inferSchema=True asks Spark to determine the column types automatically.
df3 = spark.read.csv(
    "mhs_header.csv",
    header=True,
    inferSchema=True,
)
df3.show()
df3.printSchema()

In [None]:
# Download mhs_notclean.csv, the sample file used for the schema-inference
# and null-value examples below.
!wget https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/mhs_notclean.csv

In [None]:
# Run the same header + inferSchema read against the not-clean file, then
# display the rows and the schema Spark inferred for them.
df = spark.read.csv("mhs_notclean.csv", header=True, inferSchema=True)
df.show()
df.printSchema()

In [None]:
# Option-chain form of the header + inferSchema read.
# Because header=True is set, the input must have a header row. The original
# cell loaded "mhs.csv", the headerless sample, so its first data record was
# consumed as column names. Load the same file as the csv(...) call in the
# cell before this one instead.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .format("csv")
    .load("mhs_notclean.csv")
)

# Print the column names and the inferred types.
df.printSchema()

## Membaca File Dengan Delimiter Selain Koma

### Tab-delimited file

In [None]:
# Download mhs.tsv, the tab-delimited sample file, into the working directory.
!wget https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/mhs.tsv

In [None]:
# The csv reader also handles other delimiters: sep="\t" splits fields on tabs.
df3 = spark.read.csv(
    "mhs.tsv",
    sep="\t",
    header=True,
    inferSchema=True,
)
df3.printSchema()

In [None]:
# Tab-delimited read with every setting supplied through .option().
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", "\t")
    .csv("mhs.tsv")
)
df.show()
df.printSchema()

## Mengganti Nilai Tertentu Dengan NULL

In [None]:
# nullValue="-" makes the reader load every "-" field as NULL.
df = spark.read.csv(
    "mhs_notclean.csv",
    header=True,
    inferSchema=True,
    nullValue="-",
)
df.show()
df.printSchema()

## Tanda Quote Untuk Escape Kolom

In [None]:
# Download mhs_quote.csv, the sample file used for the quote-character example below.
!wget https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/mhs_quote.csv

In [None]:
# quote="'" sets the single quote as the quoting character for field values;
# the header, inferSchema and nullValue settings are the same as in the
# earlier reads.
df = spark.read.csv(
    "mhs_quote.csv",
    header=True,
    inferSchema=True,
    nullValue="-",
    quote="'",
)
df.show()
df.printSchema()