#Belajar Pyspark - Menulis DataFrame ke File csv

Dalam notebook ini kita akan belajar tentang bagaimana menyimpan dataframe ke file csv, beserta penerapan beberapa parameternya.

In [1]:
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=e66ac09a0beabf2e7a4344bcefbd6297d4c66c9d58a082a7aaef5565327a2e53
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [3]:
spark = SparkSession.builder.appName("Belajar PySpark - Menulis file csv").getOrCreate()

In [4]:
data = [['Agus','Fisika','Umum',150],['Windy','Fisika','Khusus',200],
        ['Budi','Biologi','Umum',170],['Dina','Fisika','Khusus',180],
        ['Bayu','Fisika','Umum',160],['Dedi','Biologi','Khusus',185]]

kolom = ["nama","jurusan","jalur","nilai"]

df = spark.createDataFrame(data, kolom)

df.show()

+-----+-------+------+-----+
| nama|jurusan| jalur|nilai|
+-----+-------+------+-----+
| Agus| Fisika|  Umum|  150|
|Windy| Fisika|Khusus|  200|
| Budi|Biologi|  Umum|  170|
| Dina| Fisika|Khusus|  180|
| Bayu| Fisika|  Umum|  160|
| Dedi|Biologi|Khusus|  185|
+-----+-------+------+-----+



##Menulis dengan fungsi `write.csv`

In [5]:
!rm -rf mahasiswa

In [6]:
df.write.csv("mahasiswa")

In [7]:
!ls -l mahasiswa

total 8
-rw-r--r-- 1 root root 67 Feb 27 03:44 part-00000-176b0de0-7836-42ba-abe3-0547dd64a0db-c000.csv
-rw-r--r-- 1 root root 68 Feb 27 03:44 part-00001-176b0de0-7836-42ba-abe3-0547dd64a0db-c000.csv
-rw-r--r-- 1 root root  0 Feb 27 03:44 _SUCCESS


In [8]:
!head mahasiswa/part-00000-1135635c-7d3c-4495-939b-4edb9c1430e0-c000.csv

head: cannot open 'mahasiswa/part-00000-1135635c-7d3c-4495-939b-4edb9c1430e0-c000.csv' for reading: No such file or directory


In [9]:
!cat mahasiswa/*

Agus,Fisika,Umum,150
Windy,Fisika,Khusus,200
Budi,Biologi,Umum,170
Dina,Fisika,Khusus,180
Bayu,Fisika,Umum,160
Dedi,Biologi,Khusus,185


###Menulis dengan header

In [10]:
!rm -rf mahasiswa_header

In [11]:
df.write.csv("mahasiswa_header", header=True)

In [12]:
!ls -l mahasiswa_header

total 8
-rw-r--r-- 1 root root 92 Feb 27 03:44 part-00000-3ed9465e-8dc7-450e-a6cb-574878c00c69-c000.csv
-rw-r--r-- 1 root root 93 Feb 27 03:44 part-00001-3ed9465e-8dc7-450e-a6cb-574878c00c69-c000.csv
-rw-r--r-- 1 root root  0 Feb 27 03:44 _SUCCESS


In [13]:
!cat mahasiswa_header/*

nama,jurusan,jalur,nilai
Agus,Fisika,Umum,150
Windy,Fisika,Khusus,200
Budi,Biologi,Umum,170
nama,jurusan,jalur,nilai
Dina,Fisika,Khusus,180
Bayu,Fisika,Umum,160
Dedi,Biologi,Khusus,185


###Menulis dengan delimiter lain

In [14]:
!rm -rf mahasiswa_delim

In [15]:
df.write.csv("mahasiswa_delim", sep="|")

In [16]:
!cat mahasiswa_delim/*

Agus|Fisika|Umum|150
Windy|Fisika|Khusus|200
Budi|Biologi|Umum|170
Dina|Fisika|Khusus|180
Bayu|Fisika|Umum|160
Dedi|Biologi|Khusus|185


###Menggunakan fungsi `write.option()`

In [18]:
!rm -rf mahasiswa_delim

In [19]:
df.write.option("sep","|") \
    .option("header", True) \
    .csv("mahasiswa_delim")

##Menulis dengan `write.format`

In [20]:
!rm -rf mahasiswa_format

In [21]:
df.write.format("csv").save("mahasiswa_format")

In [22]:
!cat mahasiswa_format/*

Agus,Fisika,Umum,150
Windy,Fisika,Khusus,200
Budi,Biologi,Umum,170
Dina,Fisika,Khusus,180
Bayu,Fisika,Umum,160
Dedi,Biologi,Khusus,185


###Menulis dengan header

In [23]:
!rm -rf mahasiswa_format_header

In [24]:
df.write.format("csv") \
    .option("header",True) \
    .save("mahasiswa_format_header")

In [25]:
!cat mahasiswa_format_header/*

nama,jurusan,jalur,nilai
Agus,Fisika,Umum,150
Windy,Fisika,Khusus,200
Budi,Biologi,Umum,170
nama,jurusan,jalur,nilai
Dina,Fisika,Khusus,180
Bayu,Fisika,Umum,160
Dedi,Biologi,Khusus,185


###Menulis dengan delimiter lain

In [26]:
!rm -rf mahasiswa_format_delim

In [27]:
df.write.format("csv") \
    .option("header",True) \
    .option("sep","|") \
    .save("mahasiswa_format_delim")

In [28]:
!cat mahasiswa_format_delim/*

nama|jurusan|jalur|nilai
Agus|Fisika|Umum|150
Windy|Fisika|Khusus|200
Budi|Biologi|Umum|170
nama|jurusan|jalur|nilai
Dina|Fisika|Khusus|180
Bayu|Fisika|Umum|160
Dedi|Biologi|Khusus|185


##Mode penulisan

In [29]:
data_new = [['Citra','Fisika','Umum',170],
            ['Jaka','Biologi','Khusus',180]]

kolom = ["nama","jurusan","jalur","nilai"]

df_new = spark.createDataFrame(data_new, kolom)

df_new.show()

+-----+-------+------+-----+
| nama|jurusan| jalur|nilai|
+-----+-------+------+-----+
|Citra| Fisika|  Umum|  170|
| Jaka|Biologi|Khusus|  180|
+-----+-------+------+-----+



###Mode append - `write.csv`

In [30]:
df_new.write.csv("mahasiswa_header", header=True, mode="append")

In [31]:
!ls -l mahasiswa_header

total 16
-rw-r--r-- 1 root root 92 Feb 27 03:44 part-00000-3ed9465e-8dc7-450e-a6cb-574878c00c69-c000.csv
-rw-r--r-- 1 root root 47 Feb 27 03:45 part-00000-d0e38b87-6b78-4460-8b43-add36f4eb36d-c000.csv
-rw-r--r-- 1 root root 93 Feb 27 03:44 part-00001-3ed9465e-8dc7-450e-a6cb-574878c00c69-c000.csv
-rw-r--r-- 1 root root 49 Feb 27 03:45 part-00001-d0e38b87-6b78-4460-8b43-add36f4eb36d-c000.csv
-rw-r--r-- 1 root root  0 Feb 27 03:45 _SUCCESS


In [32]:
!cat mahasiswa_header/*

nama,jurusan,jalur,nilai
Agus,Fisika,Umum,150
Windy,Fisika,Khusus,200
Budi,Biologi,Umum,170
nama,jurusan,jalur,nilai
Citra,Fisika,Umum,170
nama,jurusan,jalur,nilai
Dina,Fisika,Khusus,180
Bayu,Fisika,Umum,160
Dedi,Biologi,Khusus,185
nama,jurusan,jalur,nilai
Jaka,Biologi,Khusus,180


###Mode append - `write.format`

In [33]:
df_new.write.format("csv") \
    .option("header",True) \
    .mode("append") \
    .save("mahasiswa_format_header")

In [34]:
!ls -l mahasiswa_format_header

total 16
-rw-r--r-- 1 root root 47 Feb 27 03:45 part-00000-731475d1-7cea-45df-9b32-1ef74a0f1a34-c000.csv
-rw-r--r-- 1 root root 92 Feb 27 03:44 part-00000-834fb5fc-0d0e-4d3b-9098-d1a0eab24107-c000.csv
-rw-r--r-- 1 root root 49 Feb 27 03:45 part-00001-731475d1-7cea-45df-9b32-1ef74a0f1a34-c000.csv
-rw-r--r-- 1 root root 93 Feb 27 03:44 part-00001-834fb5fc-0d0e-4d3b-9098-d1a0eab24107-c000.csv
-rw-r--r-- 1 root root  0 Feb 27 03:45 _SUCCESS


In [35]:
!cat mahasiswa_format_header/*

nama,jurusan,jalur,nilai
Citra,Fisika,Umum,170
nama,jurusan,jalur,nilai
Agus,Fisika,Umum,150
Windy,Fisika,Khusus,200
Budi,Biologi,Umum,170
nama,jurusan,jalur,nilai
Jaka,Biologi,Khusus,180
nama,jurusan,jalur,nilai
Dina,Fisika,Khusus,180
Bayu,Fisika,Umum,160
Dedi,Biologi,Khusus,185


###Mode overwrite - `write.csv`

In [36]:
df_new.write.csv("mahasiswa_header", header=True, mode="overwrite")

In [37]:
!cat mahasiswa_header/*

nama,jurusan,jalur,nilai
Citra,Fisika,Umum,170
nama,jurusan,jalur,nilai
Jaka,Biologi,Khusus,180


###Mode overwrite - `write.format`

In [38]:
df_new.write.format("csv") \
    .option("header",True) \
    .mode("overwrite") \
    .save("mahasiswa_header")

In [39]:
!cat mahasiswa_format_header/*

nama,jurusan,jalur,nilai
Citra,Fisika,Umum,170
nama,jurusan,jalur,nilai
Agus,Fisika,Umum,150
Windy,Fisika,Khusus,200
Budi,Biologi,Umum,170
nama,jurusan,jalur,nilai
Jaka,Biologi,Khusus,180
nama,jurusan,jalur,nilai
Dina,Fisika,Khusus,180
Bayu,Fisika,Umum,160
Dedi,Biologi,Khusus,185


###Mode ignore - `write.csv`

In [40]:
df_new.write.csv("mahasiswa_header", header=True, mode="ignore")

In [41]:
!cat mahasiswa_header/*

nama,jurusan,jalur,nilai
Citra,Fisika,Umum,170
nama,jurusan,jalur,nilai
Jaka,Biologi,Khusus,180


###Mode ignore - `write.format`

In [42]:
df_new.write.format("csv") \
    .option("header",True) \
    .mode("overwrite") \
    .save("mahasiswa_format_header")

In [43]:
!cat mahasiswa_format_header/*

nama,jurusan,jalur,nilai
Citra,Fisika,Umum,170
nama,jurusan,jalur,nilai
Jaka,Biologi,Khusus,180


##Menyimpan dengan kompresi

In [44]:
df.write.csv("mahasiswa_gzip", header=True, compression="gzip")

In [45]:
!ls -ls mahasiswa_gzip

total 8
4 -rw-r--r-- 1 root root 99 Feb 27 03:45 part-00000-333b118a-afcb-4480-bde1-f595afc1bc64-c000.csv.gz
4 -rw-r--r-- 1 root root 98 Feb 27 03:45 part-00001-333b118a-afcb-4480-bde1-f595afc1bc64-c000.csv.gz
0 -rw-r--r-- 1 root root  0 Feb 27 03:45 _SUCCESS


In [50]:
df.write.format("csv") \
    .option("header",True) \
    .option("sep","|") \
    .option("compression","gzip") \
    .save("mahasiswa_format_gzip")

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/content/mahasiswa_format_gzip already exists. Set mode as "overwrite" to overwrite the existing path.

In [51]:
!ls -ls mahasiswa_format_gzip

total 8
4 -rw-r--r-- 1 root root 98 Feb 27 03:45 part-00000-eac2a643-db44-4fa8-8ce7-5b0300aa0647-c000.csv.gz
4 -rw-r--r-- 1 root root 97 Feb 27 03:45 part-00001-eac2a643-db44-4fa8-8ce7-5b0300aa0647-c000.csv.gz
0 -rw-r--r-- 1 root root  0 Feb 27 03:45 _SUCCESS


##Menyimpan dalam satu file

In [48]:
!rm -rf mahasiswa_coalesce

In [49]:
df.coalesce(1) \
    .write.format("csv") \
    .option("header",True) \
    .save("mahasiswa_coalesce")

In [52]:
!ls -l mahasiswa_coalesce

total 4
-rw-r--r-- 1 root root 160 Feb 27 03:45 part-00000-50a23307-3e38-4ba3-ac22-28b52b04f4f3-c000.csv
-rw-r--r-- 1 root root   0 Feb 27 03:45 _SUCCESS


In [53]:
!cat mahasiswa_coalesce/*

nama,jurusan,jalur,nilai
Agus,Fisika,Umum,150
Windy,Fisika,Khusus,200
Budi,Biologi,Umum,170
Dina,Fisika,Khusus,180
Bayu,Fisika,Umum,160
Dedi,Biologi,Khusus,185
