Import package yang akan kita gunakan

In [13]:
import pyspark
from pyspark.sql import SparkSession

Untuk melakukan koneksi ke Hive, kita perlu menjalankan fungsi enableHiveSupport() pada saat membuat spark session

In [14]:
# Build (or reuse) the Spark session for this notebook.
# enableHiveSupport() is what lets spark.sql() reach the Hive metastore tables.
spark = (
    SparkSession.builder
    .appName('Hive Basics')
    .enableHiveSupport()
    .getOrCreate()
)

23/10/02 13:34:06 WARN Utils: Your hostname, dl247-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.15.130 instead (on interface ens33)
23/10/02 13:34:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/10/02 13:34:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Menjalankan perintah SHOW dan DESCRIBE

Untuk menjalankan SQL command ke dalam Hive, kita gunakan fungsi `spark.sql()`. Fungsi ini mengembalikan spark DataFrame, sehingga untuk menampilkannya kita perlu memanggil fungsi `show()`

In [17]:
# List the databases visible to this session.
databases_df = spark.sql("show databases")
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
|   mytest|
+---------+



Kita akan menggunakan database "mytest" untuk latihan ini

In [18]:
# List the tables registered in the mytest database.
mytest_tables_df = spark.sql("show tables from mytest")
mytest_tables_df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|   mytest|          emp|      false|
|   mytest|      emp_ext|      false|
|   mytest|  emp_landing|      false|
|   mytest|      emp_orc|      false|
|   mytest|     employee|      false|
|   mytest|    employee1|      false|
|   mytest| employee_ext|      false|
|   mytest|    mahasiswa|      false|
|   mytest|mahasiswa_ext|      false|
|   mytest|          mhs|      false|
+---------+-------------+-----------+



                                                                                

In [19]:
# Show the column names and data types of mytest.employee.
employee_schema_df = spark.sql("describe mytest.employee")
employee_schema_df.show()

+-----------------+---------+-------+
|         col_name|data_type|comment|
+-----------------+---------+-------+
|        firstname|   string|   null|
|         lastname|   string|   null|
|            email|   string|   null|
|           gender|   string|   null|
|              age|      int|   null|
|         jobtitle|   string|   null|
|yearsofexperience|   bigint|   null|
|           salary|      int|   null|
|       department|   string|   null|
|        datestamp|     date|   null|
+-----------------+---------+-------+



In [20]:
# Columns plus the detailed table information for mytest.employee.
# truncate=False prints each cell in full instead of cutting long values.
employee_details_df = spark.sql("describe formatted mytest.employee")
employee_details_df.show(truncate=False)

+----------------------------+----------------------------+-------+
|col_name                    |data_type                   |comment|
+----------------------------+----------------------------+-------+
|firstname                   |string                      |null   |
|lastname                    |string                      |null   |
|email                       |string                      |null   |
|gender                      |string                      |null   |
|age                         |int                         |null   |
|jobtitle                    |string                      |null   |
|yearsofexperience           |bigint                      |null   |
|salary                      |int                         |null   |
|department                  |string                      |null   |
|datestamp                   |date                        |null   |
|                            |                            |       |
|# Detailed Table Information|                  

In [21]:
# Same detailed description, this time for mytest.employee_ext.
employee_ext_details_df = spark.sql("describe formatted mytest.employee_ext")
employee_ext_details_df.show(truncate=False)

+----------------------------+----------------------------+-------+
|col_name                    |data_type                   |comment|
+----------------------------+----------------------------+-------+
|firstname                   |string                      |null   |
|lastname                    |string                      |null   |
|email                       |string                      |null   |
|gender                      |string                      |null   |
|age                         |int                         |null   |
|jobtitle                    |string                      |null   |
|yearsofexperience           |bigint                      |null   |
|salary                      |int                         |null   |
|department                  |string                      |null   |
|datestamp                   |date                        |null   |
|                            |                            |       |
|# Detailed Table Information|                  

## Melakukan query ke tabel Hive 



In [22]:
# Preview five rows of mytest.employee.
employee_sample_df = spark.sql("SELECT * FROM mytest.employee limit 5")
employee_sample_df.show()

[Stage 2:>                                                          (0 + 1) / 1]

+---------+--------+--------------------+------+---+--------------------+-----------------+------+----------+----------+
|firstname|lastname|               email|gender|age|            jobtitle|yearsofexperience|salary|department| datestamp|
+---------+--------+--------------------+------+---+--------------------+-----------------+------+----------+----------+
|     Jose|   Lopez|joselopez0944@sli...|  male| 25|     Project Manager|                1|  8500|   Product|2023-09-01|
|    Diane|  Carter|dianecarter1228@s...|female| 26|Machine Learning ...|                2|  7000|   Product|2023-09-01|
|    Shawn|  Foster|shawnfoster2695@s...|  male| 37|     Project Manager|               14| 17000|   Product|2023-09-01|
|   Brenda|  Fisher|brendafisher3185@...|female| 31|       Web Developer|                8| 10000|   Product|2023-09-01|
|     Sean|  Hunter|seanhunter4753@sl...|  male| 35|     Project Manager|               11| 14500|   Product|2023-09-01|
+---------+--------+------------

                                                                                

In [26]:
# Count employees for every (gender, department) combination.
# Grouping is on the raw column values, so differently cased or abbreviated
# spellings are reported as separate groups.
gender_department_counts_df = spark.sql("""SELECT count(*), gender, department FROM mytest.employee 
            GROUP BY gender, department""")
gender_department_counts_df.show()

+--------+------+--------------+
|count(1)|gender|    department|
+--------+------+--------------+
|       1|female|       product|
|     159|  male|       Product|
|     143|female|       Product|
|       1|female|human resource|
|       5|  male|Human Resource|
|       2|     M|       Product|
|       2|  male|       product|
|       5|female|Human Resource|
|       2|     F|       Product|
+--------+------+--------------+



## Membuat managed tabel dari dataframe

Kita bisa membuat tabel dari sebuah dataframe. Untuk itu kita buat dataframenya terlebih dahulu

In [27]:
# Sample rows: student name, department code, and three scores.
data = [
    ['Agus', 'F', 100, 150, 150],
    ['Windy', 'F', 200, 150, 180],
    ['Budi', 'B', 200, 100, 150],
    ['Dina', 'F', 150, 150, 130],
    ['Bayu', 'F', 50, 150, 100],
    ['Dedi', 'B', 50, 100, 100],
]

# Column names, in the same order as the values in each row above.
kolom = ["nama", "kode_jurusan", "nilai1", "nilai2", "nilai3"]

# Column types are inferred by Spark from the Python values.
df = spark.createDataFrame(data, schema=kolom)
df.show()

                                                                                

+-----+------------+------+------+------+
| nama|kode_jurusan|nilai1|nilai2|nilai3|
+-----+------------+------+------+------+
| Agus|           F|   100|   150|   150|
|Windy|           F|   200|   150|   180|
| Budi|           B|   200|   100|   150|
| Dina|           F|   150|   150|   130|
| Bayu|           F|    50|   150|   100|
| Dedi|           B|    50|   100|   100|
+-----+------------+------+------+------+



In [None]:
# Optional manual cleanup, left disabled on purpose: uncomment the line below
# to drop mytest.mahasiswa.
# NOTE(review): the cell that writes mytest.mahasiswa uses mode 'overwrite',
# so this drop looks unnecessary for a normal run; confirm before enabling.
#spark.sql("drop table mytest.mahasiswa")

Untuk menyimpan sebuah dataframe menjadi tabel, kita menggunakan perintah `DataFrameWriter.saveAsTable()`. Ada beberapa parameter yang bisa kita atur, di antaranya **mode**, yang menyediakan pilihan nilai: *append, overwrite, ignore, error, errorifexists* (*error* dan *errorifexists* setara dan merupakan nilai default).

Untuk contoh ini kita pilih mode *overwrite*, dan kita beri nama tabelnya *mahasiswa*

In [28]:
# Persist the DataFrame as the table mytest.mahasiswa.
# No path is given here; mode 'overwrite' replaces any existing table contents.
df.write.saveAsTable("mytest.mahasiswa", mode='overwrite')

                                                                                

In [29]:
# List the tables in mytest again to confirm that mahasiswa is present.
mytest_tables_after_save_df = spark.sql("show tables from mytest")
mytest_tables_after_save_df.show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|   mytest|          emp|      false|
|   mytest|      emp_ext|      false|
|   mytest|  emp_landing|      false|
|   mytest|      emp_orc|      false|
|   mytest|     employee|      false|
|   mytest|    employee1|      false|
|   mytest| employee_ext|      false|
|   mytest|    mahasiswa|      false|
|   mytest|mahasiswa_ext|      false|
|   mytest|          mhs|      false|
+---------+-------------+-----------+



In [30]:
# Inspect the columns and detailed table information of mytest.mahasiswa.
mahasiswa_details_df = spark.sql("describe formatted mytest.mahasiswa")
mahasiswa_details_df.show(truncate=False)

+----------------------------+--------------------------------------------------------------+-------+
|col_name                    |data_type                                                     |comment|
+----------------------------+--------------------------------------------------------------+-------+
|nama                        |string                                                        |null   |
|kode_jurusan                |string                                                        |null   |
|nilai1                      |bigint                                                        |null   |
|nilai2                      |bigint                                                        |null   |
|nilai3                      |bigint                                                        |null   |
|                            |                                                              |       |
|# Detailed Table Information|                                                    

In [39]:
# Read the saved table back to verify its contents.
mahasiswa_rows_df = spark.sql("select * from mytest.mahasiswa")
mahasiswa_rows_df.show()

+-----+------------+------+------+------+
| nama|kode_jurusan|nilai1|nilai2|nilai3|
+-----+------------+------+------+------+
| Agus|           F|   100|   150|   150|
|Windy|           F|   200|   150|   180|
| Budi|           B|   200|   100|   150|
| Dina|           F|   150|   150|   130|
| Bayu|           F|    50|   150|   100|
| Dedi|           B|    50|   100|   100|
+-----+------------+------+------+------+



## Membuat External Tabel dari DataFrame

In [None]:
# List the HDFS directory that will hold the external table data.
!hdfs dfs -ls /user/hadoop/mydata

In [None]:
!hdfs dfs -mkdir /user/hadoop/mydata/mahasiswa

In [35]:
# Persist the DataFrame as mytest.mahasiswa_ext with an explicit storage path,
# so the data files are written under that HDFS directory.
df.write.saveAsTable(
    "mytest.mahasiswa_ext",
    mode='overwrite',
    path="hdfs://127.0.0.1:9000/user/hadoop/mydata/mahasiswa",
)

In [33]:
# Inspect the columns and detailed table information of mytest.mahasiswa_ext.
mahasiswa_ext_details_df = spark.sql("describe extended mytest.mahasiswa_ext")
mahasiswa_ext_details_df.show(truncate=False)

+----------------------------+--------------------------------------------------------------+-------+
|col_name                    |data_type                                                     |comment|
+----------------------------+--------------------------------------------------------------+-------+
|nama                        |string                                                        |null   |
|kode_jurusan                |string                                                        |null   |
|nilai1                      |bigint                                                        |null   |
|nilai2                      |bigint                                                        |null   |
|nilai3                      |bigint                                                        |null   |
|                            |                                                              |       |
|# Detailed Table Information|                                                    

In [41]:
# Read the table with the explicit path back to verify its contents.
mahasiswa_ext_rows_df = spark.sql("SELECT * FROM mytest.mahasiswa_ext")
mahasiswa_ext_rows_df.show()

+-----+------------+------+------+------+
| nama|kode_jurusan|nilai1|nilai2|nilai3|
+-----+------------+------+------+------+
| Agus|           F|   100|   150|   150|
|Windy|           F|   200|   150|   180|
| Budi|           B|   200|   100|   150|
| Dina|           F|   150|   150|   130|
| Bayu|           F|    50|   150|   100|
| Dedi|           B|    50|   100|   100|
+-----+------------+------+------+------+



In [32]:
# List the files under the HDFS path that was passed to saveAsTable for
# mytest.mahasiswa_ext.
!hdfs dfs -ls /user/hadoop/mydata/mahasiswa

Found 3 items
-rw-r--r--   3 hadoop supergroup          0 2023-10-02 09:15 /user/hadoop/mydata/mahasiswa/_SUCCESS
-rw-r--r--   3 hadoop supergroup       1645 2023-10-02 09:15 /user/hadoop/mydata/mahasiswa/part-00000-186a9e72-b888-4439-878b-bbd9298c293a-c000.snappy.parquet
-rw-r--r--   3 hadoop supergroup       1641 2023-10-02 09:15 /user/hadoop/mydata/mahasiswa/part-00001-186a9e72-b888-4439-878b-bbd9298c293a-c000.snappy.parquet


## Membuat Managed Tabel dengan CREATE TABLE

In [None]:
# Reset the demo tables so the CREATE TABLE cells below start from a clean state.
# IF EXISTS keeps this cell from raising when a table is absent
# (first run against a fresh metastore, or the cell being executed twice).
spark.sql("DROP TABLE IF EXISTS mytest.emp")
spark.sql("DROP TABLE IF EXISTS mytest.emp_ext")

In [None]:
# Create the table mytest.emp (no EXTERNAL keyword, no LOCATION), stored as ORC.
# IF NOT EXISTS makes the statement a no-op when the table is already there.
# The statement deliberately has no trailing ';': spark.sql() takes a single
# statement, and not every Spark SQL parser version accepts a terminator.
spark.sql("""CREATE TABLE IF NOT EXISTS mytest.emp(
firstname STRING,
lastname STRING,
email STRING,
gender STRING,
age INT,
jobtitle STRING,
yearsofexperience BIGINT,
salary INT,
department STRING)
STORED AS ORC""")

In [None]:
# Inspect the columns and detailed table information of mytest.emp.
emp_details_df = spark.sql("describe extended mytest.emp")
emp_details_df.show(truncate=False)

In [None]:
# List the local Downloads directory.
# NOTE(review): no other visible cell reads from this path (later cells use
# /home/hadoop/datasets); confirm this cell is still needed.
!ls -l /home/hadoop/Downloads

In [None]:
# List the HDFS dataset directory.
# NOTE(review): no other visible cell reads from this path (later cells use
# /user/hadoop/mydata); confirm this cell is still needed.
!hdfs dfs -ls /user/hadoop/dataset

In [None]:
# Row count of mytest.emp at this point, before the INSERT further below.
emp_row_count_df = spark.sql("select count(*) from mytest.emp")
emp_row_count_df.show()

## Membuat External Table dengan CREATE TABLE

In [None]:
# List the local datasets directory before downloading emp.csv.
!ls -l /home/hadoop/datasets

In [None]:
!wget -P /home/hadoop/datasets https://github.com/urfie/temp/raw/main/emp.csv 

In [None]:
# List the local datasets directory again to confirm emp.csv was downloaded.
!ls -l /home/hadoop/datasets

In [None]:
# List the HDFS parent directory before creating the emp subdirectory.
!hdfs dfs -ls /user/hadoop/mydata

In [None]:
!hdfs dfs -mkdir /user/hadoop/mydata/emp

In [None]:
!hdfs dfs -put /home/hadoop/datasets/emp.csv /user/hadoop/mydata/emp

In [None]:
# Define the external table mytest.emp_ext over the comma-delimited text files
# under the LOCATION directory (the directory emp.csv is uploaded to with
# `hdfs dfs -put`).
# IF NOT EXISTS matches the managed-table DDL and lets the cell be re-run
# without a "table already exists" error.
spark.sql("""CREATE EXTERNAL TABLE IF NOT EXISTS mytest.emp_ext(
firstname STRING,
lastname STRING,
email STRING,
gender STRING,
age INT,
jobtitle STRING,
yearsofexperience BIGINT,
salary INT,
department STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'hdfs://127.0.0.1:9000/user/hadoop/mydata/emp'""")

In [None]:
# Row count of the external table, i.e. rows parsed from the files at its LOCATION.
emp_ext_row_count_df = spark.sql("select count(*) from mytest.emp_ext")
emp_ext_row_count_df.show()

In [None]:
# Inspect the columns and detailed table information of mytest.emp_ext.
emp_ext_details_df = spark.sql("describe extended mytest.emp_ext")
emp_ext_details_df.show(truncate=False)

In [None]:
# Preview five rows of the external table.
emp_ext_sample_df = spark.sql("select * from mytest.emp_ext limit 5")
emp_ext_sample_df.show()

## Insert into Hive Table from External Table

In [None]:
# Copy every row of the external table mytest.emp_ext into mytest.emp.
# Columns are listed explicitly (in mytest.emp's column order) so the load does
# not silently depend on both tables keeping an identical column order.
# No trailing ';': spark.sql() takes a single statement, and not every Spark
# SQL parser version accepts a terminator.
# NOTE: INSERT INTO appends. Re-running this cell without first recreating
# mytest.emp loads the same rows again.
spark.sql("""
    INSERT INTO mytest.emp
    SELECT firstname, lastname, email, gender, age,
           jobtitle, yearsofexperience, salary, department
    FROM mytest.emp_ext
""")

In [None]:
# Row count of mytest.emp after the INSERT.
emp_loaded_count_df = spark.sql("select count(*) from mytest.emp")
emp_loaded_count_df.show()

In [None]:
# Preview five rows of mytest.emp.
emp_sample_df = spark.sql("select * from mytest.emp limit 5")
emp_sample_df.show()

## Menjalankan fungsi Hive 

In [37]:
# Apply the built-in lower() function to three string columns of mytest.emp.
lowercase_query = "select lower(firstname), lower(lastname), lower(department) from mytest.emp limit 5"
emp_lowercase_df = spark.sql(lowercase_query)
emp_lowercase_df.show()

+----------------+---------------+-----------------+
|lower(firstname)|lower(lastname)|lower(department)|
+----------------+---------------+-----------------+
|            jose|          lopez|          product|
|           diane|         carter|          product|
|           shawn|         foster|          product|
|          brenda|         fisher|          product|
|            sean|         hunter|          product|
+----------------+---------------+-----------------+

