# Create Hive schema for Parquet files

This notebook demonstrates how to create a Hive table on top of existing Parquet files. <br>
This can be done for a single file as well as for multiple files residing in the same folder.

In [1]:
import os

### Create a Spark session with Hive support

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse) a Hive-enabled Spark session pointed at the configured metastore URI.
spark = (
    SparkSession.builder
    .appName("Import parquet schema to hive")
    .config("hive.metastore.uris", "thrift://trino-hive:9083")
    .enableHiveSupport()
    .getOrCreate()
)

Define a function that builds the SQL script needed to create the Hive table, using the DataFrame's column names and types as the table columns.

In [3]:
def getCreateTableScript(databaseName, tableName, path, df, partitions=None):
    """Build, print and return a CREATE EXTERNAL TABLE statement for Parquet data.

    Args:
        databaseName: Database name used to qualify the table.
        tableName: Name of the table to create.
        path: Location of the Parquet data; inserted verbatim into the
            LOCATION clause.
        df: DataFrame whose ``dtypes`` supply the column names and types.
        partitions: Optional list of partition specs, each written as
            ``"<column> <type>"`` (for example ``"gender string"``).
            ``None`` or an empty list produces a non-partitioned table.

    Returns:
        The generated SQL statement as a string.
    """
    # Tolerate None, surrounding whitespace and blank entries in the spec list.
    partition_specs = [spec.strip() for spec in (partitions or []) if spec.strip()]

    # Remove partition columns from the frame so they are not declared twice.
    # split() with no argument also copes with tabs and repeated spaces, so the
    # column name is always the first token of the spec.
    partition_names = [spec.split()[0] for spec in partition_specs]
    data_df = df.drop(*partition_names)

    # Spaces in column names are replaced with underscores.
    column_defs = [
        colName.replace(" ", "_") + " " + colType
        for colName, colType in data_df.dtypes
    ]

    partition_clause = ""
    if partition_specs:
        partition_clause = "PARTITIONED BY (" + ", ".join(partition_specs) + ") "

    script = (
        "CREATE EXTERNAL TABLE " + databaseName + "." + tableName + "("
        + ", ".join(column_defs) + ") "
        + partition_clause
        + " STORED AS PARQUET LOCATION '" + path + "'"
    )
    print(script)
    return script
    

In [4]:
# Main function for creating a table; the 'path' argument is the path to the Parquet file(s).
def createTable(databaseName, tableName, path, partitions=[]):
    """Create a table over the Parquet data found at ``path``.

    Reads the Parquet data with the global ``spark`` session, builds the
    creation statement with ``getCreateTableScript`` and executes it. When
    partition specs are supplied, ``msck repair table`` is run afterwards on
    the new table.
    """
    parquet_df = spark.read.parquet(path)
    ddl = getCreateTableScript(databaseName, tableName, path, parquet_df, partitions)
    spark.sql(ddl)

    # Nothing more to do for a non-partitioned table.
    if len(partitions) == 0:
        return
    spark.sql(f'msck repair table {databaseName}.{tableName}')

## One file example

In [5]:
# Set the path where the Parquet file is located.
# os.environ[...] is used so that a missing V3IO_USERNAME fails with a KeyError
# naming the variable instead of an opaque str/None concatenation TypeError.
my_parqute_file_path = 'v3io://users/' + os.environ['V3IO_USERNAME'] + '/examples/userdata1.parquet'

createTable("default", "tab1_single_file", my_parqute_file_path)

CREATE EXTERNAL TABLE default.tab1_single_file(registration_dttm timestamp, id int, first_name string, last_name string, email string, gender string, ip_address string, cc string, country string, birthdate string, salary double, title string, comments string)  STORED AS PARQUET LOCATION 'v3io://users/dani/examples/userdata1.parquet'


In [6]:
# NOTE(review): the output recorded for this cell is a connection error
# ("$DATABASE_URL not set, and no connect string given"), so the %sql magic
# presumably needs a connection string or DATABASE_URL configured first — confirm.
%sql select * from hive.default.tab1_single_file limit 10

Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])


## One folder example for spark output job

In [7]:
# Set the path where the Parquet folder is located.
# os.environ[...] is used so that a missing V3IO_USERNAME fails with a KeyError
# naming the variable instead of an opaque str/None concatenation TypeError.
folder_path = 'v3io://users/' + os.environ['V3IO_USERNAME'] + '/examples/spark-output/'

createTable("default", "table_from_dir", folder_path)

CREATE EXTERNAL TABLE default.table_from_dir(registration_dttm timestamp, id int, first_name string, last_name string, email string, gender string, ip_address string, cc string, country string, birthdate string, salary double, title string, comments string)  STORED AS PARQUET LOCATION 'v3io://users/dani/examples/spark-output/'


In [8]:
# NOTE(review): the output recorded for this cell is a connection error
# ("$DATABASE_URL not set, and no connect string given"), so the %sql magic
# presumably needs a connection string or DATABASE_URL configured first — confirm.
%sql select * from hive.default.table_from_dir limit 10

Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])


# Partitioned parquet example

Table partitioning is a common optimization approach used in systems like Hive. In a partitioned table, data are usually stored in different directories, with partitioning column values encoded in the path of each partition directory.

In [9]:
# Set the path of the folder that contains the Parquet partition directories.
# os.environ[...] is used so that a missing V3IO_USERNAME fails with a KeyError
# naming the variable instead of an opaque str/None concatenation TypeError.
folder_path = 'v3io://users/' + os.environ['V3IO_USERNAME'] + '/examples/partitioned_pq'
# Provide the list of partition columns, each with its type.
partition_list = ["gender string"]
createTable("default", "partitioned_table", folder_path, partition_list)

CREATE EXTERNAL TABLE default.partitioned_table(registration_dttm timestamp, id int, first_name string, last_name string, email string, ip_address string, cc string, country string, birthdate string, salary double, title string, comments string) PARTITIONED BY (gender string)  STORED AS PARQUET LOCATION 'v3io://users/dani/examples/partitioned_pq'


In [10]:
# NOTE(review): the output recorded for this cell is a connection error
# ("$DATABASE_URL not set, and no connect string given"), so the %sql magic
# presumably needs a connection string or DATABASE_URL configured first — confirm.
%sql select * from hive.default.partitioned_table limit 10

Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])


# Adding new partitions

In [11]:
# After new partitions have been added to the table, run the command below
# so that the Hive metastore becomes aware of the new files.
spark.sql('msck repair table default.partitioned_table')

DataFrame[]

# Browse the Metastore

In [12]:
# Check how the tables were registered: list the databases, then the tables in one of them.
databaseName = "default"

spark.sql("show databases").show()
spark.sql(f"show tables in {databaseName}").show()

+---------+
|namespace|
+---------+
|  default|
+---------+

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|  default|        csv_table|      false|
|  default|            demo1|      false|
|  default|            demo2|      false|
|  default|         example2|      false|
|  default|partitioned_table|      false|
|  default| tab1_single_file|      false|
|  default|   table_from_dir|      false|
|  default|  table_from_dir2|      false|
+---------+-----------------+-----------+



### Access Hive from command line

To run Hive from the command line, open a Jupyter terminal and run `hive`. <br>
To view all existing Hive tables, run: `show tables;` <br>
Here you can run queries without specifying the `hive` prefix. <br>
For example: `select * from tab1_single_file;`

## Cleanup
This will only clean the metastore definitions.
<br>The underlying data won't be affected.

In [13]:
# Remove only the metastore definition of the single-file table.
spark.sql(f"drop table {databaseName}.tab1_single_file")

DataFrame[]

In [14]:
# Remove only the metastore definition of the folder-based table.
spark.sql(f"drop table {databaseName}.table_from_dir")

DataFrame[]

In [15]:
# Remove only the metastore definition of the partitioned table.
spark.sql(f"drop table {databaseName}.partitioned_table")

DataFrame[]