# This notebook helps to automatically import Parquet schemas into Hive

The cell below imports all the needed dependencies. The paths where the parquet files are located are set in the example cells further down.

In [1]:
import os

The next cell creates a Spark session with Hive support.

In [2]:
from pyspark.sql import SparkSession
# Hive-enabled Spark session; the metastore is addressed through the thrift URI configured below.
spark = (
    SparkSession.builder
    .appName("Import parquet schema to hive")
    .config("hive.metastore.uris", "thrift://hive:9083")
    .enableHiveSupport()
    .getOrCreate()
)

The function below builds the SQL script needed to create a table in Hive, using the DataFrame's column names and types as the table columns.

In [3]:
def getCreateTableScript(databaseName, tableName, df, location=None):
    """Build the ``CREATE EXTERNAL TABLE`` statement for a DataFrame.

    Each ``(name, type)`` pair in ``df.dtypes`` becomes one column definition;
    spaces in column names are replaced with underscores.

    Args:
        databaseName: Name of the database that will own the table.
        tableName: Name of the table to create.
        df: Object exposing ``dtypes`` as an iterable of ``(column name,
            column type)`` string pairs.
        location: Value placed in the ``LOCATION`` clause. Defaults to
            ``tableName``, which keeps the statement identical to what this
            function produced before the parameter existed. Pass the directory
            holding the Parquet files to point the external table at them.

    Returns:
        The SQL statement as a string. It is also printed.
    """
    if location is None:
        # Backward-compatible default: the table name doubles as the location.
        location = tableName

    colArray = [
        colName.replace(" ", "_") + " " + colType
        for colName, colType in df.dtypes
    ]

    script = (
        "CREATE EXTERNAL TABLE " + databaseName + "." + tableName + "("
        + ", ".join(colArray)
        + ") STORED AS PARQUET LOCATION '" + location + "'"
    )
    print(script)
    return script
    

In [4]:
def createTable(databaseName, tableName, path):
    """Create a table whose columns mirror the Parquet data found at ``path``.

    ``path`` is handed to ``spark.read.parquet`` to obtain a DataFrame. That
    DataFrame is passed to ``getCreateTableScript`` and the resulting
    statement is executed with ``spark.sql``. ``path`` itself is not forwarded
    to ``getCreateTableScript``.

    Args:
        databaseName: Database in which the table is created.
        tableName: Name of the table to create.
        path: Path to the Parquet file(s) to read.
    """
    parquetDf = spark.read.parquet(path)
    spark.sql(getCreateTableScript(databaseName, tableName, parquetDf))

## One file example

In [5]:
# Location of the single parquet file to register as a table.
# The value is already a complete URI, so it is assigned directly.
my_parqute_file_path = 'v3io://bigdata/examples/example1.parquet'
createTable("test", "table_from_single_file", my_parqute_file_path)

CREATE EXTERNAL TABLE test.table_from_single_file(id bigint, diagnosis string, radius_mean double, texture_mean double, perimeter_mean double, area_mean double, smoothness_mean double, compactness_mean double, concavity_mean double, concave_points_mean double, symmetry_mean double, fractal_dimension_mean double, radius_se double, texture_se double, perimeter_se double, area_se double, smoothness_se double, compactness_se double, concavity_se double, concave_points_se double, symmetry_se double, fractal_dimension_se double, radius_worst double, texture_worst double, perimeter_worst double, area_worst double, smoothness_worst double, compactness_worst double, concavity_worst double, concave_points_worst double, symmetry_worst double, fractal_dimension_worst double) STORED AS PARQUET LOCATION 'table_from_single_file'


## One folder example

In [6]:
# Location of the folder holding the parquet files; the trailing /* selects everything inside it.
# The value is already a complete URI, so it is assigned directly.
folder_path = 'v3io://users/admin/examples/parquet_examples/dir1/*'
createTable("test", "table_from_dir", folder_path)

CREATE EXTERNAL TABLE test.table_from_dir(id bigint, diagnosis string, radius_mean double, texture_mean double, perimeter_mean double, area_mean double, smoothness_mean double, compactness_mean double, concavity_mean double, concave_points_mean double, symmetry_mean double, fractal_dimension_mean double, radius_se double, texture_se double, perimeter_se double, area_se double, smoothness_se double, compactness_se double, concavity_se double, concave_points_se double, symmetry_se double, fractal_dimension_se double, radius_worst double, texture_worst double, perimeter_worst double, area_worst double, smoothness_worst double, compactness_worst double, concavity_worst double, concave_points_worst double, symmetry_worst double, fractal_dimension_worst double) STORED AS PARQUET LOCATION 'table_from_dir'


# Multiple files and folders example

Set the name of the database and the path to the folder where all the parquet files (or folders containing them) are located. The database must already exist.
The code in this cell goes over all files and directories in the provided path and creates a table from each of them.
A file name must end with the .parquet extension.
A directory (in which parquet files are stored) must not start with "."; directories whose names start with "." are skipped.
The name of the directory or file (without the .parquet extension) becomes the name of the table.

In [7]:
databaseName = "test"
filepath = "/v3io/users/admin/examples/parquet_examples"

# os.listdir works on the local /v3io/ path, while createTable is given the
# v3io:// form of the same location. Translate the prefix once, outside the loop.
sparkBasePath = filepath.replace("/v3io/", "v3io://", 1)

# sorted() gives a deterministic creation order (os.listdir order is arbitrary).
for fileOrDir in sorted(os.listdir(filepath)):
    if fileOrDir.endswith(".parquet"):
        # Entry named *.parquet: the table name is the part before ".parquet".
        createTable(databaseName, fileOrDir.split(".parquet")[0], sparkBasePath + "/" + fileOrDir)
    elif not fileOrDir.startswith(".") and os.path.isdir(os.path.join(filepath, fileOrDir)):
        # Non-hidden directory: read every file inside it; the directory name is the table name.
        # The isdir check keeps stray regular files (not *.parquet) from being
        # treated as directories.
        createTable(databaseName, fileOrDir, sparkBasePath + "/" + fileOrDir + "/*")



CREATE EXTERNAL TABLE test.dir1(id bigint, diagnosis string, radius_mean double, texture_mean double, perimeter_mean double, area_mean double, smoothness_mean double, compactness_mean double, concavity_mean double, concave_points_mean double, symmetry_mean double, fractal_dimension_mean double, radius_se double, texture_se double, perimeter_se double, area_se double, smoothness_se double, compactness_se double, concavity_se double, concave_points_se double, symmetry_se double, fractal_dimension_se double, radius_worst double, texture_worst double, perimeter_worst double, area_worst double, smoothness_worst double, compactness_worst double, concavity_worst double, concave_points_worst double, symmetry_worst double, fractal_dimension_worst double) STORED AS PARQUET LOCATION 'dir1'
CREATE EXTERNAL TABLE test.example1(id bigint, diagnosis string, radius_mean double, texture_mean double, perimeter_mean double, area_mean double, smoothness_mean double, compactness_mean double, concavity_mean

# Test how it works

In [26]:
# Inspect which databases and tables are registered.
# To remove the whole database instead, run:
#spark.sql("drop database test CASCADE")
# NOTE: this drops <databaseName>.example1. IF EXISTS keeps the cell re-runnable
# once the table is already gone.
spark.sql("drop table if exists " + databaseName + ".example1")
spark.sql("show databases").show()
spark.sql("show tables in " + databaseName).show()

+------------+
|databaseName|
+------------+
|     default|
|        test|
+------------+

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [None]:
# Query the registered table. spark.sql() only returns a lazy DataFrame, so
# .show() is needed to read and print rows; 5 rows keep the output short.
# NOTE(review): an earlier cell drops <databaseName>.example1, so the table has to
# exist (be re-created) before this cell is run.
tableName = "example1"
spark.sql("select * from " + databaseName + "." + tableName).show(5)