# Create Hive schema for Parquet files

This notebook demonstrates how to create a hive table for existing Parquet files. <br>
This can be done for a single file as well as for multiple files residing under the same folder.

In [1]:
import os

### Creating of spark context with hive support.

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Import parquet schema to hive").config("hive.metastore.uris", "thrift://hive:9083").enableHiveSupport().getOrCreate()

Define function below for getting sql script needed for creating table in hive using dataframe types as columns to table

In [3]:
def getCreateTableScript(databaseName, tableName, path, df):
    cols = df.dtypes
    createScript = "CREATE EXTERNAL TABLE IF NOT EXISTS " + databaseName + "." + tableName + "("
    colArray = []
    for colName, colType in cols:
        colArray.append(colName.replace(" ", "_") + " " + colType)
    createColsScript =   ", ".join(colArray )
    
    script = createScript + createColsScript + ") STORED AS PARQUET LOCATION '" + path + "'"
    print(script)
    return script
    

In [4]:
#define main function for creating table where arqument 'path' is path to parquet files 
def createTable(databaseName, tableName, path): 
    df = spark.read.parquet(path)
    sqlScript = getCreateTableScript(databaseName, tableName, path, df)
    spark.sql(sqlScript)

## One file example

In [5]:
# Set the path where the parquet file is located.
my_parqute_file_path = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/userdata1.parquet')

createTable("default","tab1_single_file",my_parqute_file_path)

CREATE EXTERNAL TABLE IF NOT EXISTS default.tab1_single_file(registration_dttm timestamp, id int, first_name string, last_name string, email string, gender string, ip_address string, cc string, country string, birthdate string, salary double, title string, comments string) STORED AS PARQUET LOCATION 'v3io://users/adi/examples/userdata1.parquet'


## One folder example for spark output job

In [6]:
# Set the path where the parquet folder is located.
folder_path = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/spark-output/*')

createTable("default","table_from_dir",folder_path)

CREATE EXTERNAL TABLE IF NOT EXISTS default.table_from_dir(registration_dttm timestamp, id int, first_name string, last_name string, email string, gender string, ip_address string, cc string, country string, birthdate string, salary double, title string, comments string) STORED AS PARQUET LOCATION 'v3io://users/adi/examples/spark-output/*'


# Multiple files and folders example

In this example change the name of the database and path to the folder where all parquet files (or folders with them) are located. <br>
This code goes over all files and dirs in the provided path and uses them for creating tables.
File should be ended with .parquet format
Directory (in which stored parquet files) should be started with "."
Name of directory or file will be name of table.

In [14]:
databaseName = "default"
filepath = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/multiple-parquet-files')


for fileOrDir in os.listdir(filepath):
    if fileOrDir.endswith(".parquet") :
        createTable(databaseName, fileOrDir.split(".parquet")[0], filepath.replace("/v3io/", "v3io://", 1) + "/" + fileOrDir)
    elif not fileOrDir.startswith(".") :
        createTable(databaseName, fileOrDir, filepath.replace("/v3io/", "v3io://", 1) + "/" + fileOrDir + "/*")



FileNotFoundError: [Errno 2] No such file or directory: 'v3io://users/adi/examples/multiple-parquet-files'

# Test how it works

In [7]:
# test how the tables were saved
#spark.sql("drop database test CASCADE")
databaseName = "default"

spark.sql("show databases").show()
spark.sql("show tables in " + databaseName).show()

+------------+
|databaseName|
+------------+
|     default|
+------------+

+--------+----------------+-----------+
|database|       tableName|isTemporary|
+--------+----------------+-----------+
| default|tab1_single_file|      false|
| default|  table_from_dir|      false|
+--------+----------------+-----------+



In [8]:
# test how saving to table works
tableName = "table_from_dir"
spark.sql("select * from " + databaseName + "." + tableName)

DataFrame[registration_dttm: timestamp, id: int, first_name: string, last_name: string, email: string, gender: string, ip_address: string, cc: string, country: string, birthdate: string, salary: double, title: string, comments: string]

### Run select via Hive

In [9]:
%sql select * from hive.default.tab1_single_file limit 10

Done.


registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
2016-02-03 07:55:29.000,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0
2016-02-03 06:47:06.000,8,Harry,Howell,hhowell7@eepurl.com,Male,91.235.51.73,,Bosnia and Herzegovina,3/1/1962,186469.43,Web Developer IV,
2016-02-03 03:52:53.000,9,Jose,Foster,jfoster8@yelp.com,Male,132.31.53.61,,South Korea,3/27/1992,231067.84,Software Test Engineer I,100.0
2016-02-03 18:29:47.000,10,Emily,Stewart,estewart9@opensource.org,Female,143.28.251.245,3574254110301671.0,Nigeria,1/28/1997,27234.28,Health Coach IV,
2016-02-03 00:36:21.000,4,Denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,4/8/1997,90263.05,Senior Cost Accountant,
2016-02-03 05:05:31.000,5,Carlos,Burns,cburns4@miitbeian.gov.cn,,169.113.235.40,5602256255204850.0,South Africa,,,,
2016-02-03 07:22:34.000,6,Kathryn,White,kwhite5@google.com,Female,195.131.81.179,3583136326049310.0,Indonesia,2/25/1983,69227.11,Account Executive,
2016-02-03 08:33:08.000,7,Samuel,Holmes,sholmes6@foxnews.com,Male,232.234.81.197,3582641366974690.0,Portugal,12/18/1987,14247.62,Senior Financial Analyst,
2016-02-03 17:04:03.000,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,
2016-02-03 01:09:31.000,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,2/1/1960,144972.51,Structural Engineer,


### Access Hive from command line

In order to run Hive from command line,open up a jupyter terminal and run "hive" <br>
To view all existing hive tables run: show tables; <br>
Here you can run queries without specifying Hive. <br>
e.g. select * from table_from_single_file2;

In [10]:
spark.sql("drop table " + databaseName + ".tab1_single_file")

DataFrame[]

In [11]:
spark.sql("drop table " + databaseName + ".table_from_dir")

DataFrame[]