In [None]:
from pyhive import hive
from common_vars import *

In [None]:
# USE WITH CAUTION

# Deletes and recreates the data and checkpoint directories in HDFS.



def reset_dir(hdfs_path):
    """Wipe ``hdfs_path`` and recreate it with a ``sparkcheckpoint`` subdirectory.

    Destructive: when the path already exists it is deleted recursively.
    Afterwards ``<hdfs_path>/sparkcheckpoint`` is created and both that
    subdirectory and ``hdfs_path`` itself are chmod-ed to 0o777.

    Parameters
    ----------
    hdfs_path : str
        Directory to reset, addressed through the module-level ``fs`` client.
    """
    if not fs.exists(hdfs_path):
        print("no such dir: " + hdfs_path)
    else:
        fs.delete(hdfs_path, recursive=True)
        print("directory deleted: " + hdfs_path)

    checkpoint_dir = hdfs_path + '/sparkcheckpoint'
    fs.mkdir(checkpoint_dir)

    # Report the first listing entry of the freshly created directory.
    first_entry = fs.ls(hdfs_path)[0]
    print("directory created: " + first_entry)

    # Same order as before: checkpoint subdirectory first, then its parent.
    for target in (checkpoint_dir, hdfs_path):
        fs.chmod(target, 0o777)

In [None]:
# queries should be a list of strings, e.g. ["select from...", "insert into..."];
# a single statement string is also accepted.
def run_queries(queries):
    """Execute HiveQL statements in order on the shared ``hive_cnx`` connection.

    Parameters
    ----------
    queries : list of str, or str
        Statements to execute one after another on a single cursor. A bare
        string is treated as one statement; iterating over it would otherwise
        send every character to Hive as a separate query.

    Returns
    -------
    list or None
        The rows of the *last* statement when it produced a result set,
        otherwise ``None``. ``None`` is also returned when ``queries`` is empty.
    """
    if isinstance(queries, str):
        queries = [queries]

    with hive_cnx.cursor() as cursor:
        executed = False
        for q in queries:
            cursor.execute(q)
            executed = True

        # Only poll once something has actually run: polling a cursor with no
        # executed statement is an error rather than "no result set".
        if executed and cursor.poll().hasResultSet:
            return cursor.fetchall()

    return None

In [None]:
# DESTRUCTIVE: recursively deletes the path in hdfs_archive_path (if present)
# and recreates it with a world-writable sparkcheckpoint subdirectory.
reset_dir(hdfs_archive_path)

In [None]:
# DESTRUCTIVE: recursively deletes the path in hdfs_hive_staging (if present)
# and recreates it with a world-writable sparkcheckpoint subdirectory.
reset_dir(hdfs_hive_staging)

## Create Hive Database

In [None]:
# Bootstrap connection: no `database` argument is passed here, so it can be
# used to create the target database before anything is bound to it.
hive_cnx = hive.Connection(
    host=hdfs_host,
    port=hive_port,
    username=hive_username,
    password=hive_password,
    auth=hive_mode,
)

In [None]:
# q = f"drop database if exists {hive_database} cascade "

# with hive_cnx.cursor() as cursor:
#     cursor.execute(q)

In [None]:
# DESTRUCTIVE: reset "<hdfs_hive_warehouse>/<hive_database>.db" before the
# database itself is (re)created by the following cell.
reset_dir(f"{hdfs_hive_warehouse}/{hive_database}.db")

In [None]:
# Create the target database through the current hive_cnx; the statement is a
# no-op when the database already exists (IF NOT EXISTS).
run_queries([f"create database if not exists {hive_database}"])

In [None]:
# Release the previous connection before rebinding the name; otherwise its
# server-side session and transport stay open until they time out.
hive_cnx.close()

# Reconnect with `database` set so that subsequent statements issued through
# run_queries() resolve unqualified table names inside hive_database.
hive_cnx = hive.Connection(
    host=hdfs_host,
    port=hive_port,
    username=hive_username,
    password=hive_password,
    auth=hive_mode,
    database=hive_database,
)

## Create tweets table

In [None]:
# Name of the table created by the DDL built further down.
table = "tweets"

# Column names: every entry of tweet_keys followed by two extra columns.
colNames = [*tweet_keys, "Sentiment", "n_words"]

# Column types: each class in tweet_types is instantiated and rendered with
# simpleString(); the two extra columns are both declared as int.
colTypes = [type_cls().simpleString() for type_cls in tweet_types] + ['int', 'int']


In [None]:
# Preview the (column name, column type) pairs as the cell output.
list(zip(colNames,colTypes))

In [None]:
# Build the "name type, name type, ..." column list used inside CREATE TABLE.
colCreate = ', '.join(' '.join(column) for column in zip(colNames, colTypes))
# Show the rendered column list as the cell output.
colCreate

In [None]:
# DESTRUCTIVE: recursively deletes the path in hdfs_hive_tweets (if present)
# and recreates it with a world-writable sparkcheckpoint subdirectory.
reset_dir(hdfs_hive_tweets)

In [None]:
# DDL for an external, Parquet-backed table named by `table`, partitioned by a
# string column `key` and stored at hdfs_hive_tweets. The pieces are joined by
# implicit string concatenation; spacing and the embedded newline are intentional.
q = (
    f"CREATE EXTERNAL TABLE IF NOT EXISTS {table} ({colCreate}) "
    "partitioned by (key string)\n "
    "STORED AS PARQUET "
    f"LOCATION  '{hdfs_hive_tweets}'"
)

# Disabled alternative, kept for reference:
# f"LOAD DATA INPATH '{hdfs_hive_staging}/*.parquet' \
# INTO TABLE tweets \
# partition by key "

# Echo the final statement so it can be inspected before it is executed.
print(q)

#TBLPROPERTIES ('avro.schema.url'='hdfs://$tempdir/avroSchema/$tbl.avsc') ;" 

In [None]:
# Execute the CREATE EXTERNAL TABLE statement held in q.
run_queries([q])

## Create Users Table

In [None]:
# Name of the main users table.
table = "users"

# must actually be "users_staging", but the aggregation strategy does not work (see below)
staging = "users"

# Column names: 'last_tweet_at' first, then user_keys without its last entry.
# NOTE(review): the dropped last entry is presumably the partition column,
# which the DDL declares separately -- confirm against common_vars.
colNames = ['last_tweet_at', *user_keys[:-1]]

# Column types mirror the names: a string type for 'last_tweet_at', then
# user_types; the last rendered type is dropped to match colNames.
colTypes = [
    type_cls().simpleString()
    for type_cls in [StringType, *user_types]
][:-1]

In [None]:
# Build the "name type, name type, ..." column list used inside CREATE TABLE.
colCreate = ', '.join(' '.join(column) for column in zip(colNames, colTypes))
# Show the rendered column list as the cell output.
colCreate

In [None]:
# DESTRUCTIVE: recursively deletes the path in hdfs_hive_users (if present)
# and recreates it with a world-writable sparkcheckpoint subdirectory.
reset_dir(hdfs_hive_users)

In [None]:
# DESTRUCTIVE: recursively deletes the path in hdfs_hive_users_staging (if
# present) and recreates it with a world-writable sparkcheckpoint subdirectory.
reset_dir(hdfs_hive_users_staging)

In [None]:
# Start the list of users-related statements with the DDL for the external,
# Parquet-backed table named by `table`, partitioned by the string column
# named in partitionCol and stored at hdfs_hive_users.
q1 = [
    (
        f"CREATE EXTERNAL TABLE IF NOT EXISTS {table} ({colCreate}) "
        f"partitioned by ({partitionCol} string) "
        "STORED AS PARQUET "
        f"LOCATION '{hdfs_hive_users}'"
    )
]

# Disabled alternative, kept for reference:
# f"LOAD DATA INPATH '{hdfs_hive_staging}/*.parquet' \
# INTO TABLE tweets \
# partition by key "

# Echo the statement list so it can be inspected before it is executed.
print(q1)

#TBLPROPERTIES ('avro.schema.url'='hdfs://$tempdir/avroSchema/$tbl.avsc') ;" 

In [None]:
# Append the DDL for the (managed, non-EXTERNAL) table named by `staging`,
# stored at hdfs_hive_users_staging. The list is extended in place, so running
# this cell more than once appends the same statement again.
q1.append(
    f"CREATE TABLE IF NOT EXISTS {staging} ({colCreate}) "
    f"partitioned by ({partitionCol} string) "
    "STORED AS PARQUET "
    f"LOCATION '{hdfs_hive_users_staging}'"
)

# Disabled alternative, kept for reference:
# f"LOAD DATA INPATH '{hdfs_hive_staging}/*.parquet' \
# INTO TABLE tweets \
# partition by key "

# Echo the full statement list so it can be inspected before it is executed.
print(q1)

#TBLPROPERTIES ('avro.schema.url'='hdfs://$tempdir/avroSchema/$tbl.avsc') ;" 

In [None]:
# Execute every statement collected in q1, in order, on one cursor.
run_queries(q1)