In [125]:
from sqlalchemy import create_engine, inspect
from sqlalchemy.types import *
import re

In [61]:
type_mappings = {
    "postgres": {
        "serial": "INT",
        "bigserial": "BIGINT",
        "smallint": "SMALLINT",
        "integer": "INT",
        "bigint": "BIGINT",
        "boolean": "BOOLEAN",
        "decimal": "DECIMAL",
        "numeric": "DECIMAL",
        "real": "FLOAT",
        "double precision": "DOUBLE",
        "timestamp": "TIMESTAMP",
        "date": "DATE",
        "time": "STRING",
        "character varying": "STRING",
        "text": "STRING",
    },
    "oracle": {
        "number": "DECIMAL",
        "varchar2": "STRING",
        "char": "STRING",
        "nvarchar2": "STRING",
        "date": "DATE",
        "timestamp": "TIMESTAMP",
        "clob": "STRING",
        "blob": "BINARY",
        "float": "FLOAT",
        "long": "BIGINT",
    },
    "mysql": {
        "tinyint": "TINYINT",
        "smallint": "SMALLINT",
        "mediumint": "INT",
        "int": "INT",
        "bigint": "BIGINT",
        "decimal": "DECIMAL",
        "float": "FLOAT",
        "double": "DOUBLE",
        "date": "DATE",
        "datetime": "TIMESTAMP",
        "timestamp": "TIMESTAMP",
        "varchar": "STRING",
        "char": "STRING",
        "text": "STRING",
    }
}

In [234]:

def get_hive_type(db_type , column_type):
   
    match = re.match(r"\w+\((\d+),\s*(\d+)\)",column_type)
    if match:
        type_name = match.group().split('(')[0]
        precision = int(match.groups()[0])
        scale = int(match.groups()[1])
        if precision > 38:
            precision = 38
        if (precision >0 and precision <= 38) and (scale >=0 and scale <= precision):
            return f"DECIMAL({precision},{scale})"
    
    if db_type in type_mappings and column_type in type_mappings[db_type]:
        return type_mappings[db_type][column_type]
    
    
    return 'STRING'  

def convert_schema_to_hive(engine, inspector, db_schema, db_type):
    
    schema = {}
    
    for table_name in inspector.get_table_names(db_schema):
        columns = []
        for column in inspector.get_columns(table_name = table_name, schema = db_schema):
            hive_type = get_hive_type(db_type,str(column['type']).lower())
            columns.append((column['name'], hive_type))
        schema[table_name] = columns

    return schema

def generate_sql_ddl(hive_schema, schema_name, table_name, location = '/staging/ois' , stored_as = 'PARQUET'):
    ddl = f"CREATE EXTERNAL TABLE IF NOT EXISTS {schema_name}.{table_name} (\n"
    cols= []
    for col_nm,col_type in hive_schema[table_name]:
        cols.append(f"{col_nm} {col_type}")
        
    ddl += "    "
    ddl += ",\n    ".join(cols)
    ddl += "\n)\n"
    ddl += f"STORED AS {stored_as}\n"
    ddl += f"LOCATION '{location}';\n\n"
    return ddl


In [72]:
conn_string = "postgresql://postgres:P%40ssw0rdsit@192.168.170.13:5432/test"
engine = create_engine(conn_string)
inspector = inspect(engine)

 
hive_schema = convert_schema_to_hive(engine, inspector, inspector.get_schema_names()[0],'postgres')
ddl = generate_sql_ddl(hive_schema, 'default','WaterPump','/user/bdaadmin')

In [85]:
for table_name in inspector.get_table_names('OIS'):
    print(f"TABLE: {table_name}")
    for count, column in enumerate(inspector.get_columns(table_name=table_name, schema ='OIS')):
        
        print(f"NAME : {column['name']}   OLD DATA TYPE : {column['type']}   ----    HIVE DATA TYPE : {hive_schema[table_name][count][1]}")

TABLE: WaterPump
NAME : yyyymmdd   OLD DATA TYPE : BIGINT   ----    HIVE DATA TYPE : BIGINT
NAME : office_wwcode   OLD DATA TYPE : BIGINT   ----    HIVE DATA TYPE : BIGINT
NAME : office_mcode   OLD DATA TYPE : BIGINT   ----    HIVE DATA TYPE : BIGINT
NAME : water_type_code   OLD DATA TYPE : TEXT   ----    HIVE DATA TYPE : STRING
NAME : water_type_name   OLD DATA TYPE : TEXT   ----    HIVE DATA TYPE : STRING
NAME : water_id   OLD DATA TYPE : BIGINT   ----    HIVE DATA TYPE : BIGINT
NAME : water_name   OLD DATA TYPE : TEXT   ----    HIVE DATA TYPE : STRING
NAME : plant_id   OLD DATA TYPE : BIGINT   ----    HIVE DATA TYPE : BIGINT
NAME : plant_name   OLD DATA TYPE : TEXT   ----    HIVE DATA TYPE : STRING
NAME : untreated_water   OLD DATA TYPE : DOUBLE PRECISION   ----    HIVE DATA TYPE : DOUBLE
NAME : menu_water   OLD DATA TYPE : DOUBLE PRECISION   ----    HIVE DATA TYPE : DOUBLE
TABLE: WaterLevel
NAME : office_wwcode   OLD DATA TYPE : BIGINT   ----    HIVE DATA TYPE : BIGINT
NAME : offic

In [106]:
print(generate_sql_ddl(hive_schema, 'default','WaterPump','/user/bdaadmin'))

CREATE EXTERNAL TABLE IF NOT EXISTS default.WaterPump (
yyyymmdd BIGINT,
office_wwcode BIGINT,
office_mcode BIGINT,
water_type_code STRING,
water_type_name STRING,
water_id BIGINT,
water_name STRING,
plant_id BIGINT,
plant_name STRING,
untreated_water DOUBLE,
menu_water DOUBLE
)
STORED AS Parquet
LOCATION '/user/bdaadmin';




In [124]:
schema_hive_all = {}
ddl = ''
for schema in inspector.get_schema_names():
    schema_hive_all[schema] =convert_schema_to_hive(engine, inspector, schema,'postgres')
    
for schema in schema_hive_all:
    for table in schema_hive_all[schema]:
        print(generate_sql_ddl(schema_hive_all[schema], schema, table))

CREATE EXTERNAL TABLE IF NOT EXISTS OIS.WaterPump (
yyyymmdd BIGINT,
office_wwcode BIGINT,
office_mcode BIGINT,
water_type_code STRING,
water_type_name STRING,
water_id BIGINT,
water_name STRING,
plant_id BIGINT,
plant_name STRING,
untreated_water DOUBLE,
menu_water DOUBLE
)
STORED AS Parquet
LOCATION '/staging/OIS';


CREATE EXTERNAL TABLE IF NOT EXISTS OIS.WaterLevel (
office_wwcode BIGINT,
office_id BIGINT,
office_showname STRING,
plant_id BIGINT,
plant_name STRING,
water_id BIGINT,
water_name STRING,
machine_id BIGINT,
machine_code STRING,
category_code STRING,
category_name STRING,
category_style STRING,
group_code DOUBLE,
group_neme STRING,
untreated_date BIGINT,
volume DOUBLE,
level DOUBLE
)
STORED AS Parquet
LOCATION '/staging/OIS';


CREATE EXTERNAL TABLE IF NOT EXISTS OIS.CustomerM09 (
yyyymmdd BIGINT,
office_wwcode BIGINT,
sub_mcode BIGINT,
object_code BIGINT,
size_code BIGINT,
total_customer DOUBLE,
total_home DOUBLE,
total_gov DOUBLE,
total_small DOUBLE,
total_state DOUBLE

In [236]:
conn_string = "postgresql://postgres:P%40ssw0rdsit@192.168.170.13:5432/test1"
engine = create_engine(conn_string)
inspector = inspect(engine)

hive_schema = convert_schema_to_hive(engine, inspector, 'public','postgres')

In [237]:
# Test for DECIMAL(X,X)

In [238]:
hive_schema

{'employee': [('id', 'INT'), ('name', 'STRING'), ('salary', 'DECIMAL(10,2)')]}

In [239]:
inspector.get_columns(table_name = 'employee',schema='public')

[{'name': 'id',
  'type': INTEGER(),
  'nullable': False,
  'default': 'nextval(\'"public".employee_id_seq\'::regclass)',
  'autoincrement': True,
  'comment': None},
 {'name': 'name',
  'type': VARCHAR(length=50),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'salary',
  'type': NUMERIC(precision=10, scale=2),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None}]

In [218]:
print(generate_sql_ddl(hive_schema, 'public', 'employee', location = '/staging/ois' , stored_as = 'PARQUET'))

CREATE EXTERNAL TABLE IF NOT EXISTS public.employee (
    id INT,
    name STRING,
    salary DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION '/staging/ois';


