In [None]:
import re
import json
import os
import utils

# Directory that holds the input schema.sql and receives the generated schema.json.
DATA_DIR = "data/imdb"

# Patterns are compiled once at import time rather than on every call.
_TABLE_RE = re.compile(
    r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?([^\s(]+)\s*\((.*?)\)\s*;",
    re.DOTALL | re.IGNORECASE,
)
_COLUMN_RE = re.compile(r"([a-zA-Z0-9_]+)\s+(.+)", re.DOTALL)
_FK_RE = re.compile(
    r"FOREIGN\s+KEY\s*\(([^)]+)\)\s*REFERENCES\s+([^\s(]+)\s*\(([^)]+)\)",
    re.IGNORECASE,
)
_NOT_NULL_RE = re.compile(r"\bNOT\s+NULL\b", re.IGNORECASE)

# Leading keywords that introduce a table-level constraint, not a column.
_CONSTRAINT_KEYWORDS = frozenset(
    {"FOREIGN", "PRIMARY", "UNIQUE", "CHECK", "CONSTRAINT", "EXCLUDE"}
)


def _split_top_level(body):
    """Split a CREATE TABLE body on commas outside parentheses and quotes.

    Keeps type arguments such as ``numeric(10,2)`` and column lists such as
    ``PRIMARY KEY (a, b)`` together. Returns the stripped, non-empty items.
    """
    items = []
    start = 0
    depth = 0
    in_quote = False
    for index, char in enumerate(body):
        if char == "'":
            # A doubled '' escape toggles twice, so the state stays correct.
            in_quote = not in_quote
        elif in_quote:
            continue
        elif char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
        elif char == "," and depth == 0:
            items.append(body[start:index])
            start = index + 1
    items.append(body[start:])
    return [item.strip() for item in items if item.strip()]


def _parse_column_definition(column_name, column_def):
    """Build the ``{"type", "length", "not-null"}`` record for one column.

    Raises:
        ValueError: if the type starts with ``character`` but is not a
            ``character varying`` form, or its length is not an integer.
    """
    parts = column_def.split()
    data_type = parts[0].lower()
    length = None

    if data_type == "character":
        if len(parts) < 2 or "varying" not in parts[1].lower():
            raise ValueError(
                "Unsupported type for column %r: %r "
                "(only 'character varying' is handled)" % (column_name, column_def)
            )

        # Default data type.
        data_type = "varchar"

        # Extract the length, if any.
        modifier = parts[1].lower()
        if "(" in modifier and ")" in column_def:
            length = modifier.split("(")[1].split(")")[0]
            if not length.isdigit():
                raise ValueError(
                    "Non-numeric length for column %r: %r" % (column_name, column_def)
                )
            # A one-character varying column is reported as a plain char.
            if int(length) == 1:
                data_type = "char"

    return {
        "type": data_type,
        "length": length,
        "not-null": bool(_NOT_NULL_RE.search(column_def)),
    }


def parse_schema(file_path) -> dict:
    """Parse the ``CREATE TABLE`` statements of a SQL file into a dict.

    Args:
        file_path: Path of the SQL schema file to read (decoded as UTF-8).

    Returns:
        A dict keyed by table name. Each value has two entries:

        * ``"columns"``: column name -> ``{"type", "length", "not-null"}``.
          ``type`` is the lower-cased first token of the definition, except
          that ``character varying`` becomes ``"varchar"`` (or ``"char"``
          when its length is 1). ``length`` is the digit string written in
          the schema, or ``None``.
        * ``"fks"``: foreign-key column -> ``{"table", "pk"}`` naming the
          referenced table and column.

    Raises:
        ValueError: if a ``character`` column is not a supported
            ``character varying`` form.

    Table-level constraint clauses (PRIMARY KEY, UNIQUE, CHECK, CONSTRAINT,
    FOREIGN KEY) are not reported as columns. Items that do not start with a
    plain identifier followed by a definition (e.g. quoted names) are skipped.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        schema_content = file.read()

    schema = {}

    for table_match in _TABLE_RE.finditer(schema_content):
        table_name = table_match.group(1)
        table_body = table_match.group(2)

        columns = {}
        for item in _split_top_level(table_body):
            column_match = _COLUMN_RE.match(item)
            if column_match is None:
                continue
            column_name = column_match.group(1)
            if column_name.upper() in _CONSTRAINT_KEYWORDS:
                continue
            columns[column_name] = _parse_column_definition(
                column_name, column_match.group(2)
            )

        # Foreign keys are searched over the whole body so that both bare and
        # "CONSTRAINT name FOREIGN KEY ..." forms are found.
        fks = {}
        for fk_match in _FK_RE.finditer(table_body):
            fk_column = fk_match.group(1).strip()
            fks[fk_column] = {
                "table": fk_match.group(2),
                "pk": fk_match.group(3).strip(),
            }

        schema[table_name] = {"columns": columns, "fks": fks}

    return schema

# Parse the SQL schema stored under DATA_DIR.
schema_file = os.path.join(DATA_DIR, "schema.sql")
schema_data = parse_schema(schema_file)

# Hand the parsed schema to utils.write_json, targeting schema.json in the
# same directory as the SQL source.
schema_json_file = os.path.join(DATA_DIR, "schema.json")
utils.write_json(schema_json_file, schema_data)