In [1]:
import re
import uuid
from bidict import bidict

In [2]:
# A bidict stores each pair once and exposes both directions:
# the mapping itself (symbol -> name) and its .inverse view (name -> symbol).
element_by_symbol = bidict({'H': 'hydrogen'})

# Present entries: membership is True and .get() returns the partner value.
print(
    "positive test:",
    "H" in element_by_symbol,
    "hydrogen" in element_by_symbol.inverse,
    element_by_symbol.get('H'),
    element_by_symbol.inverse.get('hydrogen'),
    sep="\n",
)

# Absent entries: membership is False and .get() falls back to None.
print(
    "negative test:",
    "He" in element_by_symbol,
    "helium" in element_by_symbol.inverse,
    element_by_symbol.get('He'),
    element_by_symbol.inverse.get('helium'),
    sep="\n",
)

positive test:
True
True
hydrogen
H
negative test:
False
False
None
None


In [17]:
def normalize_text(s, ignore_case=True, ignore_space=True,
                   ignore_non_ascii=True, strip_semi_colon_at_end=True):
    """Normalize text so that superficially different strings compare equal.

    The steps always run in this order: lower-casing, punctuation removal,
    whitespace collapsing, trimming, trailing-semicolon removal.

    Args:
      s: The value to normalize; it is converted with ``str()`` first.
      ignore_case: If True, lower-case the whole text.
      ignore_space: If True, replace every run of whitespace (spaces, tabs,
        newlines, ...) with a single space.
      ignore_non_ascii: If True, delete every character that is neither a
        word character nor whitespace, i.e. punctuation and symbols.
        Despite the flag's name, non-ASCII letters are kept, because the
        word-character class is Unicode-aware for ``str`` patterns. Note
        that this also deletes semicolons, wherever they occur.
      strip_semi_colon_at_end: If True, remove all semicolons at the end of
        the text, together with any whitespace between or before them.

    Returns:
      The normalized text, without leading or trailing whitespace.
    """
    text = str(s)

    if ignore_case:
        text = text.lower()

    if ignore_non_ascii:
        # Keeps letters/digits/underscore of any script; drops punctuation.
        text = re.sub(r"[^\w\s]", "", text)

    if ignore_space:
        # Done after punctuation removal so gaps left by deleted symbols
        # are collapsed as well.
        text = re.sub(r"\s+", " ", text)

    text = text.strip()
    if strip_semi_colon_at_end:
        # Loop rather than a single slice: inputs such as "select 1;;" or
        # "select 1; ;" would otherwise keep a stray terminator and fail to
        # match the same statement written with one semicolon.
        while text.endswith(";"):
            text = text[:-1].rstrip()

    return text

In [18]:
# Sample input mixing a doubled space, a tab, a newline, CJK characters and punctuation.
s = "This is  a string ~ 人工智能 ~ with \t extra spaces and 非 ascii punctuation, a newline.\n !!!"
# Defaults: lower-case, drop punctuation, collapse whitespace (CJK characters are kept).
normalize_text(s)

'this is a string 人工智能 with extra spaces and 非 ascii punctuation a newline'

In [19]:
# Keep case and punctuation; only runs of whitespace are collapsed to one space.
normalize_text(s, ignore_case=False, ignore_space=True, ignore_non_ascii=False)

'This is a string ~ 人工智能 ~ with extra spaces and 非 ascii punctuation, a newline. !!!'

In [20]:
# Module-level bidirectional map used by add_query, get_query_by_id and get_id_by_query.
# Forward direction: normalized SQL text -> ID; sql_index.inverse: ID -> normalized SQL text.
sql_index = bidict()         # key-SQL: value-ID

In [21]:
def add_query(sql_query, ignore_case=True, ignore_space=True,
              ignore_non_ascii=False, strip_semi_colon_at_end=True):
    """
    Register an SQL query in ``sql_index`` and return its ID.

    The query is normalized with ``normalize_text`` first, and the normalized
    text is what gets stored as the key. A query whose normalized text is
    already present is not added again; its existing ID is returned.

    Args:
      sql_query: The SQL query string.
      ignore_case, ignore_space, ignore_non_ascii, strip_semi_colon_at_end:
        Passed through unchanged to ``normalize_text``.

    Returns:
      The ID (a UUID4 rendered as a string) associated with the query.
    """
    key = normalize_text(
        sql_query,
        ignore_case=ignore_case,
        ignore_space=ignore_space,
        ignore_non_ascii=ignore_non_ascii,
        strip_semi_colon_at_end=strip_semi_colon_at_end,
    )

    # Already indexed: hand back the ID assigned earlier.
    try:
        return sql_index[key]
    except KeyError:
        pass

    # First sighting: mint a fresh ID and record the pair.
    new_id = str(uuid.uuid4())
    sql_index[key] = new_id
    return new_id

In [22]:
def get_query_by_id(unique_id):
    """
    Look up the stored SQL text for an ID via the inverse (ID -> SQL) view
    of ``sql_index``.

    Args:
      unique_id: The ID previously returned by ``add_query``.

    Returns:
      The normalized SQL text stored for that ID, or None if the ID is unknown.
    """
    id_to_query = sql_index.inverse
    return id_to_query.get(unique_id)

In [23]:
def get_id_by_query(query, ignore_case=True, ignore_space=True,
                    ignore_non_ascii=False, strip_semi_colon_at_end=True):
    """
    Look up the ID of an SQL query via the forward (SQL -> ID) mapping of
    ``sql_index``.

    The query does not need to be normalized by the caller: it is passed
    through ``normalize_text`` here before the lookup.

    Args:
      query: The SQL query string (raw or already normalized).
      ignore_case, ignore_space, ignore_non_ascii, strip_semi_colon_at_end:
        Passed through unchanged to ``normalize_text``.

    Returns:
      The ID stored for the normalized query, or None if it is not indexed.
    """
    normalization_options = dict(
        ignore_case=ignore_case,
        ignore_space=ignore_space,
        ignore_non_ascii=ignore_non_ascii,
        strip_semi_colon_at_end=strip_semi_colon_at_end,
    )
    lookup_key = normalize_text(query, **normalization_options)
    return sql_index.get(lookup_key)

In [24]:
# Example usage: register two different queries.
# The padding spaces and the trailing semicolon are removed by normalization
# before the text is stored in sql_index.
sql_query1 = " SELECT * FROM users WHERE id = 10; "
sql_query2 = " SELECT name, email FROM customers ORDER BY name ASC; "

unique_id1 = add_query(sql_query1)
unique_id2 = add_query(sql_query2)

In [25]:
# Show both IDs. They come from uuid.uuid4(), so the values differ on every fresh run.
unique_id1, unique_id2

('bc983740-ab6b-49e9-931f-c5f8e5783401',
 '498fed83-bd18-4d5c-89dc-5a1a67c2a116')

In [26]:
# ID -> SQL lookup; the result is the normalized text that was stored, not the original string.
get_query_by_id(unique_id1)

'select * from users where id = 10'

In [27]:
# Same ID -> SQL lookup for the second query.
get_query_by_id(unique_id2)

'select name, email from customers order by name asc'

In [28]:
# The same statement as sql_query2, but spread over several lines with extra
# indentation and whitespace before the semicolon.
sql3 = """
    SELECT 
        name, 
        email 
    FROM customers 
    ORDER BY name ASC
    ; 
"""
# ignore_non_ascii=False is already the default of get_id_by_query, so passing
# it explicitly does not change the result.
id3 = get_id_by_query(sql3, ignore_non_ascii=False)

In [29]:
# Expected to equal unique_id2: sql3 and sql_query2 normalize to the same text.
id3

'498fed83-bd18-4d5c-89dc-5a1a67c2a116'

In [30]:
# Show the normalized form of sql3; it matches the text stored for unique_id2.
normalize_text(sql3, ignore_non_ascii=False)

'select name, email from customers order by name asc'