In [None]:
import sqlite3
from pathlib import Path, PosixPath
from typing import NamedTuple
from dotenv import dotenv_values

In [None]:
def get_env_values() -> dict[str, str|None]:
    return dotenv_values()

In [None]:
env_values: dict[str, str|None] = get_env_values()

# NamedTuple type hint
class ParametersType(NamedTuple):
    data_dir: PosixPath # Platform neutral pathlib PosixPath to data directory
    acs_path: PosixPath # Platform neutral pathlib PosixPath to ACS data
    db_path: PosixPath # Platform neutral pathlib PosixPath to SQLite3 database
    db_connection: sqlite3.Connection # SQLite3 database connection
    openai_api_key: str # OpenAI API key
    huggingfacehub_api_token: str # HuggingFace API token

Parameters: ParametersType = ParametersType(
    data_dir = Path.cwd() / "Data",
    acs_path = Path.cwd() / "Data/ACS_2012_21.csv",
    db_path= Path.cwd() / "Data/data.sqlite3",
    db_connection = sqlite3.connect(Path.cwd() / "Data/data.sqlite3"),  # ":memory:", "Data/data.sqlite3", "Data/acs.sqlite3"
    openai_api_key = env_values["OPENAI_API_KEY"],
    huggingfacehub_api_token = env_values["HUGGINGFACEHUB_API_TOKEN"],
)

In [None]:
class DatabaseInfoExtractor:
    """
    A class for extracting information about tables and columns from a SQLite database.

    Attributes:
    - db_path (str): The path to the SQLite database file.
    - conn (sqlite3.Connection): The connection object to the database.
    - cursor (sqlite3.Cursor): The cursor object for executing SQL queries.

    Methods:
    - extract_info() -> dict[str, dict[str, list[str]]]: Returns a dictionary containing information about each table in the database.
    """
    def __init__(self, db_path: PosixPath) -> None:
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()

    def extract_info(self) -> dict[str, dict[str, list[str]]]:
        tables: dict[str, str] = {}
        self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        table_names: list[str] = [row[0] for row in self.cursor.fetchall()]
        for table_name in table_names:
            columns: list[str] = []
            column_types: list[str] = []
            self.cursor.execute(f"PRAGMA table_info({table_name})")
            for row in self.cursor.fetchall():
                columns.append(row[1])
                column_types.append(row[2])
            tables[table_name] = {"columns": columns, "column_types": column_types}
        return tables

In [None]:
# Instantiate DatabaseInfoExtractor
db_info_extractor = DatabaseInfoExtractor(Parameters.db_path)

# Extract information about the database
db_info = db_info_extractor.extract_info()
print(db_info)

In [1]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

# Install punkt and stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/gozer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/gozer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gozer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def find_synonym(word: str) -> str:
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    if len(synonyms) > 0:
        return synonyms[0]
    else:
        return "No synonym found"

In [3]:
# Sample sentence
# sentence = "This is a sample sentence, showing off the stop words filtration."
# sentence = "How many geographic areas are in the acs table?"
sentence = "How many five to nine year olds are in Alabama in 2021?"

# Tokenize the sentence
tokens = word_tokenize(sentence)

# Remove stop words and punctuation
stop_words = set(stopwords.words('english'))
filtered_tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
print(filtered_tokens)

# Join the filtered tokens back into a sentence
# filtered_sentence = ' '.join(filtered_tokens)
# print(filtered_sentence)


['many', 'five', 'nine', 'year', 'olds', 'alabama', '2021']


In [4]:
for w in filtered_tokens:
    print(f"{w=} - {find_synonym(w)=}")

w='many' - find_synonym(w)='many'
w='five' - find_synonym(w)='five'
w='nine' - find_synonym(w)='nine'
w='year' - find_synonym(w)='year'
w='olds' - find_synonym(w)='old'
w='alabama' - find_synonym(w)='Alabama'
w='2021' - find_synonym(w)='No synonym found'
