In [2]:
import pandas as pd
import pyarrow.parquet as pq
import sqlite3

In [None]:
# One-off load of the parquet dictionary into the "dictionary" table of
# dictionary.db, keeping only the "word" and "definition" columns.
# NOTE(review): Database.populate writes to the same "dictionary" table but
# with an additional "id" column; appending both shapes to one database file
# will not produce a consistent schema -- run only one of the two loaders.
pf = pq.ParquetFile("../data.parquet")
conn = sqlite3.connect("dictionary.db")
try:
    # Stream the file in small batches so the whole dataset is never held in
    # memory at once.
    for batch in pf.iter_batches(batch_size=100):
        df = batch.to_pandas()
        df[["word", "definition"]].to_sql("dictionary", conn, if_exists="append", index=False)
    # Flush anything still pending before the connection is released.
    conn.commit()
finally:
    # Always release the file handle, even if a batch fails half-way; the
    # original cell left this connection open for the life of the kernel.
    conn.close()

In [13]:
class WordNotFoundException(Exception):
    """Signals that a requested word has no entry in the dictionary.

    The formatted text is available both as ``str(exc)`` and as
    ``exc.message``.
    """

    def __init__(self, word):
        text = f"The word '{word}' does not exist in the dictionary."
        super().__init__(text)
        self.message = text

class DatabaseFailureException(Exception):
    """Signals that a database operation failed.

    ``error_message`` is embedded after a fixed "Database failure: " prefix;
    the result is available both as ``str(exc)`` and as ``exc.message``.
    """

    def __init__(self, error_message):
        text = f"Database failure: {error_message}"
        super().__init__(text)
        self.message = text

class Database:
    """Small wrapper around a SQLite database holding a ``dictionary`` table.

    The table is expected to have the columns ``id``, ``word`` and
    ``definition``. Rows are fetched as ``sqlite3.Row`` so columns can be
    read by name. Instances can be used as context managers to guarantee the
    connection is closed.
    """

    def __init__(self, database_name):
        """Open a connection to the SQLite database ``database_name``."""
        self.db = sqlite3.connect(database_name)
        self.db.row_factory = sqlite3.Row

    def close(self):
        """Close the underlying SQLite connection."""
        self.db.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _next_id(self):
        """Return the first unused id: ``MAX(id) + 1``, or 0 for an empty or missing table."""
        try:
            row = self.db.execute("SELECT MAX(id) FROM dictionary").fetchone()
        except sqlite3.OperationalError:
            # Table (or its id column) does not exist yet; start from scratch.
            return 0
        highest = row[0]
        return 0 if highest is None else int(highest) + 1

    def populate(self, dataset_name, if_exists="append"):
        """Load ``word``/``definition`` rows from a parquet file into ``dictionary``.

        Each row receives a sequential integer ``id``.

        Args:
            dataset_name: Path of the parquet file to read.
            if_exists: ``"append"`` (default) adds the rows to any existing
                table, numbering them after the current highest id so ids
                stay unique across repeated calls. ``"replace"`` drops the
                existing table first and numbers rows from 0.

        Raises:
            ValueError: If ``if_exists`` is neither ``"append"`` nor ``"replace"``.
        """
        if if_exists not in ("append", "replace"):
            raise ValueError(f"if_exists must be 'append' or 'replace', got {if_exists!r}")

        current_id = 0 if if_exists == "replace" else self._next_id()
        # Only the first write may replace the table; later batches must append
        # to what the first batch created.
        write_mode = if_exists

        pf = pq.ParquetFile(dataset_name)
        # Ask the reader for just the two columns we store, and stream in
        # batches so the whole file is never in memory at once.
        for batch in pf.iter_batches(batch_size=1000, columns=["word", "definition"]):
            # .copy() gives us a frame we own before adding the id column.
            df = batch.to_pandas()[["word", "definition"]].copy()
            df.insert(0, "id", range(current_id, current_id + len(df)))
            df.to_sql("dictionary", self.db, if_exists=write_mode, index=False)
            write_mode = "append"
            current_id += len(df)

    def word_exists(self, word):
        """Return True if at least one row has ``word`` as its word.

        Raises:
            DatabaseFailureException: If the query fails.
        """
        try:
            query = "SELECT 1 FROM dictionary WHERE word = ? LIMIT 1"
            cursor = self.db.execute(query, (word,))
            return cursor.fetchone() is not None
        except Exception as e:
            raise DatabaseFailureException(str(e)) from e

    def get_id(self, word):
        """Return the id of the first row whose word equals ``word``.

        Raises:
            WordNotFoundException: If no row matches ``word``.
            DatabaseFailureException: If the query fails.
        """
        try:
            query = "SELECT id FROM dictionary WHERE word = ?"
            result = self.db.execute(query, (word,)).fetchone()
        except Exception as e:
            raise DatabaseFailureException(str(e)) from e

        # A single query answers both "does it exist?" and "what is its id?".
        # Raised outside the try block so it is not rewrapped as a database failure.
        if result is None:
            raise WordNotFoundException(word)
        return result["id"]

    def get_dictionary_records(self, ids):
        """Return the rows whose id is in ``ids`` as a list of dicts.

        Args:
            ids: Any iterable of ids (list, tuple, set, generator, ...).

        Returns:
            A list of ``{"id": ..., "word": ..., "definition": ...}`` dicts,
            one per matching row. Ids with no matching row are skipped.

        Raises:
            DatabaseFailureException: If building or running the query fails.
        """
        try:
            # Materialise once: ids is walked twice (placeholders + binding),
            # which would exhaust a generator, and sqlite3 needs a sequence.
            id_params = tuple(ids)
            placeholder = ",".join("?" for _ in id_params)
            query = f"SELECT id, word, definition FROM dictionary WHERE id IN ({placeholder})"
            cursor = self.db.execute(query, id_params)
            return [dict(row) for row in cursor.fetchall()]
        except Exception as e:
            raise DatabaseFailureException(str(e)) from e

In [16]:
# Open dictionary.db through the Database wrapper and load the parquet data.
# NOTE(review): populate appends to the existing "dictionary" table, so
# re-running this cell against an already-populated dictionary.db adds the
# same rows again -- delete the file first for a clean reload.
db = Database("dictionary.db")
db.populate("../data.parquet")

In [17]:
# Spot-check the load: fetch a few records by id and display them.
db.get_dictionary_records([4, 1000, 2024])

[{'id': 4,
  'word': 'abalone',
  'definition': 'A type of marine mollusk known for its shell.'},
 {'id': 1000,
  'word': 'ancient',
  'definition': 'Belonging to the very distant past.'},
 {'id': 2024,
  'word': 'barbel',
  'definition': 'A type of fish with elongated whiskers.'}]