In [1]:
import json
import duckdb
import requests
import polars as pl
import pyiceberg as pi
import sqlalchemy as sa

from tqdm import tqdm
from faker import Faker
from minio import Minio
from unidecode import unidecode
from types import SimpleNamespace
from tempfile import NamedTemporaryFile
from pyiceberg.catalog import load_catalog

In [13]:
from minio import Minio

def get_storage(host: str):
    """Create a MinIO client pointed at `host`.

    Args:
        host: Endpoint passed to `Minio` as `endpoint` (no scheme prefix is
            added here).

    Returns:
        The `Minio` client instance. It is built with the hard-coded
        "admin"/"password" credentials and `secure=False`.
    """
    return Minio(
        endpoint=host,
        access_key="admin",
        secret_key="password",
        secure=False,
    )

def get_storage_options(host: str, fs: bool=False):
    """Build a dict of S3-style connection options for the store at `host`.

    Args:
        host: Host (and optional port) of the object store; "http://" is
            prepended to form the endpoint URL.
        fs: Selects which option layout is returned.
            Truthy -> "key" / "secret" / "client_kwargs" layout
            (presumably the s3fs/fsspec convention -- TODO confirm with callers).
            Falsy  -> flat "aws_*" / "region_name" layout.

    Returns:
        A new dict with hard-coded "admin"/"password" credentials and the
        "us-east-1" region.
    """
    endpoint = "http://" + host

    if not fs:
        # Flat layout with aws_-prefixed keys.
        return {
            "aws_access_key_id": "admin",
            "aws_secret_access_key": "password",
            "aws_endpoint_url": endpoint,
            "region_name": "us-east-1",
        }

    # Nested layout: connection details live under "client_kwargs".
    client_kwargs = {
        "region_name": "us-east-1",
        "endpoint_url": endpoint,
        "verify": False,
    }
    return {
        "key": "admin",
        "secret": "password",
        "client_kwargs": client_kwargs,
    }


# Expose the options builder as an attribute of get_storage, so it can be
# called as get_storage.options(host, fs=...).
get_storage.options = get_storage_options

In [11]:
import psycopg2
import sqlalchemy as sa

def get_database(host: str):
    """Create a SQLAlchemy engine for the `ecommerce` Postgres database.

    Args:
        host: Database host name, interpolated into the connection URL as-is.

    Returns:
        The object returned by `sa.create_engine` for
        `postgresql://admin:password@<host>:5432/ecommerce`.
    """
    return sa.create_engine(f"postgresql://admin:password@{host}:5432/ecommerce")
    
def get_database_url(host: str, autocommit=False):
    """Return the Postgres connection URL for the `ecommerce` database.

    Args:
        host: Database host name, interpolated into the URL as-is.
        autocommit: Accepted but not used by this function.

    Returns:
        The string `postgresql://admin:password@<host>:5432/ecommerce`.
    """
    url = f"postgresql://admin:password@{host}:5432/ecommerce"
    return url

def get_database_pg(host: str, autocommit=False):
    """Open a psycopg2 connection to the `ecommerce` database.

    Args:
        host: Host name; ".io" is appended before connecting.
        autocommit: Value assigned to the connection's `autocommit` attribute.

    Returns:
        The connection object returned by `psycopg2.connect`.
    """
    # NOTE(review): unlike get_database / get_database_url, this helper appends
    # ".io" to the host -- confirm that the difference is intended.
    params = {
        "host": f"{host}.io",
        "port": 5432,
        "user": "admin",
        "password": "password",
        "dbname": "ecommerce",
    }
    connection = psycopg2.connect(**params)
    connection.autocommit = autocommit
    return connection

# Attach the alternative helpers to get_database as function attributes, so
# they can be called as get_database.pg(...) and get_database.url(...).
get_database.pg = get_database_pg
get_database.url = get_database_url

In [2]:
from pymongo import MongoClient

def get_document(host: str):
    """Create a MongoDB client for the server at `host`.

    Args:
        host: MongoDB host name, interpolated into the connection URI as-is.

    Returns:
        The `MongoClient` built from
        `mongodb://admin:password@<host>:27017/admin`.
    """
    uri = f"mongodb://admin:password@{host}:27017/admin"
    return MongoClient(uri)

In [6]:
from trino.dbapi import connect

def get_trino(catalog: str, schema: str="public"):
    """Open a Trino DB-API connection for the given catalog and schema.

    Args:
        catalog: Trino catalog the connection should use.
        schema: Schema within the catalog; defaults to "public".

    Returns:
        The connection object returned by `trino.dbapi.connect`, targeting
        host "trino.sql" on port 80 as user "admin".
    """
    # Forward the caller's schema. It was previously ignored in favor of a
    # hard-coded "public", so a non-default schema argument had no effect.
    return connect(
        host="trino.sql",
        port=80,
        user="admin",
        catalog=catalog,
        schema=schema,
    )

In [12]:
# Jupyter Notebook: 192.168.1.88
# Spark Web UI: 192.168.1.88:4040
from pyspark.sql import SparkSession

def get_spark(mode: str = "local", catalog: str = "", storage: str = ""):
    """Build (or fetch) a SparkSession named "notebook".

    Three independent groups of settings are layered onto the builder, in
    this order, before `getOrCreate()` is called:

    Args:
        mode: When equal to "client", the Kubernetes master URL and the
            driver/executor settings are applied. Any other value leaves the
            master unset.
        catalog: When truthy, the Iceberg REST catalog settings are applied.
            Only its truthiness is used; the catalog is always registered
            under the name "iceberg"
            (NOTE(review): confirm the value is meant to be a flag only).
        storage: When truthy, the S3A filesystem settings are applied with
            `http://<storage>` as the endpoint.

    Returns:
        The session returned by `SparkSession.builder...getOrCreate()`.
    """
    builder = SparkSession.builder.appName("notebook")

    if mode == "client":
        # The master is set before any config, matching the original call order.
        builder = builder.master("k8s://https://kubernetes.default.svc.cluster.local:443")
        client_settings = [
            ("spark.driver.memory", "2G"),
            ("spark.executor.cores", "1"),
            ("spark.executor.memory", "2G"),
            ("spark.executor.instances", "2"),
            ("spark.kubernetes.namespace", "spark"),
            ("spark.kubernetes.container.image", "registry.io/spark"),
            ("spark.submit.deployMode", "client"),
            ("spark.driver.bindAddress", "0.0.0.0"),
            ("spark.driver.host", "notebook-headless.io"),
        ]
        for key, value in client_settings:
            builder = builder.config(key, value)

    if catalog:
        iceberg_settings = [
            ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"),
            ("spark.sql.defaultCatalog", "iceberg"),
            ("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"),
            ("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog"),
            ("spark.sql.catalog.iceberg.type", "rest"),
            ("spark.sql.catalog.iceberg.uri", "http://iceberg.io"),
            ("spark.sql.catalog.iceberg.warehouse", "s3://ecommerce/iceberg/"),
            ("spark.sql.catalog.iceberg.client.region", "us-east-1"),
            ("spark.sql.catalog.iceberg.s3.access-key-id", "admin"),
            ("spark.sql.catalog.iceberg.s3.secret-access-key", "password"),
            ("spark.sql.catalog.iceberg.s3.endpoint", "http://lakehouse.io"),
            ("spark.sql.catalog.iceberg.s3.path-style-access", "true"),
        ]
        for key, value in iceberg_settings:
            builder = builder.config(key, value)

    if storage:
        s3a_settings = [
            ("spark.hadoop.fs.s3a.access.key", "admin"),
            ("spark.hadoop.fs.s3a.secret.key", "password"),
            ("spark.hadoop.fs.s3a.endpoint", f"http://{storage}"),
            ("spark.hadoop.fs.s3a.endpoint.region", "us-east-1"),
            ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
            ("spark.hadoop.fs.s3a.path.style.access", "true"),
            ("spark.hadoop.fs.s3a.connection.ssl.enabled", "false"),
            ("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"),
        ]
        for key, value in s3a_settings:
            builder = builder.config(key, value)

    return builder.getOrCreate()