In [None]:
import csv
import random
import polars as pl
import pandas as pd
import sys
import os
from pydantic import BaseModel

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from csv_validator.validator import (
    CsvReader,
    LazyCsvReader,
    CsvValidator,
)


def create_benchmark(path="benchmark_1k.csv", n_rows=1000, seed=None):
    """Write a small three-column benchmark CSV.

    The file starts with the header ``a,b,c`` followed by ``n_rows`` data
    rows. For the row numbered ``k`` (counting from 1):

    * ``a`` is ``k``
    * ``b`` is the string ``"val<k>"``
    * ``c`` is a random float drawn from ``[1.0, 5.0]`` rounded to 2 decimals

    Args:
        path: Destination file; it is created or overwritten.
        n_rows: Number of data rows to write (header not included).
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible; when ``None`` the
            module-level ``random`` generator is used.
    """
    # Fall back to the shared module-level generator so that a caller who
    # seeded ``random`` globally keeps getting the same sequence.
    rng = random.Random(seed) if seed is not None else random

    with open(path, "w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["a", "b", "c"])
        # range() excludes its upper bound, hence the +1 to emit exactly
        # n_rows rows (1..n_rows inclusive).
        for row_id in range(1, n_rows + 1):
            writer.writerow(
                [row_id, f"val{row_id}", round(rng.uniform(1.0, 5.0), 2)]
            )


def create_complex_benchmark(
    path="benchmark_1m_complex.csv", n_rows=1_000_000, seed=None
):
    """Write a ten-column benchmark CSV.

    The file starts with the header ``a`` .. ``j`` followed by ``n_rows``
    data rows. For the row numbered ``k`` (counting from 1):

    * ``a`` is ``k``
    * ``b`` is the string ``"val<k>"``
    * ``c`` is a random float drawn from ``[1.0, 5.0]`` rounded to 2 decimals
    * ``d`` .. ``j`` are all the string ``"extra_val<k>"``

    Args:
        path: Destination file; it is created or overwritten.
        n_rows: Number of data rows to write (header not included).
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible; when ``None`` the
            module-level ``random`` generator is used.
    """
    # Fall back to the shared module-level generator so that a caller who
    # seeded ``random`` globally keeps getting the same sequence.
    rng = random.Random(seed) if seed is not None else random
    header = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
    filler_columns = len(header) - 3  # columns d..j share one value per row

    with open(path, "w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        for row_id in range(1, n_rows + 1):
            # Build the filler once from the integer row id. Column values are
            # deliberately NOT stored in variables named like the loop counter
            # or the file handle, so nothing gets rebound mid-row and every
            # filler column carries the same "extra_val<k>" text.
            filler = f"extra_val{row_id}"
            row = [row_id, f"val{row_id}", round(rng.uniform(1.0, 5.0), 2)]
            row.extend([filler] * filler_columns)
            writer.writerow(row)


class BenchmarkSchema(BaseModel):
    """Pydantic row model with the three fields ``a``, ``b`` and ``c``.

    The field names match the header row written by ``create_benchmark``
    and the first three columns written by ``create_complex_benchmark``.
    """

    a: int  # generators write the 1-based row counter here
    b: str  # generators write "val<row counter>" here
    c: float  # generators write a random value rounded to 2 decimals here


class ComplexBenchmarkSchema(BaseModel):
    """Pydantic row model with the ten fields ``a`` .. ``j``.

    The field names match the header row written by
    ``create_complex_benchmark``.

    NOTE(review): no validator in the visible cells is constructed with this
    schema; the benchmarks all pass ``BenchmarkSchema`` - confirm whether this
    model is still needed.
    """

    a: int  # generator writes the 1-based row counter here
    b: str  # generator writes "val<row counter>" here
    c: float  # generator writes a random value rounded to 2 decimals here
    # d..j are the string filler columns of the complex benchmark file.
    d: str
    e: str
    f: str
    g: str
    h: str
    i: str
    j: str

In [2]:
# CsvReader over the complex benchmark file.
# NOTE(review): no visible cell calls create_complex_benchmark(), so the CSV is
# presumably generated beforehand - confirm, or add a cell that creates it.
eager_reader = CsvReader("benchmark_1m_complex.csv")

In [3]:
# LazyCsvReader over the same complex benchmark file, so the two reader
# classes can be compared on identical input.
lazy_reader = LazyCsvReader("benchmark_1m_complex.csv")

In [None]:
# One validator per reader, both built with the same schema, so only the
# reader differs between the two timed runs.
# NOTE(review): both use the three-field BenchmarkSchema although the readers
# point at the ten-column complex file; ComplexBenchmarkSchema is never used in
# the visible cells - confirm this is intentional.
eager_v = CsvValidator(schema=BenchmarkSchema, reader=eager_reader)
lazy_v = CsvValidator(schema=BenchmarkSchema, reader=lazy_reader)

In [5]:
%%timeit
# Timed body: one validate_all() call on the CsvReader-backed validator.
eager_v.validate_all()

1.48 s ± 153 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%timeit
# Timed body: one validate_all() call on the LazyCsvReader-backed validator.
lazy_v.validate_all()

2.99 s ± 64.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
# Load the complex benchmark file once per backend so the timed cells below
# measure validation only, not file loading.
# NOTE(review): PolarsReader and PandasReader are not among the names imported
# from csv_validator.validator in the visible import cell, and this cell's
# execution count is out of order with its neighbours - it presumably relies
# on state from an earlier kernel session. Confirm it survives
# Restart & Run All.
filename = "benchmark_1m_complex.csv"
pl_reader = PolarsReader(pl.read_csv(filename))
pd_reader = PandasReader(pd.read_csv(filename))
csv_reader = CsvReader(filename)

In [7]:
%%timeit
# Timed body: validator construction plus one validate_all() call.
# NOTE(review): PandasValidator is not among the names imported from
# csv_validator.validator in the visible import cell; on a fresh kernel this
# would presumably raise NameError - confirm where the class comes from.
validator = PandasValidator(BenchmarkSchema, pd_reader)
validator.validate_all()

7.55 s ± 44.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit
# Timed body: validator construction plus one validate_all() call.
# NOTE(review): PolarsValidator is not among the names imported from
# csv_validator.validator in the visible import cell; on a fresh kernel this
# would presumably raise NameError - confirm where the class comes from.
validator = PolarsValidator(BenchmarkSchema, pl_reader)
validator.validate_all()

  
  
  
  
  
  
  
  


2.28 s ± 78.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
# Timed body: validator construction plus one validate_all() call.
# NOTE(review): SlowerIterValidator is not among the names imported from
# csv_validator.validator in the visible import cell; on a fresh kernel this
# would presumably raise NameError - confirm where the class comes from.
validator = SlowerIterValidator(BenchmarkSchema, csv_reader)
validator.validate_all()

1.66 s ± 72.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
%%timeit
# Timed body: validator construction plus one validate_all() call.
# NOTE(review): IterValidator is not among the names imported from
# csv_validator.validator in the visible import cell; on a fresh kernel this
# would presumably raise NameError - confirm where the class comes from.
validator = IterValidator(BenchmarkSchema, csv_reader)
validator.validate_all()

1.03 s ± 94.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
