# Arrow Interoperability with vroom-csv

vroom-csv implements the Arrow PyCapsule interface for zero-copy data exchange with PyArrow, Polars, and DuckDB.

In [None]:
import vroom_csv
import tempfile
import os

# Sample data mixing integer, string, float and boolean-looking columns so
# the later cells have several column types to work with.
csv_content = """id,name,age,salary,active
1,Alice,30,75000.50,true
2,Bob,25,82000.00,false
3,Charlie,35,68000.25,true
4,Diana,28,91000.75,true
5,Eve,32,85000.00,false
"""

# Scratch directory for the files this notebook writes; the cleanup cell at
# the end removes it again.
temp_dir = tempfile.mkdtemp()
csv_path = os.path.join(temp_dir, "data.csv")

# Pin the encoding so the bytes on disk do not depend on the platform's
# locale default.
with open(csv_path, "w", encoding="utf-8") as f:
    f.write(csv_content)

## Type Inference

vroom-csv automatically infers column types from the data.

In [None]:
# Parse the file without a dtype argument so column types are inferred.
table = vroom_csv.read_csv(csv_path)

# Report the column names and the row count, one per line.
print(f"Columns: {table.column_names}", f"Rows: {table.num_rows}", sep="\n")

## PyArrow Integration

Convert to PyArrow tables for advanced analytics and Arrow-based workflows.

In [None]:
import pyarrow as pa

# Hand the vroom-csv table to PyArrow; pa.table() picks it up through the
# Arrow PyCapsule interface described at the top of this notebook.
arrow_table = pa.table(table)

# Show the table itself, then its schema after a blank line.
print("PyArrow Table:", arrow_table, sep="\n")
print("\nSchema:", arrow_table.schema, sep="\n")

In [None]:
# Turn the Arrow table into a pandas DataFrame.
df = arrow_table.to_pandas()

# Show the frame, then the dtype pandas assigned to each column.
print("Pandas DataFrame:", df, sep="\n")
print("\nData types:", df.dtypes, sep="\n")

## Polars Integration

Convert the table to a Polars DataFrame for fast DataFrame operations; call `.lazy()` on the result if you want to use Polars' lazy evaluation engine.

In [None]:
import polars as pl

# Re-read the CSV and build a Polars DataFrame from the resulting Arrow data.
table = vroom_csv.read_csv(csv_path)
df = pl.from_arrow(table)

print("Polars DataFrame:", df, sep="\n")

In [None]:
# Keep the rows whose age exceeds 28, then project three columns.
is_over_28 = pl.col("age") > 28
result = df.filter(is_over_28).select(["name", "age", "salary"])

print("Filtered results (age > 28):", result, sep="\n")

## DuckDB Integration

Query CSV data with SQL using DuckDB.

In [None]:
import duckdb

# Read with vroom-csv
table = vroom_csv.read_csv(csv_path)

# DuckDB resolves an unknown table name in a query by looking for a Python
# variable with that name (a "replacement scan"). TABLE is a reserved SQL
# keyword, so `FROM table` is rejected by the parser before that lookup can
# happen; expose the data under a non-reserved name instead.
employees = table

# Active employees, highest salary first.
result = duckdb.query("""
    SELECT name, age, salary
    FROM employees
    WHERE active = true
    ORDER BY salary DESC
""")

print("DuckDB Query Results:")
print(result.fetchdf())

## Explicit Type Specification

You can override automatic type inference with the `dtype` parameter.

In [None]:
# Column name -> requested type; these override automatic inference.
column_types = {
    "id": "string",  # Keep as string instead of int
    "age": "int",
    "salary": "float",
    "active": "bool",
}
table = vroom_csv.read_csv(csv_path, dtype=column_types)

# Inspect the schema PyArrow sees for the explicitly typed table.
arrow_table = pa.table(table)
print("Schema with explicit types:", arrow_table.schema, sep="\n")

## Null Value Handling

In [None]:
# CSV whose "value" column contains a literal NA marker and an empty field.
null_content = """name,value
Alice,100
Bob,NA
Charlie,
Diana,200
"""

null_path = os.path.join(temp_dir, "nulls.csv")
# Explicit encoding keeps the written bytes independent of the platform locale.
with open(null_path, "w", encoding="utf-8") as f:
    f.write(null_content)

# Read with "NA" registered as a null marker and empty fields flagged as null.
table = vroom_csv.read_csv(
    null_path,
    null_values=["NA"],
    empty_is_null=True,
)

# Check null handling in PyArrow
arrow_table = pa.table(table)
print("Arrow table with nulls:")
print(arrow_table.to_pandas())

## Cleanup

In [None]:
import shutil
# Delete the scratch directory together with every file written into it.
shutil.rmtree(path=temp_dir)
print("Cleaned up temporary files.")