# Polars DataFrame

1. Create DataFrame
2. Fake Dataset
3. Dataset Exploration
4. Read & Write (I/O)
5. I/O Pitfalls
 
Let's start.

## Create DataFrame

In [None]:
import polars as pl

# build a DataFrame from a dict: keys are the column names, values the column data
df = pl.DataFrame(
    data={
        "numbers": [2, 3, 4, 5, 6, 7],
        "characters": ["f", "g", "h", "i", "j", "k"],
    }
)

# note: unlike pandas, a Polars DataFrame has no index
df

In [None]:
# outside a notebook (plain REPL or script) use print() to get the text table
print(df)

In [None]:
# fails on purpose: values inside one column must share a single type
# ("numbers" mixes int and str here)
pl.DataFrame(
    data={
        "numbers": [2, "Not a Number", 4, 5, 6, 7],
        "characters": ["f", "g", "h", "i", "j", "k"],
    }
)

In [None]:
# fails on purpose: all columns must have the same length
# ("numbers" has 6 values, "characters" only 2)
pl.DataFrame(
    data={
        "numbers": [2, 3, 4, 5, 6, 7],
        "characters": ["f", "g"],
    }
)

In [None]:
# a DataFrame can be built from several different input layouts

# here: data = list[tuple], one tuple per row
df_2 = pl.DataFrame(
    data=[
        (2, "f"),
        (3, "g"),
        (4, "h"),
        (5, "i"),
        (6, "j"),
        (7, "k"),
    ],
    schema=["numbers", "characters"],  # column names, in tuple order
    orient="row",  # interpret each tuple as a row rather than a column
)
df_2

In [None]:
# here: data = list[pl.Series]; each named Series becomes one column
s1 = pl.Series("numbers", [2, 3, 4, 5, 6, 7])
s2 = pl.Series("characters", ["f", "g", "h", "i", "j", "k"])

df_3 = pl.DataFrame([s1, s2])
df_3

## Fake Dataset

In [None]:
import datetime as dt
import random
# fixed seed so the fake dataset below is identical on every run
random.seed(42)

def create_coffee_sales_data(n_rows: int = 1000) -> list[tuple]:
    """Generate fake coffee-shop sales rows.

    Each row is a ``(drink, price, timestamp)`` tuple. The drink is picked at
    random from a fixed menu, the price is looked up for that drink, and the
    timestamp is midnight of a random day between 2025-08-01 and 2025-09-30
    (both ends included).

    Randomness comes from the module-level ``random`` generator, so seed it
    beforehand to get reproducible output.

    Args:
        n_rows: Number of rows to generate.

    Returns:
        A list of ``n_rows`` tuples, in the order they were generated.
    """
    # menu: drink name -> unit price (key order is the order random.choice sees)
    menu = {
        "cappuccino": 3.5,
        "water": 2.0,
        "espresso": 3.0,
        "tea": 2.5,
        "lemonade": 3.0,
    }
    drink_names = list(menu)

    # sales window; offsets are whole days counted from the first day
    first_day = dt.datetime(2025, 8, 1)
    last_day = dt.datetime(2025, 9, 30)
    max_offset = (last_day - first_day).days

    def make_row() -> tuple:
        # draw order matters: drink first, then the day offset, so the
        # sequence of random numbers consumed per row stays the same
        name = random.choice(drink_names)
        offset = random.randint(0, max_offset)
        return name, menu[name], first_day + dt.timedelta(days=offset)

    return [make_row() for _ in range(n_rows)]

In [None]:
# generate 1,000 fake sales rows
data = create_coffee_sales_data(n_rows=1_000)
# peek at the first row: a (drink, price, timestamp) tuple
data[0]

In [None]:
# create a Polars DataFrame from the row tuples; schema names the tuple fields
coffee_sales = pl.DataFrame(data, schema=["drink", "price", "timestamp"], orient='row')

# displaying a long frame shows only its first and last rows
coffee_sales

## Dataset Exploration

In [None]:
# Polars data types: the dtype of every column is printed below its name
# in the table header
# full overview:
# https://docs.pola.rs/user-guide/concepts/data-types-and-structures/#data-types-internals
coffee_sales

In [None]:
# show the first few rows (head() defaults to 5)
# same idea as pandas' DataFrame.head()
first_elements = coffee_sales.head()
first_elements

In [None]:
# show the last 3 rows; tail() takes the number of rows as its argument
last_elements = coffee_sales.tail(3)
last_elements

In [None]:
# glimpse(): one line per column (name, dtype, first values) - handy for
# frames with many columns in a REPL; here limited to 2 values per column
coffee_sales.glimpse(max_items_per_column=2)

### DataFrame attributes

In [None]:
# column names as a list of strings
coffee_sales.columns

In [None]:
# data types of the columns, in column order
coffee_sales.dtypes

In [None]:
# schema: mapping of column name -> data type
schema = coffee_sales.schema
schema

In [None]:
# look up the data type of a single column by its name
schema["drink"]

In [None]:
# shape is a tuple: (number of rows, number of columns)
coffee_sales.shape

In [None]:
# quickest overview: summary statistics per column
# (count, null count, mean, std, min, percentiles, max)
# similar to pandas' DataFrame.describe()
coffee_sales.describe()

## Read & Write (I/O)

In [None]:
# the frame that the following cells write to disk
coffee_sales

In [None]:
# write the frame to a CSV file (relative path -> current working directory)
csv_path =  "coffee_sales.csv"
coffee_sales.write_csv(csv_path)

In [None]:
# read the CSV back; column dtypes are inferred from the text because a
# CSV file does not store them
csv_read = pl.read_csv(csv_path)
csv_read

In [None]:
# write the same frame to a parquet file (relative path -> current working directory)
parquet_path = "coffee_sales.parquet"
coffee_sales.write_parquet(file=parquet_path)

In [None]:
# read the parquet file back; the column dtypes are stored in the file itself
parquet_read = pl.read_parquet(parquet_path)
parquet_read

## I/O Pitfalls

In [None]:
# check whether both round trips return the same frame
# fails on purpose: read_csv brought `timestamp` back as a string column,
# while the parquet file kept the original datetime dtype
from polars.testing import assert_frame_equal

assert_frame_equal(csv_read, parquet_read)

In [None]:
# compare the dtype shown under `timestamp` in both headers:
# the CSV round trip differs from the parquet round trip
print(csv_read.head(2))
print(parquet_read.head(2))

In [None]:
# Tip: prefer parquet files whenever possible
# + the schema (column dtypes) is preserved
# + files usually take less disk space

In [None]:
# CSV files lose schema information
# we can explicitly enforce the schema while reading
csv_corrected = pl.read_csv(
    csv_path,
    schema={"drink": pl.String, "price": pl.Float64, "timestamp": pl.Datetime()},
)

# with the enforced schema the CSV round trip is expected to match the parquet one
assert_frame_equal(csv_corrected, parquet_read)
csv_corrected.head(2)


## Wrap up

In [None]:
# closing message
print("Done - Happy Coding!")