In [1]:
# Setup cell: import the notebook's display helpers and dataframe libraries,
# and define the base data directory.
from IPython.display import display, HTML
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

# Relative path to the "data" directory; resolved against the current working
# directory at use time. NOTE(review): presumably where the demo datasets
# live -- confirm against the cells that read from it.
data_path = Path("data")


<html>
<head>
</head>
<body style="background-color: #FFFFFF;">
  <h1 align="center" style="font-weight: bold; font-style: italic; font-size: 390%;">Better Together</h1>
  <table border="0" align="center" width="100%" bgcolor="#FFFFFF">
    <tr>
      <td align="center" width="50%" bgcolor="#FFFFFF">
        <img src="images/pandas_logo.1280x517.png" width="620" height="250">
      </td>
      <td align="center" width="50%" bgcolor="#FFFFFF">
        <img src="images/polars.round.400x400.png" width="250" height="250">
      </td>
    </tr>
    <tr>
      <td colspan="2" align="center" bgcolor="#FFFFFF">
        <img src="images/arrow-logo_horizontal.1800x936.png" width="481" height="250">
      </td>
    </tr>
  </table>
  <br>
  <div style="font-size: 300%;">
    <b>Better Together: Unleashing the Synergy of Pandas, Polars, and Apache Arrow</b><br>
    <b>Speaker:</b> Chris Brousseau<br>
    <b>Date:</b> 8 Oct 2023
  </div>
</body>
</html>

## About Me:
copy from 500 AI talk


# Talk Tagline:<br>
Supercharge your data engineering workflows by merging the **robustness of Pandas** with the **high-speed capabilities of Polars**, 
all underpinned by **Apache Arrow's in-memory technology.** <br>
<br>
This technical deep-dive will unravel the nuances between Pandas and Polars, showcase their newest features, and demonstrate how to integrate them for optimal performance. 
Learn actionable techniques to make your data pipelines faster, more efficient, and ready for scale. Join us to see how you might elevate your data engineering toolkit!<br>
<br>
- Key Differences and Updates: Get up to speed with the latest features and differences between Pandas and Polars. (5 min)
- Introducing Apache Arrow: Discover what Arrow is and why it's a game-changer in the Python data ecosystem. (5 min)
- Synergizing Pandas and Polars: Detailed walkthrough on how and why to integrate Pandas and Polars for high-efficiency data manipulation. (15 min)

<h1>Key Differences - Packages & Memory</h1>

<table border="1" style="width: 80%; font-size: 24px;">
  <thead>
    <tr style="font-weight: bold;">
      <th style="vertical-align: bottom;">Feature</th>
      <th style="text-align: center;">
        <img src="./images/pandas_secondary.svg" alt="Pandas" style="width: 300px; max-width: 100%;">
      </th>
      <th style="text-align: center;">
        <img src="./images/polars.round.400x400.png" alt="Polars" style="width: 200px; max-width: 100%;">
      </th>
    </tr>
  </thead>
  <tbody>
    <tr style="background-color: #FFFFFF;">
      <td>First Release Date</td>
      <td>2008</td>
      <td>2019</td>
    </tr>
    <tr style="background-color: #F0F0F0;">
      <td>Current Release</td>
      <td>2.1.1</td>
      <td>0.19.5</td>
    </tr>
    <tr style="background-color: #FFFFFF;">
      <td>Programming Language</td>
      <td>C, Cython, Python</td>
      <td>Rust</td>
    </tr>
    <tr style="background-color: #F0F0F0; font-weight: bold">
      <td>Memory</td>
      <td></td>
      <td></td>
    </tr>
    <tr style="background-color: #FFFFFF;">
      <td>Memory Backend</td>
      <td>Numpy or <strong>Apache Arrow</strong></td>
      <td><strong>Apache Arrow</strong></td>
    </tr>
    <tr style="background-color: #F0F0F0;">
      <td>Memory implementation</td>
      <td>Pyarrow (C++ wrapper on data)</td>
      <td>Arrow2 (Rust wrapper on data)</td>
    </tr>
    <tr style="background-color: #FFFFFF;">
      <td>Uses Index/MultiIndex</td>
      <td>Yes</td>
      <td><strong>No</strong></td>
    </tr>
    <tr style="background-color: #F0F0F0;">
      <td>Larger-than-Memory Support</td>
      <td>No (but available via Dask)</td>
      <td><strong>Native</strong></td>
    </tr>
    <tr style="background-color: #FFFFFF;">
      <td>Represent Missing Data</td>
      <td>"NaN" or "None"</td>
      <td>"null"</td>
    </tr>
  </tbody>
</table>


<h1>Key Differences and Updates - API</h1>

<table border="1" style="width: 80%; font-size: 24px;">
  <thead>
    <tr style="font-weight: bold;">
      <th style="vertical-align: bottom;">Feature</th>
      <th style="text-align: center;">
        <img src="./images/pandas_secondary.svg" alt="Pandas" style="width: 300px; max-width: 100%;">
      </th>
      <th style="text-align: center;">
        <img src="./images/polars.round.400x400.png" alt="Polars" style="width: 200px; max-width: 100%;">
      </th>
    </tr>
  </thead>
  <tbody>
    <tr style="background-color: #FFFFFF; font-weight: bold;">
      <td>API</td>
      <td></td>
      <td></td>
    </tr>
    <tr style="background-color: #F0F0F0;">
      <td>Number of Methods</td>
      <td><strong><i>TBD</i></strong></td>
      <td><strong><i>TBD</i></strong></td>
    </tr>
    <tr style="background-color: #FFFFFF;">
      <td>API mode</td>
      <td>Eager</td>
      <td>Eager or Lazy</td>
    </tr>
    <tr style="background-color: #F0F0F0;">
      <td>Query Optimization</td>
      <td>No</td>
      <td>Yes - with Lazy</td>
    </tr>
    <tr style="background-color: #FFFFFF;">
      <td>Multithreading</td>
      <td>Limited</td>
      <td><strong>Most operations</strong></td>
    </tr>
    <tr style="background-color: #F0F0F0; font-weight: bold;">
      <td>How to transfer df?</td>
      <td></td>
      <td>polars.from_pandas</td>
    </tr>
    <tr style="background-color: #FFFFFF; font-weight: bold;">
      <td></td>
      <td></td>
      <td>polars.DataFrame.to_pandas</td>
    </tr>
  </tbody>
</table>


<h1>What is Apache Arrow and Why is it a Game Changer?</h1>

<p style="font-size: 18px; line-height: 1.5;">
  <strong>Speed ==> Faster</strong> - in-memory columnar format; zero-copy reads avoid costly serialization and deserialization ==> transfer pointers + metadata vs. copying data<br>
  <strong>Interoperability ==> Easier</strong> - the Arrow format is language- and library-independent<br>
  <strong>Datatypes ==> More + Better</strong> - improvement from numpy; Missing data support (NA) for all data types<br>
</p>

<table border="1" style="width: 80%; font-size: 18px;">
  <thead>
    <tr style="font-weight: bold;">
      <th style="text-align: center; padding: 20px;">
        <img src="./images/arrow_simd.948x651.png" alt="in-memory columnar format" style="width: 400px; margin-right: 20px;">
      </th>
      <th style="text-align: center; padding: 20px;">
        <img src="./images/arrow_copy.574x318.png" alt="zero-copy reads" style="width: 400px; margin-left: 20px;">
      </th>
    </tr>
  </thead>
  <tbody>
    <tr style="background-color: #FFFFFF; font-weight: bold;">
      <td></td>
      <td></td>
    </tr>
  </tbody>
</table>

<p style="font-size: 18px; font-weight: bold;">
  <strong>Image source: Apache Foundation</strong><br>
  <a href="https://arrow.apache.org/overview/">Apache Arrow Overview</a> <br>
  <a href="https://arrow.apache.org/docs/format/Columnar.html">Columnar Format</a><br>
  <a href="https://pandas.pydata.org/docs/user_guide/pyarrow.html">Pandas PyArrow</a><br>
</p>


# Key Differences and Updates
features
performance

4 views:  pandas numpy, pandas arrow, polars, polars-lazy eval

# TBD - python dataframe interchange protocol

# How to integrate them for optimal performance

# Nuances - Get the Best out of Pandas and Polars

setup tips - Pandas
### enable Copy-On-Write
pd.options.mode.copy_on_write = True

### set pyarrow for all string data
pd.options.future.infer_string = True  

### use pyarrow for I/O
pd.read_csv("my_data.csv", engine="pyarrow", dtype_backend="pyarrow")
pd.read_parquet("my_data.parquet", dtype_backend="pyarrow")

# Recap

# Thank you!

# Reference Material

pandas
https://pandas.pydata.org/docs/whatsnew/index.html

polars
https://pola-rs.github.io/polars/

apache arrow
https://arrow.apache.org/overview/

pyarrow
https://arrow.apache.org/docs/python/index.html

dataframe API standard
https://data-apis.org/dataframe-api/draft/index.html
https://ponder.io/how-the-python-dataframe-interchange-protocol-makes-life-better/

Arrow Revolution
https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i

# convert to/from pandas/pyarrow
https://arrow.apache.org/docs/python/pandas.html

# syntax examples
https://www.rhosignal.com/posts/polars-pandas-cheatsheet/
