In [2]:
import pandas as pd
import polars as pl
import numpy as np
import random
from datetime import datetime, timedelta

def random_dates(start, end, n):
    """Draw ``n`` random timestamps, at one-second resolution, from ``[start, end)``.

    Parameters
    ----------
    start, end : timestamp-like
        Bounds of the sampling window. Anything ``pd.Timestamp`` accepts
        (``pd.Timestamp``, ``datetime``, ISO-format string, ...) is allowed.
        ``start`` is inclusive and ``end`` is exclusive.
    n : int
        Number of timestamps to draw.

    Returns
    -------
    pd.DatetimeIndex
        ``n`` timestamps drawn using NumPy's global random state.

    Raises
    ------
    ValueError
        Raised by ``np.random.randint`` when ``end`` is not later than
        ``start`` at one-second resolution.
    """
    # `.value` is nanoseconds since the Unix epoch; floor-divide down to seconds.
    start_u = pd.Timestamp(start).value // 10**9
    end_u = pd.Timestamp(end).value // 10**9
    # Ask for int64 explicitly: epoch seconds past 2038 do not fit in the
    # 32-bit default integer used by some NumPy builds.
    seconds = np.random.randint(start_u, end_u, n, dtype=np.int64)
    return pd.to_datetime(seconds, unit='s')

def generate_dataframe(num_rows, seed=None):
    """Build a synthetic pandas DataFrame with ``num_rows`` rows of mixed dtypes.

    Columns
    -------
    integers        : random integers in ``[1, 100)``
    floats          : uniform floats in ``[1.0, 100.0)``
    strings         : 5 random uppercase ASCII letters
    datetimes       : random timestamps from ``random_dates`` between
                      2020-01-01 and 2023-01-01
    booleans        : random ``True`` / ``False``
    categories      : pandas ``Categorical`` drawn from ``'A'``..``'D'``
    complex_strings : ``'user_'`` followed by 4 random digits
    more_integers   : random integers in ``[100, 200)``
    more_floats     : uniform floats in ``[100.0, 200.0)``

    Parameters
    ----------
    num_rows : int
        Number of rows to generate.
    seed : int, optional
        When given, re-seeds the *global* NumPy and ``random`` generators
        before drawing, so the same ``seed`` yields the same frame. When
        ``None`` (default) the current global RNG state is used untouched.

    Returns
    -------
    pd.DataFrame
    """
    if seed is not None:
        # Both generators are used below (NumPy for arrays, `random` for the
        # string columns), so both must be seeded for a reproducible frame.
        np.random.seed(seed)
        random.seed(seed)

    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'

    # Generate data. The draw order is kept stable so that externally seeded
    # callers keep getting the same values.
    int_data = np.random.randint(1, 100, size=num_rows)
    float_data = np.random.uniform(1.0, 100.0, size=num_rows)
    string_data = [''.join(random.choices(letters, k=5)) for _ in range(num_rows)]
    datetime_data = random_dates(pd.to_datetime('2020-01-01'), pd.to_datetime('2023-01-01'), num_rows)
    boolean_data = np.random.choice([True, False], size=num_rows)
    category_data = pd.Categorical(np.random.choice(['A', 'B', 'C', 'D'], size=num_rows))
    complex_string_data = ['user_' + ''.join(random.choices(digits, k=4)) for _ in range(num_rows)]
    # Another integer column
    another_int_data = np.random.randint(100, 200, size=num_rows)
    # Another float column with a different range
    another_float_data = np.random.uniform(100.0, 200.0, size=num_rows)

    # Create DataFrame
    return pd.DataFrame({
        'integers': int_data,
        'floats': float_data,
        'strings': string_data,
        'datetimes': datetime_data,
        'booleans': boolean_data,
        'categories': category_data,
        'complex_strings': complex_string_data,
        'more_integers': another_int_data,
        'more_floats': another_float_data,
    })

# Example usage
# Seed both generators that generate_dataframe draws from (NumPy and `random`)
# so that "Restart Kernel -> Run All" rebuilds exactly the same data.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
df = generate_dataframe(1_000_000)


In [3]:
# Keep only the join key and draw a reproducible sample of it.
# NOTE(review): 'complex_strings' has only 10,000 possible values
# ('user_0000'..'user_9999'), so this sample will almost certainly contain
# repeated keys; an inner join against it repeats matching rows. Confirm that
# is intended, or append .drop_duplicates().
SUBSET_SIZE = 10_000
SUBSET_SEED = 42
subset_df = df[['complex_strings']].sample(n=SUBSET_SIZE, random_state=SUBSET_SEED)

In [4]:
# Mirror both pandas frames as Polars DataFrames: the full table first,
# then the sampled key table.
df_polars, subset_df_polars = (
    pl.from_pandas(pandas_frame) for pandas_frame in (df, subset_df)
)

In [6]:
# Which category has the highest mean of `floats` among 2022 rows whose
# `complex_strings` key appears in the sampled subset?
# NOTE(review): if subset_df_polars holds repeated keys, the inner join repeats
# the matching rows and therefore weights the mean - confirm this is intended.
output = (
    df_polars
    # Filter on the year expression directly; no helper columns are needed.
    .filter(pl.col('datetimes').dt.year() == 2022)
    .join(subset_df_polars, on='complex_strings', how='inner')
    .group_by('categories')
    # Only the mean is aggregated: it is the sole value the next filter uses,
    # and the final result keeps just `categories`.
    .agg(pl.col('floats').mean().alias('mean_floats'))
    .filter(pl.col('mean_floats') == pl.col('mean_floats').max())
    .select(['categories'])
)

# A bare last expression so the notebook renders the resulting frame.
output


categories
cat
"""A"""
