In [18]:
import polars as pl
import pandas as pd
import time
import numpy as np

In [3]:
start = time.time()
df = pl.read_csv('data/train.csv')
end = time.time()
print(f'Polars {end - start} seconds: {df.estimated_size('mb')}')

Polars 2.0193538665771484 seconds: 6987.138075828552


In [9]:
start = time.time()
df = pd.read_csv('data/train.csv')
end = time.time()
print(f'Pandas {end - start} seconds: {df.memory_usage(index=True).sum() / (1024*1024)}')

Pandas 24.390254974365234 seconds: 7723.2615394592285


In [17]:
dtypes = {
    'row_id': pl.Int64,
    'timestamp': pl.Int64,
    'user_id': pl.Int64,
    'content_id': pl.Int64,
    'content_type_id': pl.Int8,
    'task_container_id': pl.Int64,
    'user_answer': pl.Int8,
    'answered_correctly': pl.Int8,
    'prior_question_elapsed_time': pl.Float64,
    'prior_question_had_explanation': pl.Boolean
}

start = time.time()
df = pl.read_csv('data/train.csv', schema_overrides=dtypes)
end = time.time()

print(f'Polars {end - start} seconds: {df.estimated_size('mb')}')

Polars 1.9500019550323486 seconds: 4959.78195476532


In [20]:
dtypes = {
    'row_id': np.int64,
    'timestamp': np.int64,
    'user_id': np.int64,
    'content_id': np.int64,
    'content_type_id': np.int64,
    'task_container_id': np.int64,
    'user_answer': np.int8,
    'answered_correctly': np.int8,
    'prior_question_elapsed_time': np.float64,
    'prior_question_had_explanation': pd.BooleanDtype
}

start = time.time()
df = pd.read_csv('data/train.csv', dtype=dtypes)
end = time.time()

print(f'Pandas {end - start} seconds: {df.memory_usage(index=True).sum() / (1024*1024)}')

  df = pd.read_csv('data/train.csv', dtype=dtypes)
  df = pd.read_csv('data/train.csv', dtype=dtypes)


Pandas 46.74800515174866 seconds: 5792.446186065674


In [21]:
def reduce_memory_usage_pl(df, name):
    """ Reduce memory usage by polars dataframe {df} with name {name} by changing its data types.
        Original pandas version of this function: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 """
    print(f"Memory usage of dataframe {name} is {round(df.estimated_size('mb'), 2)} MB")
    Numeric_Int_types = [pl.Int8,pl.Int16,pl.Int32,pl.Int64]
    Numeric_Float_types = [pl.Float32,pl.Float64]    
    for col in df.columns:
        col_type = df[col].dtype
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type in Numeric_Int_types:
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df = df.with_columns(df[col].cast(pl.Int8))
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df = df.with_columns(df[col].cast(pl.Int16))
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df = df.with_columns(df[col].cast(pl.Int32))
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                   df = df.with_columns(df[col].cast(pl.Int64))
        elif col_type in Numeric_Float_types:
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df = df.with_columns(df[col].cast(pl.Float32))
            else:
                pass
        elif col_type == pl.Utf8:
            df = df.with_columns(df[col].cast(pl.Categorical))
        else:
            pass
    print(f"Memory usage of dataframe {name} became {round(df.estimated_size('mb'), 2)} MB")
    return df

df = pl.read_csv('data/train.csv')

df = reduce_memory_usage_pl(df, 'df')

Memory usage of dataframe df is 6987.14 MB
Memory usage of dataframe df became 2642.8 MB


In [23]:
df.schema

Schema([('row_id', Int32),
        ('timestamp', Int64),
        ('user_id', Int32),
        ('content_id', Int16),
        ('content_type_id', Int8),
        ('task_container_id', Int16),
        ('user_answer', Int8),
        ('answered_correctly', Int8),
        ('prior_question_elapsed_time', Float32),
        ('prior_question_had_explanation', Boolean)])

In [24]:
df = pl.read_csv('data/train.csv')
df.schema

Schema([('row_id', Int64),
        ('timestamp', Int64),
        ('user_id', Int64),
        ('content_id', Int64),
        ('content_type_id', Int64),
        ('task_container_id', Int64),
        ('user_answer', Int64),
        ('answered_correctly', Int64),
        ('prior_question_elapsed_time', Float64),
        ('prior_question_had_explanation', Boolean)])