In [7]:
import pandas as pd
import polars as pl
from pathlib import Path
import os.path

## Multiply a DataFrame (repeat its rows n times)

In [8]:
def read_data(lib, datafile, dataset, *args, **kwargs):
    """
    Load a CSV or Parquet file into a pandas or polars DataFrame.

    Parameters
    ----------
    lib : str
        Backend to read with: "pandas" or "polars".
    datafile : str or path-like
        Path of the file to read. The format is chosen from the end of the
        name: "csv" / "csv.gz" use the CSV reader, "parquet" uses the
        Parquet reader.
    dataset
        Accepted for interface compatibility; not used by this function.
    *args, **kwargs
        Accepted and ignored.

    Returns
    -------
    DataFrame or None
        The loaded frame, or None (after printing an error) when the file
        does not exist or the lib / file format is not supported.
    """
    # Normalise to str so pathlib.Path inputs work with endswith() below.
    datafile = str(datafile)
    if not Path(datafile).exists():
        print(f"[Error] read_data(): datafile {datafile} not found")
        return None

    df = None
    # The backend modules are only touched inside the selected branch.
    if datafile.endswith(("csv", "csv.gz")):
        if lib == "pandas":
            df = pd.read_csv(datafile)
        elif lib == "polars":
            df = pl.read_csv(datafile)
    elif datafile.endswith("parquet"):
        if lib == "pandas":
            df = pd.read_parquet(datafile, engine='pyarrow')
        elif lib == "polars":
            df = pl.read_parquet(datafile)

    if df is None:
        # Previously a silent None; report it like the missing-file case.
        print(f"[Error] read_data(): unsupported lib '{lib}' or file format for {datafile}")
    return df

In [42]:
def df_multiply_by_n(lib, df, n_factor):
    """
    Expand df by stacking n_factor copies of it on top of each other.

    Parameters
    ----------
    lib : str
        "pandas" or "polars"; must match the type of `df`.
    df : DataFrame or None
        Frame to repeat.
    n_factor : int
        Number of copies in the result. 0 yields an empty frame with the
        same columns.

    Returns
    -------
    DataFrame or None
        The expanded frame (pandas results get a fresh 0..N-1 index), or
        None when `lib` is not supported or `df` is None.

    Raises
    ------
    ValueError
        If n_factor is negative.
    """
    if lib not in ("pandas", "polars") or df is None:
        return None

    if n_factor < 0:
        raise ValueError(f"n_factor must be >= 0, got {n_factor}")

    if n_factor == 0:
        # concat() rejects an empty list, so build the zero-row result directly.
        empty = df.head(0)
        return empty.reset_index(drop=True) if lib == "pandas" else empty

    # The list holds n references to the same frame; concat does the copying.
    copies = [df] * n_factor
    if lib == "pandas":
        return pd.concat(copies, ignore_index=True)
    return pl.concat(copies)

In [10]:
# Parquet input file, given relative to the notebook's working directory.
file_in = "../data/uber-ride/polars/train.parquet"

In [11]:
# Load the input file with the pandas backend; `dataset` is left empty.
df = read_data("pandas", datafile=file_in, dataset="")

In [12]:
# (rows, columns) of the frame as loaded.
df.shape

(701732, 11)

In [29]:
# Stack two copies of df using the pandas code path.
df_new = df_multiply_by_n("pandas", df, n_factor=2)

In [30]:
# The row count should be twice that of df; the column count is unchanged.
df_new.shape

(1403464, 11)

In [31]:
# Show the last rows of the expanded frame, including their index labels.
df_new.tail()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
1403459,id2319974,1,2016-03-19 20:36:59,2016-03-19 20:45:33,1,-73.97049,40.764458,-73.966064,40.754681,N,514
1403460,id1646709,2,2016-05-01 20:21:02,2016-05-01 20:34:51,1,-73.981949,40.749031,-73.984261,40.726391,N,829
1403461,id1383104,1,2016-02-15 15:51:34,2016-02-15 15:54:56,1,-73.97673,40.775352,-73.980103,40.780758,N,202
1403462,id2480124,2,2016-02-22 03:05:02,2016-02-22 03:12:44,1,-73.98877,40.722439,-74.008186,40.711529,N,462
1403463,id3026337,1,2016-04-17 17:00:03,2016-04-17 17:07:09,1,-73.98259,40.782562,-73.970024,40.799515,N,426


In [43]:
# Load and double the data with polars, then compare expanded vs. original shape.
# Note: this rebinds the notebook-level names `lib`, `df` and `df_tmp`.
lib = "polars"
df = read_data(lib, datafile=file_in, dataset="")
df_tmp = df_multiply_by_n(lib, df, n_factor=2)

df_tmp.shape, df.shape

((1403464, 11), (701732, 11))

In [44]:
# Load and double the data with pandas, then compare expanded vs. original shape.
# Note: this rebinds the notebook-level names `lib`, `df` and `df_tmp`.
lib = "pandas"
df = read_data(lib, datafile=file_in, dataset="")
df_tmp = df_multiply_by_n(lib, df, n_factor=2)

df_tmp.shape, df.shape

((1403464, 11), (701732, 11))

In [34]:
# Minimal polars example: concatenate two one-row frames that share columns a and b.
df1 = pl.DataFrame({"a": [1], "b": [3]})
df2 = pl.DataFrame({"a": [2], "b": [4]})
df12 = pl.concat([df1, df2])

In [35]:
# Display the concatenated frame.
df12

a,b
i64,i64
1,3
2,4


In [27]:
# polars DataFrames have no .copy() method (calling it raises AttributeError);
# .clone() is the polars way to get a copy of a frame.
df1_copy = df1.clone()

## print_results_table

In [1]:
# Measure the length of a typical datafile path.
# NOTE(review): presumably used to size the "datafile" table column below - confirm.
s = "../data/uber-ride/polars/train.parquet"
len(s)

38

In [1]:
# NOTE(review): this cell repeats the previous cell verbatim; candidate for removal.
s = "../data/uber-ride/polars/train.parquet"
len(s)

38

In [17]:
def pad_str(s, width=20, align="center", pad_ch=' '):
    """
    Pad (or truncate) a value to exactly `width` characters.

    Parameters
    ----------
    s
        Value to format; converted with str() so numbers can be passed too.
    width : int
        Target length. Text longer than `width` is cut to its first `width`
        characters. Negative values are treated as 0.
    align : str
        "center", "right", or anything else for left alignment. When
        centering with an odd gap, the extra pad character goes on the right.
    pad_ch : str
        Fill text. A multi-character value is tiled and trimmed so the
        result is still exactly `width` long; an empty value adds no padding.

    Returns
    -------
    str
        The padded or truncated text.
    """
    s = str(s)
    # A negative width would make s[:width] drop characters from the end.
    width = max(width, 0)
    len_s = len(s)
    if len_s > width:
        return s[:width]

    def _fill(count):
        # Tile pad_ch and cut to exactly `count` characters, so a
        # multi-character pad_ch cannot overshoot the requested width.
        return (pad_ch * count)[:count]

    gap = width - len_s
    if align == "center":
        left_n = gap // 2
        return f"{_fill(left_n)}{s}{_fill(gap - left_n)}"
    if align == "right":
        return f"{_fill(gap)}{s}"
    return f"{s}{_fill(gap)}"

In [21]:
# Show the three alignments with the default space padding; the quotes
# printed around each result make the padding visible.
x = pad_str("hello", 15)
print(f"'{x}'")

x = pad_str("hello", 15, align="left")
print(f"'{x}'")

x = pad_str("hello", 15, align="right")
print(f"'{x}'")

'     hello     '
'hello          '
'          hello'


In [22]:
# Same three alignments, padded with '=' instead of spaces.
x = pad_str("hello", 15, pad_ch='=')
print(f"'{x}'")

x = pad_str("hello", 15, pad_ch='=', align="left")
print(f"'{x}'")

x = pad_str("hello", 15, pad_ch='=', align="right")
print(f"'{x}'")

'=====hello====='


In [23]:
# Character width of each column in the printed results table, keyed by
# column name. Dict order is the left-to-right column order used when the
# header rows are built from COL_WIDTH.keys().
COL_WIDTH = {
    "pandas": 15,
    "polars": 15,
    "use-case": 52,
    "datafile": 50,
    "dataset": 15,
}

In [24]:
# Header row: each column name centred within its configured width.
fmt_strs = [
    pad_str(c, width=COL_WIDTH[c], align="center", pad_ch=' ')
    for c in COL_WIDTH
]
print(" | ".join(fmt_strs))

    pandas      |     polars      |                       use-case                       |                      datafile                      |     dataset    


In [25]:
# Separator row: a run of '=' as wide as each column.
fmt_strs = [
    pad_str("=", width=COL_WIDTH[c], align="center", pad_ch='=')
    for c in COL_WIDTH
]
print(" | ".join(fmt_strs))



In [27]:
# Header row again, this time with the column names left-aligned.
fmt_strs = [
    pad_str(c, width=COL_WIDTH[c], align="left", pad_ch=' ')
    for c in COL_WIDTH
]
print(" | ".join(fmt_strs))

pandas          | polars          | use-case                                             | datafile                                           | dataset        
