In [22]:
import os
cwd = os.getcwd()
print("Current working directory:", cwd)

Current working directory: /home/mortonkuo/morton_progress/fft-regular-pattern-detection-real-world


### (1) Pandas: 2025-06-25_partial_test.csv

In [27]:
import pandas as pd

# 1. Skip the first line if it’s just "_" (otherwise pandas will think "_" is your header)
df = pd.read_csv("data/2025-06-25_partial_test.csv", skiprows=0)

# 2. Inspect dtypes & preview
print(df.dtypes)
print(df.head())

flag            int64
time            int64
match_time    float64
seq           float64
status          int64
product         int64
price         float64
quantity      float64
bid1          float64
bidv1         float64
bid2          float64
bidv2         float64
bid3          float64
bidv3         float64
bid4          float64
bidv4         float64
bid5          float64
bidv5         float64
bid_num       float64
ask1          float64
askv1         float64
ask2          float64
askv2         float64
ask3          float64
askv3         float64
ask4          float64
askv4         float64
ask5          float64
askv5         float64
ask_num       float64
dtype: object
   flag          time  match_time        seq  status  product  price  \
0     0  113609257489         NaN  5794701.0      16     2882    NaN   
1     0  113609257696         NaN  5794702.0      16     2882    NaN   
2     0  113609258676         NaN  5794703.0      16     2882    NaN   
3     0  113609259466         NaN  579

In [29]:
import pandas as pd

dtype = {
    "flag":        "Int64",
    "time":        "Int64",
    "match_time": "Float64",
    "seq":        "Float64",
    "status":      "Int64",
    "product":     "Int64",
    "price":      "Float64",
    "quantity":   "Float64",
    **{f"bid{i}":    "Float64" for i in range(1,6)},
    **{f"bidv{i}":   "Float64"   for i in range(1,6)},
    **{f"ask{i}":    "Float64" for i in range(1,6)},
    **{f"askv{i}":   "Float64"   for i in range(1,6)},
    "bid_num":    "Float64",
    "ask_num":    "Float64",
}

# 1. Skip the first line if it’s just "_" (otherwise pandas will think "_" is your header)
df = pd.read_csv("data/2025-06-25_partial_test.csv", skiprows=0, dtype=dtype)

# 2. Inspect dtypes & preview
print(df.dtypes)
print(df.head())

flag            Int64
time            Int64
match_time    Float64
seq           Float64
status          Int64
product         Int64
price         Float64
quantity      Float64
bid1          Float64
bidv1         Float64
bid2          Float64
bidv2         Float64
bid3          Float64
bidv3         Float64
bid4          Float64
bidv4         Float64
bid5          Float64
bidv5         Float64
bid_num       Float64
ask1          Float64
askv1         Float64
ask2          Float64
askv2         Float64
ask3          Float64
askv3         Float64
ask4          Float64
askv4         Float64
ask5          Float64
askv5         Float64
ask_num       Float64
dtype: object
   flag          time  match_time        seq  status  product  price  \
0     0  113609257489        <NA>  5794701.0      16     2882   <NA>   
1     0  113609257696        <NA>  5794702.0      16     2882   <NA>   
2     0  113609258676        <NA>  5794703.0      16     2882   <NA>   
3     0  113609259466        <NA>  579

In [30]:
import pandas as pd

# 1. Skip the first line if it’s just "_" (otherwise pandas will think "_" is your header)
df = pd.read_csv("data/2025-06-25_partial.csv", skiprows=0)

# 2. Inspect dtypes & preview
print(df.dtypes)
print(df.head())

0                 int64
113609257489      int64
Unnamed: 2      float64
5794701.0       float64
16                int64
2882             object
Unnamed: 6      float64
Unnamed: 7      float64
66.4            float64
437             float64
66.3            float64
258             float64
66.2            float64
377             float64
66.1            float64
350             float64
66.0            float64
861             float64
5               float64
66.5            float64
229             float64
66.6            float64
819             float64
66.7            float64
301             float64
66.8            float64
593             float64
66.9            float64
1197            float64
5.1             float64
dtype: object
   0  113609257489  Unnamed: 2  5794701.0  16  2882  Unnamed: 6  Unnamed: 7  \
0  0  113609257696         NaN  5794702.0  16  2882         NaN         NaN   
1  0  113609258676         NaN  5794703.0  16  2882         NaN         NaN   
2  0  113609259466         Na

  df = pd.read_csv("data/2025-06-25_partial.csv", skiprows=0)


### (2) Polars: 2025-06-25_partial.csv

In [45]:
import polars as pl

# Method 1: Read entire CSV into memory (eager)
# df = pl.read_csv("./data/2025-06-25_partial.csv")
# df = pl.read_csv("data/2025-06-25_partial.csv")

# Method 2: Lazy reading (recommended for large files)
# df_lazy = pl.scan_csv("data/2025-06-25_partial.csv")

# Common parameters for read_csv / scan_csv
df = pl.read_csv(
    "data/2025-06-25_partial.csv",
    separator=",",           # delimiter (default: ",")
    has_header=True,         # first row is header (default: True)
	infer_schema_length=10000,
    # columns=["col2", "col6"], # read only specific columns
    # dtypes={"col2": pl.Int64, "col6": pl.Utf8},  # specify data types
    # n_rows=100,           # read only first n rows
    skip_rows=0,            # skip first n rows
    null_values=["NA", "null", "", "2883B"],  # values to treat as null
    # ignore_errors=False,     # continue on parsing errors
	ignore_errors=True,
    try_parse_dates=True,    # automatically parse dates
    encoding="utf8",         # file encoding
    low_memory=False,        # reduce memory usage (slower)
)

# Basic operations after reading
print(df.head())            # first 5 rows

# Header
# flag,time,match_time,seq,status,product,price,quantity,bid1,bidv1,bid2,bidv2,bid3,bidv3,bid4,bidv4,bid5,bidv5,bid_num,ask1,askv1,ask2,askv2,ask3,askv3,ask4,askv4,ask5,askv5,ask_num

shape: (5, 30)
┌──────┬──────────────┬────────────┬────────────┬───┬───────┬───────┬───────┬─────────┐
│ flag ┆ time         ┆ match_time ┆ seq        ┆ … ┆ askv4 ┆ ask5  ┆ askv5 ┆ ask_num │
│ ---  ┆ ---          ┆ ---        ┆ ---        ┆   ┆ ---   ┆ ---   ┆ ---   ┆ ---     │
│ i64  ┆ i64          ┆ i64        ┆ f64        ┆   ┆ i64   ┆ f64   ┆ i64   ┆ i64     │
╞══════╪══════════════╪════════════╪════════════╪═══╪═══════╪═══════╪═══════╪═════════╡
│ 0    ┆ 113609257489 ┆ null       ┆ 5.794701e6 ┆ … ┆ 593   ┆ 66.9  ┆ 1197  ┆ 5       │
│ 0    ┆ 113609257696 ┆ null       ┆ 5.794702e6 ┆ … ┆ 593   ┆ 66.9  ┆ 1197  ┆ 5       │
│ 0    ┆ 113609258676 ┆ null       ┆ 5.794703e6 ┆ … ┆ 593   ┆ 66.9  ┆ 1197  ┆ 5       │
│ 0    ┆ 113609259466 ┆ null       ┆ 5.794704e6 ┆ … ┆ 593   ┆ 66.9  ┆ 1197  ┆ 5       │
│ 0    ┆ 113609260828 ┆ null       ┆ 1694313.1  ┆ … ┆ 99    ┆ 182.5 ┆ 23    ┆ 5       │
└──────┴──────────────┴────────────┴────────────┴───┴───────┴───────┴───────┴─────────┘


In [50]:
pl.Config.set_tbl_cols(100)  # Show up to 100 columns

pl.Config(
    tbl_cols=-1,           # Show all columns
    tbl_rows=20,           # Show more rows
    fmt_str_lengths=100,   # Show longer strings
    tbl_width_chars=None   # No width limit
)

# Filter for product=2882
df_2882 = df.filter(pl.col("product") == "2882")

# Display results
print(f"\nTotal rows with product=2882: {len(df_2882)}")
print(f"Shape of filtered data: {df_2882.shape}")
print("\nFirst 5 rows of product=2882:")
print(df_2882.head(20))


Total rows with product=2882: 20688
Shape of filtered data: (20688, 30)

First 5 rows of product=2882:
shape: (20, 30)
┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┐
│ fla ┆ tim ┆ mat ┆ seq ┆ sta ┆ pro ┆ pri ┆ qua ┆ bid ┆ bid ┆ bid ┆ bid ┆ bid ┆ bid ┆ bid ┆ bid ┆ bid ┆ bid ┆ bid ┆ ask ┆ ask ┆ ask ┆ ask ┆ ask ┆ ask ┆ ask ┆ ask ┆ ask ┆ ask ┆ ask │
│ g   ┆ e   ┆ ch_ ┆ --- ┆ tus ┆ duc ┆ ce  ┆ nti ┆ 1   ┆ v1  ┆ 2   ┆ v2  ┆ 3   ┆ v3  ┆ 4   ┆ v4  ┆ 5   ┆ v5  ┆ _nu ┆ 1   ┆ v1  ┆ 2   ┆ v2  ┆ 3   ┆ v3  ┆ 4   ┆ v4  ┆ 5   ┆ v5  ┆ _nu │
│ --- ┆ --- ┆ tim ┆ f64 ┆ --- ┆ t   ┆ --- ┆ ty  ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ m   ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ m   │
│ i64 ┆ i64 ┆ e   ┆     ┆ i64 ┆ --- ┆ f64 ┆ --- ┆ f64 ┆ i64 ┆ f64 ┆ i64 ┆ f64 ┆ i64 ┆ f64 ┆ i64 ┆ f64 ┆ i64 ┆ --- ┆ f64 ┆ i64 ┆ f64 ┆ i64 ┆ f64 ┆ i64 ┆ 