In [1]:
# Print the file path the imported tomica package was loaded from.
import tomica
print(tomica.__file__)

/Users/zhangyangfa/rust-projects/tomica/tomica/__init__.py


## Usage

In [2]:
import tomica
import polars as pl

# Read the sample CSV (path is relative to the notebook's working directory)
# into a polars DataFrame and print it.
df = pl.read_csv('./data/csidata.csv')
print(df)

shape: (1_240_545, 11)
┌────────────┬────────────┬───────┬───────┬───┬────────┬─────────┬─────────────┬───────────┐
│ datetime   ┆ instrument ┆ open  ┆ high  ┆ … ┆ change ┆ pct_chg ┆ amount      ┆ volume    │
│ ---        ┆ ---        ┆ ---   ┆ ---   ┆   ┆ ---    ┆ ---     ┆ ---         ┆ ---       │
│ str        ┆ str        ┆ f64   ┆ f64   ┆   ┆ f64    ┆ f64     ┆ f64         ┆ f64       │
╞════════════╪════════════╪═══════╪═══════╪═══╪════════╪═════════╪═════════════╪═══════════╡
│ 2006-01-04 ┆ 000001.SZ  ┆ 6.13  ┆ 6.33  ┆ … ┆ 0.14   ┆ 2.28    ┆ 96989.0066  ┆ 154450.68 │
│ 2006-01-04 ┆ 000002.SZ  ┆ 4.4   ┆ 4.67  ┆ … ┆ 0.35   ┆ 8.12    ┆ 175187.0657 ┆ 389310.43 │
│ 2006-01-04 ┆ 000012.SZ  ┆ 4.7   ┆ 4.75  ┆ … ┆ 0.08   ┆ 1.73    ┆ 18863.8625  ┆ 40399.61  │
│ 2006-01-04 ┆ 000016.SZ  ┆ 3.52  ┆ 3.58  ┆ … ┆ 0.05   ┆ 1.42    ┆ 5131.8793   ┆ 14426.65  │
│ 2006-01-04 ┆ 000021.SZ  ┆ 7.99  ┆ 8.4   ┆ … ┆ 0.32   ┆ 4.0     ┆ 27657.1708  ┆ 33526.6   │
│ …          ┆ …          ┆ …     ┆ …     ┆ … ┆

In [3]:
# Method 1: use map_groups with a function that returns a DataFrame
def calculate_factors(group_df):
    """Add five tomica time-series columns derived from ``close``.

    Parameters
    ----------
    group_df
        The DataFrame handed in for one group (in this notebook, the rows
        of a single instrument). It must contain a ``close`` column.

    Returns
    -------
    ``group_df`` with the extra columns ``return_10d``, ``volatility_20d``,
    ``ma_5d``, ``sharpe_10d`` and ``zscore_20d``.
    """
    ops = tomica.operators
    close = group_df["close"]
    # The integer passed to each operator is presumably its rolling-window
    # length -- confirm against the tomica operator documentation.
    factor_columns = [
        ops.ts_returns(close, 10).alias("return_10d"),
        ops.ts_std(close, 20).alias("volatility_20d"),
        ops.ts_mean(close, 5).alias("ma_5d"),
        ops.ts_sharpe(close, 10).alias("sharpe_10d"),
        ops.ts_zscore(close, 20).alias("zscore_20d"),
    ]
    return group_df.with_columns(factor_columns)

# Run calculate_factors once per instrument and print the combined frame.
result = (
    df.group_by("instrument", maintain_order=True)
    .map_groups(calculate_factors)
)
print(result)

# Method 2: operate directly on the whole column (when no grouping is needed).
# Note: this treats the rows of every stock as one single series.
simple_result = df.with_columns(
    [
        tomica.operators.ts_returns(df["close"], 10).alias("return_10d"),
        tomica.operators.ts_std(df["close"], 20).alias("volatility_20d"),
    ]
)
print(simple_result)

shape: (1_240_545, 16)
┌────────────┬────────────┬────────┬────────┬───┬─────────────┬──────────┬────────────┬────────────┐
│ datetime   ┆ instrument ┆ open   ┆ high   ┆ … ┆ volatility_ ┆ ma_5d    ┆ sharpe_10d ┆ zscore_20d │
│ ---        ┆ ---        ┆ ---    ┆ ---    ┆   ┆ 20d         ┆ ---      ┆ ---        ┆ ---        │
│ str        ┆ str        ┆ f64    ┆ f64    ┆   ┆ ---         ┆ f64      ┆ f64        ┆ f64        │
│            ┆            ┆        ┆        ┆   ┆ f64         ┆          ┆            ┆            │
╞════════════╪════════════╪════════╪════════╪═══╪═════════════╪══════════╪════════════╪════════════╡
│ 2006-01-04 ┆ 000001.SZ  ┆ 6.13   ┆ 6.33   ┆ … ┆ null        ┆ 6.28     ┆ null       ┆ null       │
│ 2006-01-05 ┆ 000001.SZ  ┆ 6.3    ┆ 6.35   ┆ … ┆ 0.028284    ┆ 6.3      ┆ 222.738636 ┆ 0.707107   │
│ 2006-01-06 ┆ 000001.SZ  ┆ 6.4    ┆ 6.5    ┆ … ┆ 0.066583    ┆ 6.336667 ┆ 95.169036  ┆ 1.101378   │
│ 2006-01-09 ┆ 000001.SZ  ┆ 6.4    ┆ 6.44   ┆ … ┆ 0.060553    ┆ 6.35

## Speed test

In [4]:
import polars as pl
import random
import time
from datetime import datetime

In [5]:
# Benchmark input: a Float64 Series named "foo" holding one million values
# drawn uniformly from [-50, 50].
# A dedicated, seeded generator makes the series reproducible across kernel
# restarts without touching the global `random` state.
rng = random.Random(42)
data = pl.select(
    # Sample first: expand the literal into 1_000_000 rows. The row count is
    # passed as an integer rather than the float 1e6.
    pl.lit(1).sample(n=1_000_000, with_replacement=True)
    # Then replace every placeholder value with a random draw.
    .map_elements(lambda _: rng.uniform(-50, 50), return_dtype=pl.Float64)
    .alias("foo")
).to_series()

In [8]:
def test_speed(fn, window_size):
    st = time.time()
    try:
        fn(data, window_size)
        print(f'{fn.__name__:<20} {time.time()-st:>8.4f} s')
    except:
        try:
            fn(data, data, window_size)
            print(f'{fn.__name__:<20} {time.time()-st:>8.4f} s')
        except:
            try:
                fn(data,data,window_size,True)
                print(f'{fn.__name__:<20} {time.time()-st:>8.4f} s')
            except:
                fn(data,data,window_size,0)
                print(f'{fn.__name__:<20} {time.time()-st:>8.4f} s')

In [9]:
# Benchmark every operator listed in tomica.operators.__all__ with the value 10
# passed as window_size.
for op_name in tomica.operators.__all__:
    func = getattr(tomica.operators, op_name)
    test_speed(func, 10)

ts_delay               0.0008 s
ts_delta               0.0016 s
ts_returns             0.0025 s
ts_sum                 0.0168 s
ts_product             0.0130 s
ts_max                 0.0159 s
ts_min                 0.0142 s
ts_mean                0.0094 s
ts_std                 0.0178 s
ts_rank                0.0906 s
ts_variance            0.0159 s
ts_quantile_up         0.0303 s
ts_quantile_down       0.0291 s
ts_zscore              0.0270 s
ts_robust_zscore       0.0732 s
ts_scale               0.0280 s
ts_sharpe              0.0280 s
ts_av_diff             0.0100 s
ts_min_diff            0.0140 s
ts_max_diff            0.0143 s
ts_cov                 0.0292 s
ts_beta                0.0488 s
ts_corr                0.2632 s
ts_pos_count           0.0046 s
ts_neg_count           0.0047 s
decay_n                0.0033 s


## Compare tomica with py-polars

In [None]:
# Time a single call of polars' rolling_rank(10) on the 'open' column.
# perf_counter is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
st = time.perf_counter()
df['open'].rolling_rank(10)
print(f'{time.perf_counter()-st:.4f} s')

0.1004 s


In [None]:
# Time a single call of tomica's ts_rank on the 'open' column with the same
# argument (10) as the polars call, for a side-by-side comparison.
# perf_counter is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
st = time.perf_counter()
tomica.operators.ts_rank(df['open'],10)
print(f'{time.perf_counter()-st:.4f} s')

0.0935 s
