In [1]:
import pandas as pd

In [2]:
%run 0.1_Variables-and-Constants.ipynb

In [3]:
def result_df(t_start="2015", t_end="2025", t_location=location_all):
    """Load the race-result CSV files and return them as one preprocessed frame.

    Every ``*.CSV`` file found recursively below ``result_dir`` is a candidate.
    The file stem is decoded by position: characters 0-1 are a two-digit year
    (prefixed with "20"), and character 3 is a location code. A file is read
    only when its year lies in ``[t_start, t_end]`` and its location code is
    contained in ``t_location``.

    Args:
        t_start: First year to include (inclusive); anything ``int()`` accepts.
        t_end: Last year to include (inclusive); anything ``int()`` accepts.
        t_location: Container of location codes to keep (tested with ``in``).

    Returns:
        A DataFrame with the columns of ``result_columns`` minus "馬印" and
        "レース印", plus a datetime column "レース日" assembled from the "年",
        "月" and "日" columns. When no file matches the filters, an empty
        frame with the same columns is returned.

    Raises:
        ValueError: If a file stem does not start with two digits, or if
            ``t_start`` / ``t_end`` cannot be converted with ``int()``.
        IndexError: If a file stem is shorter than four characters.
    """
    first_year = int(t_start)
    last_year = int(t_end)

    frames = []
    # Sorted so the row order does not depend on filesystem enumeration order.
    for file in sorted(Path(result_dir).glob("**/*.CSV")):
        year = int("20" + file.stem[0:2])
        location = file.stem[3]
        if not (first_year <= year <= last_year and location in t_location):
            continue
        # Files carry no header row and are cp932-encoded. The reader is used
        # as a context manager so the handle is closed even if parsing fails.
        with pd.read_csv(
            file,
            chunksize=10000,
            encoding="cp932",
            header=None,
            names=result_columns,
        ) as reader:
            frames.extend(reader)

    # Concatenate once: growing a frame inside the loop re-copies every
    # previously loaded row on each iteration.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=result_columns)

    # NOTE(review): a filter that dropped rows containing "git-lfs" used to sit
    # here but was already disabled; it is intentionally not reinstated.

    # Build a YYMMDD string from the zero-padded year, month and day columns,
    # then parse it into a datetime column.
    race_date = (
        df["年"].astype(str).str.zfill(2)
        + df["月"].astype(str).str.zfill(2)
        + df["日"].astype(str).str.zfill(2)
    )
    df["レース日"] = pd.to_datetime(race_date, format="%y%m%d", yearfirst=True)

    # Drop the columns that are not needed downstream.
    df = df.drop(columns=["馬印", "レース印"])
    return df

In [4]:
def time_df(t_start="2015", t_end="2025", t_location=location_all):
    """Load the reference-time CSV files and add the time expressed in seconds.

    Every ``*.CSV`` file found recursively below ``time_dir`` is a candidate.
    The file stem is decoded by position: characters 0-1 are a two-digit year
    (prefixed with "20"), and character 3 is a location code. A file is read
    only when its year lies in ``[t_start, t_end]`` and its location code is
    contained in ``t_location``.

    Args:
        t_start: First year to include (inclusive); anything ``int()`` accepts.
        t_end: Last year to include (inclusive); anything ``int()`` accepts.
        t_location: Container of location codes to keep (tested with ``in``).

    Returns:
        A DataFrame with the columns of ``time_columns`` plus "基準タイム(秒)",
        computed from the first ``M.SS.T`` match in "基準タイム" as
        ``M * 60 + SS + T / 10``. Rows whose value does not match the pattern
        get NaN instead of aborting the whole load. When no file matches the
        filters, an empty frame with the same columns is returned.

    Raises:
        ValueError: If a file stem does not start with two digits, or if
            ``t_start`` / ``t_end`` cannot be converted with ``int()``.
        IndexError: If a file stem is shorter than four characters.
    """
    first_year = int(t_start)
    last_year = int(t_end)

    frames = []
    # Sorted so the row order does not depend on filesystem enumeration order.
    for file in sorted(Path(time_dir).glob("**/*.CSV")):
        year = int("20" + file.stem[0:2])
        location = file.stem[3]
        if not (first_year <= year <= last_year and location in t_location):
            continue
        # Files carry no header row and are cp932-encoded. The reader is used
        # as a context manager so the handle is closed even if parsing fails.
        with pd.read_csv(
            file,
            chunksize=10000,
            encoding="cp932",
            header=None,
            names=time_columns,
        ) as reader:
            frames.extend(reader)

    # Concatenate once: growing a frame inside the loop re-copies every
    # previously loaded row on each iteration.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=time_columns)

    pattern = r"(?P<min>\d)\.(?P<sec>\d{2})\.(?P<tenth>\d)"
    # Cast to float rather than int: a non-matching row yields NaN, which an
    # int cast cannot represent. Matching rows produce the same float values.
    parts = df["基準タイム"].str.extract(pattern).astype(float)
    df["基準タイム(秒)"] = parts["min"] * 60 + parts["sec"] + parts["tenth"] / 10
    return df