In [24]:
import pandas as pd

# CSVデータの読み込み
data = pd.read_csv("race_data_1_10.csv")

# DataFrameの作成
df = pd.DataFrame(data)

# Codeが四文字のデータを削除
df = df[~df['Code'].astype(str).str.match(r'^\d{4}$')]

# Horse Weightを変形
df[['Weight', 'Weight Change']] = df['Horse Weight'].str.extract(r'(\d+)\s*\(\s*([+-]?\d+)\s*\)')
df['Weight'] = df['Weight'].fillna(0)
df['Weight'] = df['Weight'].astype(int)
df['Weight Change'] = df['Weight Change'].fillna(0)
df['Weight Change'] = df['Weight Change'].astype(int)
df.drop(columns=['Horse Weight'], inplace=True)

In [25]:
# 「Sex/Age」列を「Sex」と「Age」に分ける
df['Sex'] = df['Sex/Age'].str[0].map({'牡': 0, '牝': 1}).astype('Int64')  
df['Age'] = pd.to_numeric(df['Sex/Age'].str[1:], errors='coerce').astype('Int64') # errors='coerce'でエラーをNaNに変換
df.drop(columns=['Sex/Age'], inplace=True)

In [26]:
# 時間を分表記から秒表記に変更
def convert_to_second(time_str):
    if isinstance(time_str, str):
        minutes, seconds = map(float, time_str.split(':'))
        return minutes * 60 + seconds
    return time_str  
df['Time_x'] = df['Time_x'].apply(convert_to_second)

In [27]:
# Distanceを分離
df['Ground'] = df['Distance'].str[0]
df['Distance'] = df['Distance'].str[1:]

# Ground, Condition, Weatherをダミー変数に変換
dummy = pd.get_dummies(df[['Ground', 'Condition', 'Weather']])
dummy

# データフレームを結合し、要らないデータを破棄
df = pd.concat([df, dummy], axis=1)
df.drop(columns=['Ground', 'Condition', 'Weather'], inplace=True)
df

# first_chars = df['Condition'].dropna().apply(lambda x: str(x)[0])  # NaNを除外し、先頭文字を取得
# unique_first_chars = first_chars.unique()  # ユニークな文字を取得

# # 結果を表示
# print("0インデックス目のユニークな文字:", list(unique_first_chars))

Unnamed: 0,Code,Rank,Frame Rank,Horse Number,Horse Name,Kinryou,Jockey_x,Time_x,Chakusa,Tsuuka,...,Track,Race Number,Distance,Weight,Weight Change,Sex,Age,Ground_ダ,Condition_良,Weather_晴
0,202436093010,1,8.0,10,ボウトロイ,56.0,岩本怜,101.0,,2-2-2-2,...,水沢,10.0,1600,519,3,0,6,True,True,True
1,202436093010,2,1.0,1,ノーブルサターン,59.0,高松亮,101.0,ハナ,3-3-3-3,...,水沢,10.0,1600,541,3,0,10,True,True,True
2,202436093010,3,8.0,11,マイネルアストリア,54.0,関本玲花,101.5,3,1-1-1-1,...,水沢,10.0,1600,551,-8,0,7,True,True,True
3,202436093010,4,5.0,5,トーセンマッシモ,56.0,塚本涼人,102.0,3,8-10-7-5,...,水沢,10.0,1600,493,1,,6,True,True,True
4,202436093010,5,4.0,4,フレイムウィングス,56.0,山本政聡,102.4,2.1/2,8-7-6-7,...,水沢,10.0,1600,481,5,,7,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,202436093011,1,2.0,2,ソルエストレーラ,54.0,菅原辰徳,90.5,,1-1-1-1,...,水沢,11.0,1400,428,-6,1,4,True,True,True
94,202436093011,2,7.0,8,モンゴリアンキング,56.0,陶文峰,90.8,1.3/4,7-6-4-4,...,水沢,11.0,1400,497,6,,5,True,True,True
95,202436093011,3,8.0,11,テリオスドン,56.0,山本聡哉,91.1,2,4-4-5-6,...,水沢,11.0,1400,500,-3,0,4,True,True,True
96,202436093011,4,4.0,4,シャークアタック,56.0,山本政聡,91.4,1.1/2,5-4-5-5,...,水沢,11.0,1400,496,2,0,5,True,True,True


In [28]:
df.to_csv('updated_horse_data.csv', index=False)