In [None]:
import pandas as pd

# Read the merged source table from CSV.
data = pd.read_csv(filepath_or_buffer="original_data/merged_data.csv")

# Build the working DataFrame that the following cells transform.
df = pd.DataFrame(data=data)

In [None]:
# Split 'Horse Weight' into two columns: the regex captures a leading run of
# digits ('Weight') and a signed integer inside parentheses ('Weight Change').
df[['Weight', 'Weight Change']] = df['Horse Weight'].str.extract(
    r'(\d+)\s*\(\s*([+-]?\d+)\s*\)'
)

# Rows where the pattern did not match come back as NaN; store 0 for those
# and cast both columns to int.
df['Weight'] = df['Weight'].fillna(0).astype(int)
df['Weight Change'] = df['Weight Change'].fillna(0).astype(int)

# The combined source column is no longer needed.
df.drop(columns=['Horse Weight'], inplace=True)

In [None]:
# Split the 'Sex/Age' column into separate 'Sex' and 'Age' columns.
# The first character is the sex marker: '牡' (male) and 'セ' (gelding) are
# coded 0, '牝' (female) is coded 1; any other marker becomes <NA>.
df['Sex'] = (
    df['Sex/Age']
    .str.get(0)
    .map({'牡': 0, '牝': 1, 'セ': 0})
    .astype('Int64')
)

# Everything after the first character is the age; errors='coerce' turns
# values that cannot be parsed as numbers into missing values.
df['Age'] = pd.to_numeric(df['Sex/Age'].str.slice(1), errors='coerce').astype('Int64')

df.drop(columns=['Sex/Age'], inplace=True)

In [None]:
def convert_to_second(time_str):
    """Convert a time string to a number of seconds.

    Two string layouts are understood:

    * ``"M:SS.s"`` -- minutes and seconds separated by a colon.
    * ``"SS.s"``   -- seconds only, with no colon. Previously this layout
      failed tuple unpacking and was silently reported as 0.

    Args:
        time_str: The value to convert. Only ``str`` values are parsed.

    Returns:
        The time in seconds as a float for a parseable string, ``0`` for a
        string that cannot be parsed (non-numeric parts or more than one
        colon), and the input itself, unchanged, when it is not a string
        (for example NaN or None).
    """
    # Non-string values (NaN and the like) are passed through untouched.
    if not isinstance(time_str, str):
        return time_str

    parts = time_str.split(':')
    try:
        if len(parts) == 1:
            # No colon: the whole string is a number of seconds.
            return float(parts[0])
        if len(parts) == 2:
            minutes, seconds = map(float, parts)
            return minutes * 60 + seconds
    except ValueError:
        # A part was not numeric; fall through to the malformed-input result.
        pass
    # Malformed input keeps the original best-effort result of 0.
    return 0
# Convert every 'Time_x' entry to seconds, element by element.
df['Time_x'] = df['Time_x'].map(convert_to_second)

In [None]:
# Split 'Distance': its first character becomes 'Ground', the remainder
# stays in 'Distance'.
df['Ground'] = df['Distance'].str.get(0)
df['Distance'] = df['Distance'].str.slice(1)

# One-hot encode Ground, Condition and Weather.
dummy = pd.get_dummies(df[['Ground', 'Condition', 'Weather']])

# Append the dummy columns, then discard the original categorical columns.
df = pd.concat([df, dummy], axis='columns').drop(
    columns=['Ground', 'Condition', 'Weather']
)
df

# Exploration helper (disabled). Note: 'Condition' has already been dropped
# above, so this would have to run before the drop.
# first_chars = df['Condition'].dropna().apply(lambda x: str(x)[0])  # skip NaN, take the first character
# unique_first_chars = first_chars.unique()  # collect the distinct characters

# # Show the result
# print("0インデックス目のユニークな文字:", list(unique_first_chars))

In [None]:
# Strip the "(降)" marker from Rank: the character class removes every
# '(', ')' and '降' character, after which the value is cast to int.
df['Rank'] = (
    df['Rank']
    .astype(str)
    .str.replace(r'[()降]', '', regex=True)
    .astype(int)
)

# Distance is still text at this point; make it numeric.
df['Distance'] = df['Distance'].astype(float)

In [None]:
# Select a subset of columns and print their dtypes.
df1 = df[
    [
        "Rank",
        "Kinryou",
        "Time_x",
        "Nobori",
        "Tansyou",
        "Ninki",
        "Shoukin",
        "Race Number",
        "Distance",
        "Weight",
        "Weight Change",
        "Sex",
        "Age",
    ]
]
print(
    'データ型の確認（型変換前）\n{}'.format(df1.dtypes)
)

In [None]:
# Write the formatted table to CSV without the index column.
df.to_csv(path_or_buf='format_data.csv', index=False)