In [167]:
import pandas as pd

In [168]:
# Load the race results CSV from the data/ directory into the working frame.
df = pd.read_csv('data/race_results_with_ids.csv')

In [169]:
# Display the freshly loaded frame to inspect its rows and columns.
df

Unnamed: 0,Race ID,Race Name,Race Time,Track Info,Weather Icon,Grade,Finish Position,Bracket Number,Horse Number,Horse Name,...,Final Time,Margin,Position at Bends,Last 3F,Odds,Favorite,Horse Weight (kg),Trainer,Owner,Prize (¥ mil)
0,198906050510,,15:25,T1600m(R Outer),Weather01,G3,1,8,15,Yamatake Sally,...,1:35.5,,1-1-1,35.4,14.9,8.0,452(+6),S.Hatakeyama,Takeyuki Yamanaka,28.0
1,198906050510,,15:25,T1600m(R Outer),Weather01,G3,2,3,5,Takara Smile,...,1:35.5,hd,5-5-4,34.9,13.0,7.0,474(+4),E.Sakamoto,Teruo Murayama,11.0
2,198906050510,,15:25,T1600m(R Outer),Weather01,G3,3,6,10,Feather My Hat,...,1:35.5,nse,3-3-3,35.2,4.7,2.0,416(0),K.Hongo,Shadai Race Horse Co. Ltd.,7.0
3,198906050510,,15:25,T1600m(R Outer),Weather01,G3,4,1,1,Asahi Pasion,...,1:35.7,1.1/4,8-7-7,34.9,3.4,1.0,478(0),Z.Ishige,K.Terauchi,4.2
4,198906050510,,15:25,T1600m(R Outer),Weather01,G3,5,3,4,Star Roman,...,1:35.9,1.1/4,2-2-2,35.7,5.3,3.0,476(-4),K.Takamatsu,Horseman,2.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61397,202510011011,KOKURA DAISHOTEN,15:20,T1800m(R),Weather02,G3,10,7,11,Yamanin Ours,...,1:47.0,3/4,2-2-2-2,36.3,6.4,3.0,600(+9),T.Saito,Hajime Doi,
61398,202510011011,KOKURA DAISHOTEN,15:20,T1800m(R),Weather02,G3,11,8,14,Air Fanditha,...,1:47.0,hd,12-11-13-13,34.5,17.9,9.0,460(-10),M.Ikezoe,Lucky Field Co. Ltd.,
61399,202510011011,KOKURA DAISHOTEN,15:20,T1800m(R),Weather02,G3,12,8,13,Galaxy Knight,...,1:47.4,2.1/2,9-9-11-11,35.1,32.1,13.0,506(-2),T.Kikuzawa,Lion Race Horse Co. Ltd.,
61400,202510011011,KOKURA DAISHOTEN,15:20,T1800m(R),Weather02,G3,13,4,5,Ho O Purosangue,...,1:47.4,nse,6-4-3-4,35.9,10.3,6.0,490(-2),Y.Yahagi,Yoshihisa Ozasa,


In [170]:
# List the column names available in the loaded frame.
df.columns

Index(['Race ID', 'Race Name', 'Race Time', 'Track Info', 'Weather Icon',
       'Grade', 'Finish Position', 'Bracket Number', 'Horse Number',
       'Horse Name', 'Horse ID', 'Age/Sex', 'Weight (kg)', 'Jockey',
       'Final Time', 'Margin', 'Position at Bends', 'Last 3F', 'Odds',
       'Favorite', 'Horse Weight (kg)', 'Trainer', 'Owner', 'Prize (¥ mil)'],
      dtype='object')

In [171]:
# Tally how many horse rows each race has
race_counts = df.groupby('Race ID').size()

# Race IDs whose field is exactly 16 horses
valid_races = race_counts.loc[race_counts.eq(16)].index

# Report the size before filtering
print(f"Original shape: {df.shape}")

# Retain only the rows that belong to one of those races
df = df.loc[df['Race ID'].isin(valid_races)]

# Report how much data survived the filter
print(f"Number of races after filtering: {len(valid_races)}")
print(f"New shape: {df.shape}")

Original shape: (61402, 24)
Number of races after filtering: 1164
New shape: (18624, 24)


In [172]:
# Columns removed from the working frame before feature engineering
unused_columns = [
    'Race Name', 'Race Time', 'Track Info', 'Grade',
    'Bracket Number', 'Horse Number', 'Jockey', 'Margin',
    'Position at Bends', 'Last 3F', 'Odds', 'Favorite',
    'Trainer', 'Owner', 'Prize (¥ mil)',
]
df = df.drop(columns=unused_columns)

In [173]:
# Display the frame after the 16-horse filter and the column drop.
df

Unnamed: 0,Race ID,Weather Icon,Finish Position,Horse Name,Horse ID,Age/Sex,Weight (kg),Final Time,Horse Weight (kg)
87,198906050809,Weather03,1,Inari One,1984106229,6H,56.0,2:31.7,448(+2)
88,198906050809,Weather03,2,Super Creek,1985104409,5H,57.0,2:31.7,522(-2)
89,198906050809,Weather03,3,Sakura Hokuto O,1986106198,4H,55.0,2:32.1,450(+4)
90,198906050809,Weather03,4,Running Free,1983103889,7H,56.0,2:32.3,458(+6)
91,198906050809,Weather03,5,Oguri Cap,1985102167,5H,57.0,2:32.5,496(0)
...,...,...,...,...,...,...,...,...,...
61154,202507010911,Weather01,12,Tamamo Rock,2020104861,5H,57.0,1:52.2,502(-4)
61155,202507010911,Weather01,13,Satono Epic,2021105271,4H,56.0,1:52.2,528(+2)
61156,202507010911,Weather01,14,Futaiten Rock,2019102665,6H,57.0,1:52.2,492(-10)
61157,202507010911,Weather01,15,Loco Por Ti,2018105503,7H,57.0,1:53.2,524(+6)


In [174]:
def calculate_average_speed(df):
    """
    Adds each horse's mean finishing time, in seconds, as an 'average_time' column.

    Despite the function name, the value is an average *time*, not a speed:
    'Final Time' is parsed to seconds and averaged over every row that shares
    the same 'Horse ID'.

    NOTE(review): the mean covers all of a horse's rows in `df`, including the
    row it is attached to -- confirm this is intended if 'average_time' is
    later used to predict that same race.

    Args:
        df: A pandas DataFrame containing 'Horse ID' and 'Final Time'.
            'Final Time' values may be 'M:SS.s' strings (e.g. '1:35.5'),
            plain seconds (string or number), or missing.

    Returns:
        A new DataFrame (the input is not modified) with a fresh 0..n-1 index
        and an added float 'average_time' column. Times that cannot be parsed
        are ignored in the mean; a horse with no parseable time gets NaN.
    """
    def convert_time_to_seconds(time_value):
        # Missing values stay missing; NaN keeps the column float-typed.
        if pd.isna(time_value):
            return float('nan')
        try:
            # str() lets numeric inputs (already in seconds) go through the
            # same path instead of failing on the missing .split attribute.
            parts = str(time_value).split(':')
            if len(parts) == 2:
                minutes, seconds = parts
                return float(minutes) * 60 + float(seconds)
            return float(time_value)
        except (ValueError, TypeError):
            # Unparseable text (e.g. 'bad', 'h:mm:ss') is treated as missing.
            return float('nan')

    # Work on a copy with a clean positional index so the caller's frame is untouched
    result_df = df.reset_index(drop=True)

    # Parsed times live in a standalone Series, so no temporary column is needed
    time_seconds = result_df['Final Time'].apply(convert_time_to_seconds).astype(float)

    # Mean time per horse, then broadcast back onto every row of that horse
    mean_by_horse = time_seconds.groupby(result_df['Horse ID']).mean()
    result_df['average_time'] = result_df['Horse ID'].map(mean_by_horse).astype(float)

    return result_df


In [175]:
# Add the per-horse 'average_time' column; the returned frame replaces df.
df = calculate_average_speed(df)

In [176]:
# Preview the first rows to check the new 'average_time' column.
df.head()

Unnamed: 0,Race ID,Weather Icon,Finish Position,Horse Name,Horse ID,Age/Sex,Weight (kg),Final Time,Horse Weight (kg),average_time
0,198906050809,Weather03,1,Inari One,1984106229,6H,56.0,2:31.7,448(+2),176.85
1,198906050809,Weather03,2,Super Creek,1985104409,5H,57.0,2:31.7,522(-2),176.8
2,198906050809,Weather03,3,Sakura Hokuto O,1986106198,4H,55.0,2:32.1,450(+4),178.95
3,198906050809,Weather03,4,Running Free,1983103889,7H,56.0,2:32.3,458(+6),169.8
4,198906050809,Weather03,5,Oguri Cap,1985102167,5H,57.0,2:32.5,496(0),133.033333


In [177]:
# Remove 'Final Time' and 'Horse ID' now that 'average_time' has been derived from them.
df = df.drop(columns=['Final Time', 'Horse ID'])

In [178]:
# Preview the frame after dropping 'Final Time' and 'Horse ID'.
df.head()

Unnamed: 0,Race ID,Weather Icon,Finish Position,Horse Name,Age/Sex,Weight (kg),Horse Weight (kg),average_time
0,198906050809,Weather03,1,Inari One,6H,56.0,448(+2),176.85
1,198906050809,Weather03,2,Super Creek,5H,57.0,522(-2),176.8
2,198906050809,Weather03,3,Sakura Hokuto O,4H,55.0,450(+4),178.95
3,198906050809,Weather03,4,Running Free,7H,56.0,458(+6),169.8
4,198906050809,Weather03,5,Oguri Cap,5H,57.0,496(0),133.033333


In [179]:
# Split the combined 'Age/Sex' code (e.g. '6H') into separate Age and Sex columns.
# The patterns are raw strings so that '\d' reaches the regex engine as written
# instead of being parsed as an (invalid) string escape; expand=False returns a
# Series for the single capture group rather than a one-column DataFrame.
df['Age'] = df['Age/Sex'].str.extract(r'(\d+)', expand=False)  # first run of digits
df['Sex'] = df['Age/Sex'].str.extract(r'([A-Za-z]+)', expand=False)  # first run of letters

# Convert Age to numeric type
df['Age'] = pd.to_numeric(df['Age'])
df.head()

Unnamed: 0,Race ID,Weather Icon,Finish Position,Horse Name,Age/Sex,Weight (kg),Horse Weight (kg),average_time,Age,Sex
0,198906050809,Weather03,1,Inari One,6H,56.0,448(+2),176.85,6,H
1,198906050809,Weather03,2,Super Creek,5H,57.0,522(-2),176.8,5,H
2,198906050809,Weather03,3,Sakura Hokuto O,4H,55.0,450(+4),178.95,4,H
3,198906050809,Weather03,4,Running Free,7H,56.0,458(+6),169.8,7,H
4,198906050809,Weather03,5,Oguri Cap,5H,57.0,496(0),133.033333,5,H


In [180]:
# Preview again; the recorded output is identical to the previous cell's.
df.head()

Unnamed: 0,Race ID,Weather Icon,Finish Position,Horse Name,Age/Sex,Weight (kg),Horse Weight (kg),average_time,Age,Sex
0,198906050809,Weather03,1,Inari One,6H,56.0,448(+2),176.85,6,H
1,198906050809,Weather03,2,Super Creek,5H,57.0,522(-2),176.8,5,H
2,198906050809,Weather03,3,Sakura Hokuto O,4H,55.0,450(+4),178.95,4,H
3,198906050809,Weather03,4,Running Free,7H,56.0,458(+6),169.8,7,H
4,198906050809,Weather03,5,Oguri Cap,5H,57.0,496(0),133.033333,5,H


In [181]:
# Keep only the body weight from 'Horse Weight (kg)' values such as '448(+2)':
# the first run of digits is captured, which leaves out the parenthesised change.
# The pattern is a raw string so '\d' is not parsed as an invalid string escape;
# expand=False returns a Series for the single capture group.
df['Horse Weight (kg)'] = df['Horse Weight (kg)'].str.extract(r'(\d+)', expand=False)

# Convert to numeric type; anything that is not a number becomes NaN
df['Horse Weight (kg)'] = pd.to_numeric(df['Horse Weight (kg)'], errors='coerce')

# Display the first few rows to verify the changes
df.head()

Unnamed: 0,Race ID,Weather Icon,Finish Position,Horse Name,Age/Sex,Weight (kg),Horse Weight (kg),average_time,Age,Sex
0,198906050809,Weather03,1,Inari One,6H,56.0,448.0,176.85,6,H
1,198906050809,Weather03,2,Super Creek,5H,57.0,522.0,176.8,5,H
2,198906050809,Weather03,3,Sakura Hokuto O,4H,55.0,450.0,178.95,4,H
3,198906050809,Weather03,4,Running Free,7H,56.0,458.0,169.8,7,H
4,198906050809,Weather03,5,Oguri Cap,5H,57.0,496.0,133.033333,5,H


In [182]:
def create_binary_features(df):
    """
    Replace the 'Weather Icon' and 'Sex' columns with indicator columns.

    Args:
        df: Input DataFrame containing 'Weather Icon' and 'Sex'.

    Returns:
        A new DataFrame without the two source columns, followed by the
        'weather_*' indicator columns and then the 'sex_*' indicator columns.
    """
    # (source column, prefix for its indicator columns), in output order
    encodings = (('Weather Icon', 'weather'), ('Sex', 'sex'))

    # One indicator frame per categorical source column
    indicator_frames = [
        pd.get_dummies(df[source], prefix=prefix)
        for source, prefix in encodings
    ]

    # Everything except the encoded source columns, then the indicators
    remaining = df.drop(columns=[source for source, _ in encodings])
    return pd.concat([remaining, *indicator_frames], axis=1)

In [183]:
# Apply binary encoding: 'Weather Icon' and 'Sex' are replaced by
# 'weather_*' / 'sex_*' indicator columns.
df = create_binary_features(df)


In [184]:
# Drop every row that has a missing value in any column. A race that loses a
# row here no longer has 16 rows and is skipped by the flattening loop below.
df = df.dropna()

In [185]:
def _to_numeric_if_possible(column):
    """Return `column` converted with pd.to_numeric, or unchanged if that fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# First pass: convert the columns that are already fully numeric and leave the
# rest untouched. This is the explicit form of the deprecated
# pd.to_numeric(..., errors='ignore').
df = df.apply(_to_numeric_if_possible)

# Second pass: for the columns that are still text, keep the first
# number-like token of each value; values without one become NaN.
for col in df.columns:
    # Only object-dtype (string) columns are processed
    if df[col].dtype == 'object':
        try:
            # Strip thousands separators, then pull out an optionally signed number
            df[col] = pd.to_numeric(
                df[col].str.replace(',', '').str.extract(r'([+-]?\d+\.?\d*)')[0],
                errors='coerce',
            )
        except Exception:
            # Best effort: leave the column as it is if it cannot be processed
            continue


In [186]:
# Per-horse input columns, in the order they appear for each horse slot.
# This list was not defined in any earlier cell, so the cell failed on a fresh
# kernel; it matches the horseN_* columns in this cell's recorded output.
features = ['Age', 'Weight (kg)', 'Horse Weight (kg)', 'average_time']

# Every flattened row describes exactly this many horses
HORSES_PER_RACE = 16

# One column per (horse slot, feature): horse1_Age, horse1_Weight (kg), ...
column_names = [
    f'horse{slot}_{f}'
    for slot in range(1, HORSES_PER_RACE + 1)
    for f in features
]

flattened_rows = []
targets = []

for race_id, group in df.groupby('Race ID'):
    # Skip races that no longer have a full field (e.g. rows removed by dropna)
    if len(group) != HORSES_PER_RACE:
        continue

    # Shuffle the horses so the winner's slot varies from race to race.
    # Deliberately unseeded, so the output differs between runs.
    shuffled = group.sample(frac=1).reset_index(drop=True)

    # Slot (0-based) of the winning horse after shuffling; skip races without one
    winner_idx = shuffled[shuffled['Finish Position'] == 1].index
    if winner_idx.empty:
        continue
    target = winner_idx[0]

    # Concatenate the feature values of all horses, slot by slot, into one row
    flattened_row = []
    for horse_values in shuffled[features].itertuples(index=False, name=None):
        flattened_row.extend(horse_values)

    flattened_rows.append(flattened_row)
    targets.append(target)

# Assemble the final table: flattened features plus the winner's slot as 'target'
X_df = pd.DataFrame(flattened_rows, columns=column_names)
y_df = pd.DataFrame({'target': targets})
final_df = pd.concat([X_df, y_df], axis=1)
final_df


Unnamed: 0,horse1_Age,horse1_Weight (kg),horse1_Horse Weight (kg),horse1_average_time,horse2_Age,horse2_Weight (kg),horse2_Horse Weight (kg),horse2_average_time,horse3_Age,horse3_Weight (kg),...,horse14_average_time,horse15_Age,horse15_Weight (kg),horse15_Horse Weight (kg),horse15_average_time,horse16_Age,horse16_Weight (kg),horse16_Horse Weight (kg),horse16_average_time,target
0,5,57.0,496.0,133.033333,7,56.0,464.0,153.100000,8,56.0,...,153.200000,4,55.0,464.0,154.750000,5,57.0,502.0,124.550000,4
1,6,57.0,434.0,99.700000,6,53.0,476.0,70.300000,6,55.0,...,101.520000,4,53.0,548.0,69.600000,7,56.0,494.0,69.300000,7
2,5,54.0,490.0,112.666667,5,54.0,476.0,112.700000,4,53.0,...,122.700000,6,50.0,548.0,123.800000,5,56.5,456.0,122.800000,6
3,4,49.0,430.0,108.300000,6,57.0,464.0,125.760000,7,56.0,...,115.200000,5,50.0,466.0,102.200000,6,51.0,458.0,108.400000,8
4,4,54.0,454.0,104.600000,4,54.0,474.0,103.150000,4,54.0,...,99.966667,4,54.0,464.0,122.100000,4,54.0,448.0,109.700000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1055,9,57.0,532.0,107.571429,6,58.0,482.0,96.966667,7,57.0,...,104.800000,7,59.0,508.0,96.533333,6,57.0,486.0,112.600000,13
1056,3,55.0,498.0,108.200000,3,55.0,466.0,109.500000,3,55.0,...,110.100000,3,55.0,474.0,108.500000,3,55.0,482.0,102.900000,8
1057,6,57.0,444.0,99.120000,5,57.0,490.0,93.900000,8,58.0,...,93.800000,5,57.0,494.0,98.600000,7,55.0,544.0,95.700000,2
1058,6,57.0,500.0,125.650000,4,57.5,510.0,131.900000,8,59.5,...,125.600000,5,57.5,490.0,129.800000,4,56.0,540.0,130.300000,14


In [187]:
# Write the flattened dataset to CSV, leaving out the row index
final_df.to_csv('data/processed_race_results.csv', index=False)
print("DataFrame successfully saved to 'processed_race_results.csv'")

DataFrame successfully saved to 'processed_race_results.csv'
