In [43]:
import pandas as pd

In [44]:
# Load the race-results CSV (path is relative to the notebook's working directory)
df = pd.read_csv('data/race_results_with_ids.csv')

In [45]:
# Display the freshly loaded DataFrame
df

Unnamed: 0,Race ID,Race Name,Race Time,Track Info,Weather Icon,Grade,Finish Position,Bracket Number,Horse Number,Horse Name,...,Final Time,Margin,Position at Bends,Last 3F,Odds,Favorite,Horse Weight (kg),Trainer,Owner,Prize (¥ mil)
0,198906050510,,15:25,T1600m(R Outer),Weather01,G3,1,8,15,Yamatake Sally,...,1:35.5,,1-1-1,35.4,14.9,8.0,452(+6),S.Hatakeyama,Takeyuki Yamanaka,28.0
1,198906050510,,15:25,T1600m(R Outer),Weather01,G3,2,3,5,Takara Smile,...,1:35.5,hd,5-5-4,34.9,13.0,7.0,474(+4),E.Sakamoto,Teruo Murayama,11.0
2,198906050510,,15:25,T1600m(R Outer),Weather01,G3,3,6,10,Feather My Hat,...,1:35.5,nse,3-3-3,35.2,4.7,2.0,416(0),K.Hongo,Shadai Race Horse Co. Ltd.,7.0
3,198906050510,,15:25,T1600m(R Outer),Weather01,G3,4,1,1,Asahi Pasion,...,1:35.7,1.1/4,8-7-7,34.9,3.4,1.0,478(0),Z.Ishige,K.Terauchi,4.2
4,198906050510,,15:25,T1600m(R Outer),Weather01,G3,5,3,4,Star Roman,...,1:35.9,1.1/4,2-2-2,35.7,5.3,3.0,476(-4),K.Takamatsu,Horseman,2.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61397,202510011011,KOKURA DAISHOTEN,15:20,T1800m(R),Weather02,G3,10,7,11,Yamanin Ours,...,1:47.0,3/4,2-2-2-2,36.3,6.4,3.0,600(+9),T.Saito,Hajime Doi,
61398,202510011011,KOKURA DAISHOTEN,15:20,T1800m(R),Weather02,G3,11,8,14,Air Fanditha,...,1:47.0,hd,12-11-13-13,34.5,17.9,9.0,460(-10),M.Ikezoe,Lucky Field Co. Ltd.,
61399,202510011011,KOKURA DAISHOTEN,15:20,T1800m(R),Weather02,G3,12,8,13,Galaxy Knight,...,1:47.4,2.1/2,9-9-11-11,35.1,32.1,13.0,506(-2),T.Kikuzawa,Lion Race Horse Co. Ltd.,
61400,202510011011,KOKURA DAISHOTEN,15:20,T1800m(R),Weather02,G3,13,4,5,Ho O Purosangue,...,1:47.4,nse,6-4-3-4,35.9,10.3,6.0,490(-2),Y.Yahagi,Yoshihisa Ozasa,


In [46]:
# List the column labels available in the loaded data
df.columns

Index(['Race ID', 'Race Name', 'Race Time', 'Track Info', 'Weather Icon',
       'Grade', 'Finish Position', 'Bracket Number', 'Horse Number',
       'Horse Name', 'Horse ID', 'Age/Sex', 'Weight (kg)', 'Jockey',
       'Final Time', 'Margin', 'Position at Bends', 'Last 3F', 'Odds',
       'Favorite', 'Horse Weight (kg)', 'Trainer', 'Owner', 'Prize (¥ mil)'],
      dtype='object')

In [47]:
# Tally how many rows (runners) each race contributes
race_counts = df.groupby('Race ID').size()

# Collect the IDs of races whose field is exactly 16 runners
valid_races = race_counts.loc[race_counts == 16].index

print(f"Original shape: {df.shape}")

# Retain only the rows that belong to one of those races
df = df.loc[df['Race ID'].isin(valid_races)]

# Report how much data survived the filter
print(f"Number of races after filtering: {len(valid_races)}")
print(f"New shape: {df.shape}")

Original shape: (61402, 24)
Number of races after filtering: 1164
New shape: (18624, 24)


In [48]:
# Remove every column that the remaining cells of this notebook do not read
df = df.drop(
    columns=[
        # race-level fields
        'Race ID',
        'Race Name',
        'Race Time',
        'Track Info',
        'Grade',
        # runner identification
        'Bracket Number',
        'Horse Number',
        'Horse Name',
        'Jockey',
        # in-race and betting fields
        'Margin',
        'Position at Bends',
        'Last 3F',
        'Odds',
        'Favorite',
        # connections and prize money
        'Trainer',
        'Owner',
        'Prize (¥ mil)',
    ]
)

In [49]:
# Display the frame with only the retained columns
df

Unnamed: 0,Weather Icon,Finish Position,Horse ID,Age/Sex,Weight (kg),Final Time,Horse Weight (kg)
87,Weather03,1,1984106229,6H,56.0,2:31.7,448(+2)
88,Weather03,2,1985104409,5H,57.0,2:31.7,522(-2)
89,Weather03,3,1986106198,4H,55.0,2:32.1,450(+4)
90,Weather03,4,1983103889,7H,56.0,2:32.3,458(+6)
91,Weather03,5,1985102167,5H,57.0,2:32.5,496(0)
...,...,...,...,...,...,...,...
61154,Weather01,12,2020104861,5H,57.0,1:52.2,502(-4)
61155,Weather01,13,2021105271,4H,56.0,1:52.2,528(+2)
61156,Weather01,14,2019102665,6H,57.0,1:52.2,492(-10)
61157,Weather01,15,2018105503,7H,57.0,1:53.2,524(+6)


In [50]:
def calculate_average_speed(df):
    """
    Attach each horse's mean finishing time, in seconds, to all of its rows.

    Despite the function name, the value computed is an average *time*, not
    a speed: every 'Final Time' value is converted to seconds and the mean
    is taken per 'Horse ID'.

    Args:
        df: A pandas DataFrame containing at least the 'Horse ID' and
            'Final Time' columns. 'Final Time' may hold strings such as
            '1:35.5' (minutes:seconds) or plain second counts. The input
            frame is not modified.

    Returns:
        A new DataFrame with the same rows in the same order, a fresh
        0..n-1 index, and a float 'average_time' column. Horses without any
        parseable time get NaN.

    Note:
        The mean covers every row of the horse that is present in ``df``,
        including the row it is written back to.
        NOTE(review): if 'average_time' is later used to predict the result
        of that same row, this may leak the outcome - confirm.
    """
    def convert_time_to_seconds(time_str):
        """Convert 'M:SS.s' or plain seconds to float; NaN if unparseable."""
        if pd.isna(time_str):
            return float('nan')
        try:
            # str() lets numeric inputs (already in seconds) pass through
            # instead of failing on the missing .split attribute.
            parts = str(time_str).split(':')
            if len(parts) == 2:
                minutes, seconds = parts
                return float(minutes) * 60 + float(seconds)
            return float(time_str)
        except (TypeError, ValueError):
            # Only parsing problems are treated as "no time"; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            return float('nan')

    # New frame with a clean positional index, so the caller's frame is left
    # untouched and the per-row alignment below cannot be confused by
    # duplicate index labels.
    result_df = df.reset_index(drop=True)

    # Forcing float keeps the mean well-defined even when no value parses.
    time_seconds = (
        result_df['Final Time'].apply(convert_time_to_seconds).astype(float)
    )

    # transform('mean') broadcasts each horse's mean back onto its own rows,
    # which avoids a merge and any temporary helper column.
    result_df['average_time'] = (
        time_seconds.groupby(result_df['Horse ID']).transform('mean')
    )

    return result_df


In [51]:
# Add the per-horse 'average_time' column; the returned frame replaces df
df = calculate_average_speed(df)

In [52]:
# Check the first rows, which now include 'average_time'
df.head()

Unnamed: 0,Weather Icon,Finish Position,Horse ID,Age/Sex,Weight (kg),Final Time,Horse Weight (kg),average_time
0,Weather03,1,1984106229,6H,56.0,2:31.7,448(+2),176.85
1,Weather03,2,1985104409,5H,57.0,2:31.7,522(-2),176.8
2,Weather03,3,1986106198,4H,55.0,2:32.1,450(+4),178.95
3,Weather03,4,1983103889,7H,56.0,2:32.3,458(+6),169.8
4,Weather03,5,1985102167,5H,57.0,2:32.5,496(0),133.033333


In [53]:
# 'Final Time' was only needed to derive 'average_time'; remove it
df = df.drop(columns=['Final Time'])

In [54]:
# Confirm that 'Final Time' is gone
df.head()

Unnamed: 0,Weather Icon,Finish Position,Horse ID,Age/Sex,Weight (kg),Horse Weight (kg),average_time
0,Weather03,1,1984106229,6H,56.0,448(+2),176.85
1,Weather03,2,1985104409,5H,57.0,522(-2),176.8
2,Weather03,3,1986106198,4H,55.0,450(+4),178.95
3,Weather03,4,1983103889,7H,56.0,458(+6),169.8
4,Weather03,5,1985102167,5H,57.0,496(0),133.033333


In [55]:
# Split the combined 'Age/Sex' value (e.g. '6H') into separate Age and Sex columns.
# The patterns are raw strings so that '\d' reaches the regex engine as-is
# instead of being an invalid string escape; expand=False returns a Series
# rather than a one-column DataFrame.
df['Age'] = df['Age/Sex'].str.extract(r'(\d+)', expand=False)  # first run of digits
df['Sex'] = df['Age/Sex'].str.extract(r'([A-Za-z]+)', expand=False)  # first run of letters

# Convert Age from digit strings to a numeric type
df['Age'] = pd.to_numeric(df['Age'])
df.head()

Unnamed: 0,Weather Icon,Finish Position,Horse ID,Age/Sex,Weight (kg),Horse Weight (kg),average_time,Age,Sex
0,Weather03,1,1984106229,6H,56.0,448(+2),176.85,6,H
1,Weather03,2,1985104409,5H,57.0,522(-2),176.8,5,H
2,Weather03,3,1986106198,4H,55.0,450(+4),178.95,4,H
3,Weather03,4,1983103889,7H,56.0,458(+6),169.8,7,H
4,Weather03,5,1985102167,5H,57.0,496(0),133.033333,5,H


In [56]:
# 'Age/Sex' has been split into 'Age' and 'Sex', and 'Horse ID' was only the
# grouping key for 'average_time', so both can be removed
df = df.drop(columns=['Age/Sex', 'Horse ID'])

In [57]:
# Confirm the remaining columns
df.head()

Unnamed: 0,Weather Icon,Finish Position,Weight (kg),Horse Weight (kg),average_time,Age,Sex
0,Weather03,1,56.0,448(+2),176.85,6,H
1,Weather03,2,57.0,522(-2),176.8,5,H
2,Weather03,3,55.0,450(+4),178.95,4,H
3,Weather03,4,56.0,458(+6),169.8,7,H
4,Weather03,5,57.0,496(0),133.033333,5,H


In [58]:
# Keep only the first run of digits of 'Horse Weight (kg)', e.g. '448(+2)' -> '448';
# the parenthesised suffix is discarded.
# The pattern is a raw string so that '\d' is not an invalid string escape;
# expand=False returns a Series rather than a one-column DataFrame.
df['Horse Weight (kg)'] = df['Horse Weight (kg)'].str.extract(r'(\d+)', expand=False)

# Convert to numeric type; anything that is not a number becomes NaN
df['Horse Weight (kg)'] = pd.to_numeric(df['Horse Weight (kg)'], errors='coerce')

# Display the first few rows to verify the changes
df.head()

Unnamed: 0,Weather Icon,Finish Position,Weight (kg),Horse Weight (kg),average_time,Age,Sex
0,Weather03,1,56.0,448.0,176.85,6,H
1,Weather03,2,57.0,522.0,176.8,5,H
2,Weather03,3,55.0,450.0,178.95,4,H
3,Weather03,4,56.0,458.0,169.8,7,H
4,Weather03,5,57.0,496.0,133.033333,5,H


In [59]:
def create_binary_features(df):
    """
    Replace the 'Weather Icon' and 'Sex' columns with indicator columns.

    Args:
        df: Input DataFrame containing 'Weather Icon' and 'Sex'.

    Returns:
        A new DataFrame in which those two columns are removed and, appended
        after the remaining columns, one 'weather_<value>' column per
        distinct weather value followed by one 'sex_<value>' column per
        distinct sex value.
    """
    # Source column -> prefix for its indicator columns (order is preserved)
    prefix_by_column = {'Weather Icon': 'weather', 'Sex': 'sex'}

    # One indicator frame per categorical column
    indicator_frames = [
        pd.get_dummies(df[column], prefix=prefix)
        for column, prefix in prefix_by_column.items()
    ]

    # Everything except the encoded columns, then the indicators side by side
    remaining = df.drop(list(prefix_by_column), axis=1)
    return pd.concat([remaining, *indicator_frames], axis=1)

In [60]:
# Apply binary encoding: 'Weather Icon' and 'Sex' are replaced by indicator columns
df = create_binary_features(df)


In [61]:
# Label each row: 1 when the numeric finish position is <= 1, otherwise 0.
# Positions that are not numbers become NaN, compare as False and so get 0.
df['is_top1'] = (
    pd.to_numeric(df['Finish Position'], errors='coerce').le(1).astype(int)
)

# The raw position is no longer needed once the label exists
df = df.drop(columns=['Finish Position'])
df.head()

Unnamed: 0,Weight (kg),Horse Weight (kg),average_time,Age,weather_Weather01,weather_Weather02,weather_Weather03,weather_Weather04,sex_C,sex_F,sex_G,sex_H,sex_M,is_top1
0,56.0,448.0,176.85,6,False,False,True,False,False,False,False,True,False,1
1,57.0,522.0,176.8,5,False,False,True,False,False,False,False,True,False,0
2,55.0,450.0,178.95,4,False,False,True,False,False,False,False,True,False,0
3,56.0,458.0,169.8,7,False,False,True,False,False,False,False,True,False,0
4,57.0,496.0,133.033333,5,False,False,True,False,False,False,False,True,False,0


In [62]:
# Remove every row that still has a missing value in any column
df = df.dropna()

In [63]:
# Write the processed frame to CSV, leaving the index out of the file
df.to_csv('data/processed_race_results.csv', index=False)
# Plain string: the message has no placeholders, so no f-string is needed
print("DataFrame successfully saved to 'processed_race_results.csv'")

DataFrame successfully saved to 'processed_race_results.csv'
