In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist

In [4]:
# Directory holding Ends.csv, Games.csv and Stones.csv. Set the
# CURLING_DATA_DIR environment variable to point somewhere else; without it
# the original author's local path is used, so existing runs are unaffected.
DATA_DIR = Path(os.environ.get('CURLING_DATA_DIR',
                               '/Users/brentkong/Documents/curling/data'))

ends_df = pd.read_csv(DATA_DIR / 'Ends.csv')
games_df = pd.read_csv(DATA_DIR / 'Games.csv')
stones_df = pd.read_csv(DATA_DIR / 'Stones.csv')

In [5]:
# Columns that together identify one game across all three tables.
GAME_KEY_COLS = ['CompetitionID', 'SessionID', 'GameID']


def build_match_id(df):
    """Return '<CompetitionID>_<SessionID>_<GameID>' for every row of ``df``."""
    competition, session, game = (df[col].astype(str) for col in GAME_KEY_COLS)
    return competition + '_' + session + '_' + game


# Create unique match identifier (same recipe for every table, so the ids
# line up when the tables are joined).
games_df['match_id'] = build_match_id(games_df)
ends_df['match_id'] = build_match_id(ends_df)
stones_df['match_id'] = build_match_id(stones_df)

# Merge games with ends for game context. The raw key columns are part of the
# join so they are not duplicated with _x/_y suffixes in the result.
merged = ends_df.merge(games_df, on=['match_id', *GAME_KEY_COLS])

In [None]:
def create_game_state_features(df):
    """Add per-end game-state columns to a team-by-end frame.

    ``df`` is modified in place and also returned. The running totals follow
    row order inside each ``(match_id, TeamID)`` group, so rows are assumed to
    already be in end order -- TODO confirm, or sort before calling.

    Columns added:
        score_diff: the team's own cumulative ``Result`` over the ends before
            the current one (0 in the team's first row).
            NOTE(review): despite the name, the opponent's score is not
            subtracted; confirm whether a true differential is wanted.
        cum_score: the team's cumulative ``Result`` including the current end.
        has_hammer: True on odd ``EndID`` when ``LSFE == 1`` and on even
            ``EndID`` when ``LSFE == 0``.
            NOTE(review): this ignores which team the row belongs to and
            assumes the hammer strictly alternates -- verify against the
            definition of ``LSFE`` and the hammer rules.
        pp_available: True while the team has not used a Power Play in any
            EARLIER end of the match, so the end in which the Power Play is
            called still counts as available.

    Args:
        df: DataFrame with columns ``match_id``, ``TeamID``, ``EndID``,
            ``Result``, ``LSFE`` and ``PowerPlay``.

    Returns:
        The same DataFrame object with the four columns added.
    """
    team_keys = ['match_id', 'TeamID']

    # Points banked before the current end: shift by one end, then accumulate.
    df['score_diff'] = df.groupby(team_keys)['Result'].transform(
        lambda results: results.shift().cumsum().fillna(0)
    )

    # Running score including the current end.
    df['cum_score'] = df.groupby(team_keys)['Result'].cumsum()

    # Hammer (last stone advantage), alternating by end parity from LSFE.
    odd_end = df['EndID'] % 2 == 1
    even_end = df['EndID'] % 2 == 0
    df['has_hammer'] = (odd_end & (df['LSFE'] == 1)) | (even_end & (df['LSFE'] == 0))

    # Power Play eligibility. The usage flags are shifted by one end so the
    # end where the Power Play is played is itself still "available";
    # without the shift every Power Play end would be marked unavailable.
    df['pp_available'] = df.groupby(team_keys)['PowerPlay'].transform(
        lambda used: used.fillna(0).shift(fill_value=0).cumsum() == 0
    )

    return df

In [None]:
def extract_geometric_features(stone_df, button_x=750, button_y=3000,
                               house_radius=180, guard_radius=600):
    """Summarise the stone layout of every shot row as geometric features.

    Each row of ``stone_df`` carries the coordinates of twelve stones in the
    columns ``stone_1_x`` .. ``stone_12_x`` and ``stone_1_y`` .. ``stone_12_y``.
    A stone is treated as in play unless either coordinate is 0 or 4095 or is
    missing (NaN).

    Args:
        stone_df: DataFrame with ``match_id``, ``EndID``, ``ShotID`` and the
            24 stone coordinate columns.
        button_x, button_y: Coordinates of the house centre. The defaults are
            the original hard-coded values, which assumed the "top" house --
            TODO confirm against the coordinate system of the data.
        house_radius: Stones closer to the button than this count as house
            stones.
        guard_radius: Stones at a distance in ``[house_radius, guard_radius)``
            count as guards.

    Returns:
        DataFrame with one row per input row and the columns ``match_id``,
        ``EndID``, ``ShotID``, ``num_stones_in_play``, ``num_house_stones``,
        ``num_guards``, ``avg_guard_distance``, ``guard_angle_std``,
        ``house_spread``, ``left_path_clear``, ``right_path_clear``,
        ``clustering_index`` and ``distance_to_button``. An empty input gives
        an empty frame that still has these columns.
    """
    feature_columns = [
        'match_id', 'EndID', 'ShotID',
        'num_stones_in_play', 'num_house_stones', 'num_guards',
        'avg_guard_distance', 'guard_angle_std', 'house_spread',
        'left_path_clear', 'right_path_clear',
        'clustering_index', 'distance_to_button',
    ]
    not_in_play = (0, 4095)   # sentinel coordinates of stones that are not in play
    missing_distance = 1000   # placeholder distance when no guard / house stone exists
    # Thresholds of the simplified draw-path check (not a real line-of-sight
    # test). NOTE(review): they are fixed values and do not follow button_x.
    path_min_y, left_lane_max_x, right_lane_min_x = 1500, 600, 900

    stone_numbers = range(1, 13)
    xs = stone_df[[f'stone_{i}_x' for i in stone_numbers]].to_numpy(dtype=float)
    ys = stone_df[[f'stone_{i}_y' for i in stone_numbers]].to_numpy(dtype=float)

    # Computed once for the whole table instead of once per stone per row.
    in_play = ~(np.isin(xs, not_in_play) | np.isin(ys, not_in_play)
                | np.isnan(xs) | np.isnan(ys))
    dist_to_button = np.sqrt((xs - button_x) ** 2 + (ys - button_y) ** 2)

    features = []
    shot_keys = zip(stone_df['match_id'], stone_df['EndID'], stone_df['ShotID'])
    for row, (match_id, end_id, shot_id) in enumerate(shot_keys):
        live = in_play[row]
        x, y, dist = xs[row, live], ys[row, live], dist_to_button[row, live]

        is_house = dist < house_radius
        is_guard = (dist >= house_radius) & (dist < guard_radius)

        # Feature 1: guard coverage (mean distance and angular spread around the button).
        if is_guard.any():
            avg_guard_dist = dist[is_guard].mean()
            guard_angles = np.arctan2(y[is_guard] - button_y, x[is_guard] - button_x)
            guard_angle_std = np.std(guard_angles)
        else:
            avg_guard_dist = missing_distance  # large penalty for no guards
            guard_angle_std = 0

        # Feature 2: house occupation.
        house_spread = np.std(dist[is_house]) if is_house.any() else 0

        # Feature 3: draw path -- a side is blocked as soon as any stone
        # beyond path_min_y sits in that side's lane.
        beyond_min_y = y > path_min_y
        left_path_clear = not bool((beyond_min_y & (x < left_lane_max_x)).any())
        right_path_clear = not bool((beyond_min_y & (x > right_lane_min_x)).any())

        # Feature 4: clustering (higher = stones closer together).
        if len(x) > 1:
            mean_pair_dist = np.mean(pdist(np.column_stack((x, y))))
            clustering_index = 1 / (mean_pair_dist + 1)
        else:
            clustering_index = 0

        features.append({
            'match_id': match_id,
            'EndID': end_id,
            'ShotID': shot_id,
            'num_stones_in_play': int(live.sum()),
            'num_house_stones': int(is_house.sum()),
            'num_guards': int(is_guard.sum()),
            'avg_guard_distance': avg_guard_dist,
            'guard_angle_std': guard_angle_std,
            'house_spread': house_spread,
            'left_path_clear': left_path_clear,
            'right_path_clear': right_path_clear,
            'clustering_index': clustering_index,
            # Closest HOUSE stone only; guards never count here.
            'distance_to_button': dist[is_house].min() if is_house.any() else missing_distance,
        })

    return pd.DataFrame(features, columns=feature_columns)

In [None]:
def create_opponent_features(df):
    """Attach opponent-strength and Power Play history columns.

    The input frame is left untouched; a new frame is returned. Calling the
    function again on its own output overwrites the two columns instead of
    producing ``_x``/``_y`` duplicates.

    Columns added:
        opponent_avg_points: mean ``Result`` (rounded to 3 decimals) over all
            rows of the opposing team. The opponent is ``TeamID2`` when
            ``TeamID == TeamID1`` and ``TeamID1`` otherwise. NaN when the
            opponent has no rows in ``df``.
        pp_success_rate: mean ``Result`` of the row's own team over the rows
            where ``PowerPlay`` is not null. NaN for teams with no such rows.

    NOTE(review): both statistics are computed over every row of ``df``,
    including the row they are attached to -- confirm this is acceptable for
    the downstream model.

    Args:
        df: DataFrame with columns ``TeamID``, ``TeamID1``, ``TeamID2``,
            ``Result`` and ``PowerPlay``.

    Returns:
        A new DataFrame with the two columns added.
    """
    # Average points per row for every team.
    avg_points = df.groupby('TeamID')['Result'].mean().round(3)

    # Pick "the other team" of each row without a Python-level row loop.
    row_is_team1 = df['TeamID'] == df['TeamID1']
    opponent_id = df['TeamID2'].where(row_is_team1, df['TeamID1'])

    # Average result of each team in the ends where it played a Power Play.
    pp_ends = df[df['PowerPlay'].notna()]
    pp_success = pp_ends.groupby('TeamID')['Result'].mean()

    return df.assign(
        opponent_avg_points=opponent_id.map(avg_points),
        pp_success_rate=df['TeamID'].map(pp_success),
    )

In [None]:
def create_decision_dataset(merged_df, stone_features):
    """Build the table of Power Play decision points.

    Steps:
      1. Reduce ``stone_features`` to one row per ``(match_id, EndID)``: the
         row with the highest ``ShotID``.
      2. Left-join that geometry onto ``merged_df``.
      3. Add outcome, treatment and lagged-state columns.
      4. Keep only rows where the team has the hammer, the Power Play is
         still available and ``EndID <= 8``.

    Columns added:
        points_scored: copy of ``Result``.
        used_pp: 1 when ``PowerPlay`` is not null, else 0.
        pp_side: ``PowerPlay`` with nulls replaced by 0.
        score_diff_before: the team's ``cum_score`` from its previous row in
            the match, 0 in the team's first row.
            NOTE(review): this is the team's own prior score, not a
            differential against the opponent, despite the name.
        cum_ends_played: 1-based position of the row within its
            ``(match_id, TeamID)`` group.

    NOTE(review): the geometry comes from the LAST shot of the end, i.e. the
    layout after the end was played; confirm this is intended for features
    described as "state at decision time".

    Args:
        merged_df: team-by-end frame with ``match_id``, ``TeamID``, ``EndID``,
            ``Result``, ``PowerPlay``, ``cum_score``, ``has_hammer`` and
            ``pp_available``. Rows are assumed to be in end order within each
            team -- TODO confirm.
        stone_features: per-shot frame with ``match_id``, ``EndID``,
            ``ShotID`` and feature columns.

    Returns:
        A new DataFrame containing only the decision-point rows.
    """
    end_keys = ['match_id', 'EndID']
    team_keys = ['match_id', 'TeamID']

    # Take the complete last row of every end. GroupBy.last() would instead
    # return the last NON-NULL value of each column, which can stitch together
    # values from different shots.
    end_geometry = (
        stone_features
        .sort_values(end_keys + ['ShotID'])
        .drop_duplicates(subset=end_keys, keep='last')
        .reset_index(drop=True)
    )

    # Merge with game data
    decision_df = merged_df.merge(end_geometry, on=end_keys, how='left')

    # Target variable: points scored in end
    decision_df['points_scored'] = decision_df['Result']

    # Power Play treatment indicators
    decision_df['used_pp'] = decision_df['PowerPlay'].notna().astype(int)
    decision_df['pp_side'] = decision_df['PowerPlay'].fillna(0)

    # Lagged state. Both helper series are computed before any column is
    # assigned so they see the same frame.
    by_team = decision_df.groupby(team_keys)
    ends_before = by_team.cumcount()
    prior_score = by_team['cum_score'].shift(1)
    # Nothing has been scored before a team's first end: use 0, not NaN.
    decision_df['score_diff_before'] = prior_score.mask(ends_before == 0, 0)
    decision_df['cum_ends_played'] = ends_before + 1

    # Decision points: team has hammer, PP still available, regular ends only
    is_decision_point = (
        decision_df['has_hammer']
        & decision_df['pp_available']
        & (decision_df['EndID'] <= 8)
    )
    return decision_df[is_decision_point].copy()

In [None]:
# Columns kept for modelling, assembled group by group.
final_features = (
    # Game context
    ['EndID', 'score_diff_before', 'cum_ends_played', 'has_hammer']
    # Board geometry
    + ['num_stones_in_play', 'num_house_stones', 'num_guards',
       'avg_guard_distance', 'guard_angle_std', 'house_spread',
       'left_path_clear', 'right_path_clear',
       'clustering_index', 'distance_to_button']
    # Opponent strength / Power Play history
    + ['opponent_avg_points', 'pp_success_rate']
    # Team identity (for hierarchical modeling)
    + ['TeamID', 'TeamID1', 'TeamID2']
    # Outcome and treatment
    + ['points_scored', 'used_pp', 'pp_side']
)

In [None]:
# Build the decision-point table so this cell runs on a fresh kernel.
# `merged` is copied because create_game_state_features adds columns in place.
# NOTE(review): the wiring below is inferred from the function signatures --
# confirm it matches the intended pipeline.
game_state = create_opponent_features(create_game_state_features(merged.copy()))
stone_features = extract_geometric_features(stones_df)
decision_points = create_decision_dataset(game_state, stone_features)

# Columns that form the design matrix X (the geometric features).
geometric_features = [
    'num_stones_in_play',
    'num_house_stones',
    'num_guards',
    'avg_guard_distance',
    'guard_angle_std',
    'house_spread',
    'left_path_clear',
    'right_path_clear',
    'clustering_index',
    'distance_to_button',
]

# For Bayesian hierarchical model
# NOTE(review): ends without any stone row get NaN geometry from the left
# join; decide how to handle them before passing X to a sampler.
model_data = {
    'N': len(decision_points),
    'K': len(geometric_features),  # Number of features
    'J': decision_points['TeamID'].nunique(),  # Number of teams

    # Features: cast to float so the boolean path flags become 0/1 and the
    # matrix is a plain numeric array rather than dtype=object.
    'X': decision_points[geometric_features].to_numpy(dtype=float),
    # 1-based team index (category codes start at 0).
    'team_id': decision_points['TeamID'].astype('category').cat.codes.values + 1,

    # Context
    'end_number': decision_points['EndID'].values,
    'score_diff': decision_points['score_diff_before'].values,
    'has_hammer': decision_points['has_hammer'].astype(int).values,

    # Treatment (Power Play)
    'used_pp': decision_points['used_pp'].values,
    'pp_side': decision_points['pp_side'].values,

    # Outcome
    'y': decision_points['points_scored'].values
}

In [None]:
def prepare_simulation_data(stones_df, games_df):
    """Prepare data for shot-by-shot simulation.

    Joins every shot row with the two team ids and NOC codes of its game and
    adds two derived columns. Neither input frame is modified.

    Columns added:
        throwing_team: ``'team1'`` when the row's ``TeamID`` equals
            ``TeamID1``, otherwise ``'team2'``.
        shot_success: ``Points / 4``. Assumes ``Points`` ranges over 0-4 so
            the result lies in 0-1 -- TODO confirm.

    Args:
        stones_df: per-shot frame with ``match_id``, ``TeamID`` and ``Points``.
        games_df: per-game frame with ``match_id``, ``TeamID1``, ``TeamID2``,
            ``NOC1`` and ``NOC2``.

    Returns:
        A new DataFrame. The join is an inner join on ``match_id``, so shots
        whose game is missing from ``games_df`` are dropped.
    """
    # Only the game columns needed for context are joined in.
    game_info = games_df[['match_id', 'TeamID1', 'TeamID2', 'NOC1', 'NOC2']]

    # merge() already returns a new frame, so no explicit copy is required.
    shot_data = stones_df.merge(game_info, on='match_id')

    # Determine which team is throwing
    thrown_by_team1 = shot_data['TeamID'] == shot_data['TeamID1']
    shot_data['throwing_team'] = np.where(thrown_by_team1, 'team1', 'team2')

    # Shot outcome, normalised to 0-1
    shot_data['shot_success'] = shot_data['Points'] / 4

    return shot_data