# Rugby Data Exploration

This notebook explores the structure of the rugby match data and validates the data pipeline.

In [None]:
import sys
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from rugby_ranking.model.data import MatchDataset

# Plot styling: use seaborn's white-grid look for the figures below.
sns.set_style('whitegrid')
# Table display: lift the column limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)

## Load Data

Set `DATA_DIR` in the cell below to the location of your local Rugby-Data repository.

In [None]:
DATA_DIR = Path("../../Rugby-Data")  # Adjust path as needed

# Fail fast with an actionable message: the relative path depends on where the
# notebook kernel was started, so a wrong location is the most likely setup error.
if not DATA_DIR.exists():
    raise FileNotFoundError(
        f"Rugby-Data repository not found at {DATA_DIR.resolve()}; "
        "update DATA_DIR to point at your local copy."
    )

# Build the dataset wrapper around the data directory and read the match files.
dataset = MatchDataset(DATA_DIR)
dataset.load_json_files()

In [None]:
# Flatten the loaded dataset into a DataFrame (played_only=True), report its
# dimensions, and preview the first rows.
# NOTE(review): presumably one row per player per match - confirm against MatchDataset.
df = dataset.to_dataframe(played_only=True)
frame_shape = df.shape
print(f"Shape: {frame_shape}")
df.head()

## Data Overview

In [None]:
# Row counts per competition (most frequent first) and per season (sorted by season label).
competition_counts = df['competition'].value_counts()
season_counts = df['season'].value_counts().sort_index()

print("Competitions:")
print(competition_counts)
print("\nSeasons:")
print(season_counts)

In [None]:
# Headline cardinalities and the time span covered by the frame.
unique_players = df['player_name'].nunique()
unique_teams = df['team'].nunique()
unique_matches = df['match_id'].nunique()
first_date, last_date = df['date'].min(), df['date'].max()

print(f"Unique players: {unique_players}")
print(f"Unique teams: {unique_teams}")
print(f"Unique matches: {unique_matches}")
print(f"Date range: {first_date} to {last_date}")

## Scoring Distributions

In [None]:
# One histogram per scoring type, with one bin per integer count.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for ax, score_type in zip(axes.flat, ['tries', 'conversions', 'penalties', 'drop_goals']):
    # Missing values would make the column float and break the integer bin edges,
    # so drop them before deriving the bins.
    data = df[score_type].dropna()
    # range() needs a plain int; an empty column falls back to a single 0-1 bin.
    max_count = int(data.max()) if not data.empty else 0
    # Human-readable label, e.g. 'drop_goals' -> 'Drop goals'.
    label = score_type.replace('_', ' ').capitalize()

    ax.hist(data, bins=range(0, max_count + 2), edgecolor='black', alpha=0.7)
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')
    ax.set_title(f'{label} per Player-Match\nMean: {data.mean():.2f}')

plt.tight_layout()
plt.show()

## Position Analysis

In [None]:
# Aggregate tries per position: total, mean per row (appearance), and row count.
position_tries = df.groupby('position')['tries'].agg(
    total_tries='sum',
    tries_per_match='mean',
    appearances='count',
)

# Bar chart of the mean try rate, with a tick for each jersey number 1-23.
fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(position_tries.index, position_tries['tries_per_match'])
ax.set(
    xlabel='Position (Jersey Number)',
    ylabel='Tries per Match',
    title='Try-Scoring Rate by Position',
    xticks=range(1, 24),
)
plt.show()

# Jersey number -> position name, kept as a lookup for reading the chart.
positions = {
    # Forwards
    1: 'Loosehead Prop',
    2: 'Hooker',
    3: 'Tighthead Prop',
    4: 'Lock',
    5: 'Lock',
    6: 'Blindside Flanker',
    7: 'Openside Flanker',
    8: 'Number 8',
    # Backs
    9: 'Scrum-half',
    10: 'Fly-half',
    11: 'Left Wing',
    12: 'Inside Centre',
    13: 'Outside Centre',
    14: 'Right Wing',
    15: 'Fullback',
    # Replacements (16-23)
    **{jersey: 'Sub' for jersey in range(16, 24)},
}

## Player Mobility

How often do players change teams?

In [None]:
player_teams = dataset.get_player_teams()

# Count distinct teams per player
teams_per_player = player_teams.groupby('player_name')['team'].nunique()
print("Teams per player distribution:")
print(teams_per_player.value_counts().sort_index())

# Players who played for multiple teams
mobile_players = teams_per_player[teams_per_player > 1]

# Guard the percentage against an empty player table, which would otherwise
# raise ZeroDivisionError.
player_total = len(teams_per_player)
mobile_share = 100 * len(mobile_players) / player_total if player_total else 0.0
print(f"\n{len(mobile_players)} players played for multiple teams ({mobile_share:.1f}%)")

In [None]:
# Example: show the team records of the first player listed with more than one team.
if not mobile_players.empty:
    example_player = mobile_players.index[0]
    career_rows = player_teams.loc[player_teams['player_name'] == example_player]
    print(f"\nCareer of {example_player}:")
    display(career_rows)

## Minutes Played Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
minutes_ax, tries_ax = axes

# Left panel: minutes played, split by whether the player started the match.
started_mask = df['started']
starter_minutes = df.loc[started_mask, 'minutes_played']
substitute_minutes = df.loc[~started_mask, 'minutes_played']
minutes_ax.hist(
    [starter_minutes, substitute_minutes],
    bins=20,
    label=['Starters', 'Substitutes'],
    alpha=0.7,
)
minutes_ax.set_xlabel('Minutes Played')
minutes_ax.set_ylabel('Frequency')
minutes_ax.set_title('Minutes Played by Starting Status')
minutes_ax.legend()

# Right panel: tries against minutes played; low alpha keeps overlapping points readable.
tries_ax.scatter(df['minutes_played'], df['tries'], alpha=0.1)
tries_ax.set_xlabel('Minutes Played')
tries_ax.set_ylabel('Tries')
tries_ax.set_title('Tries vs Exposure Time')

plt.tight_layout()
plt.show()

## Disciplinary Analysis

In [None]:
# Card totals, and the number of distinct matches in which each card colour was shown.
total_yellow = df['yellow_cards'].sum()
total_red = df['red_cards'].sum()
matches_with_yellow = df.loc[df['yellow_cards'] > 0, 'match_id'].nunique()
matches_with_red = df.loc[df['red_cards'] > 0, 'match_id'].nunique()

print(f"Yellow cards: {total_yellow}")
print(f"Red cards: {total_red}")
print(f"\nMatches with cards: {matches_with_yellow} (yellow)")
print(f"                     {matches_with_red} (red)")

# Cards by position, drawn on the Axes returned by the pandas plot call.
cards_by_pos = df.groupby('position')[['yellow_cards', 'red_cards']].sum()
cards_ax = cards_by_pos.plot(kind='bar', figsize=(12, 5))
cards_ax.set_title('Cards by Position')
cards_ax.set_xlabel('Position')
cards_ax.set_ylabel('Total Cards')
plt.show()

## Unplayed Matches (for prediction)

In [None]:
# Fixtures without a result yet; list at most the first ten.
unplayed = dataset.get_unplayed_matches()
print(f"Unplayed matches: {len(unplayed)}")

if unplayed:
    upcoming = unplayed[:10]
    print("\nUpcoming fixtures:")
    for match in upcoming:
        print(f"  {match.home_team} vs {match.away_team} ({match.competition})")

## Summary Statistics for Modelling

In [None]:
# Gather the headline numbers first, then print them as one report.
n_observations = len(df)
n_players = df['player_name'].nunique()
n_teams = df['team'].nunique()
n_seasons = df['season'].nunique()
n_team_seasons = df.groupby(['team', 'season']).ngroups
mean_tries = df['tries'].mean()
mean_penalties = df['penalties'].mean()
mean_minutes = df['minutes_played'].mean()

# Share of matches whose first recorded (non-null) match_result equals 'win'.
# NOTE(review): this equals the home win rate only if the first row of every match
# belongs to the home side - confirm against the row ordering of to_dataframe().
first_result_per_match = df.groupby('match_id')['match_result'].first()
home_win_rate = (first_result_per_match == 'win').mean()

print("Key statistics for model specification:")
print(f"  - Total observations: {n_observations:,}")
print(f"  - Players: {n_players:,}")
print(f"  - Teams: {n_teams}")
print(f"  - Seasons: {n_seasons}")
print(f"  - Team-seasons: {n_team_seasons}")
print(f"  - Mean tries per player-match: {mean_tries:.3f}")
print(f"  - Mean penalties per player-match: {mean_penalties:.3f}")
print(f"  - Mean minutes played: {mean_minutes:.1f}")
print(f"  - Home win rate: {home_win_rate:.1%}")