In [2]:
import pickle
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import os

os.environ["XGBOOST_DISABLE_GPU"] = "1"

class ContextAwareModel:
    def __init__(self):
        self.model = xgb.XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=10,
            tree_method="hist",
            device="cpu",
            n_jobs=-1,
            random_state=42
        )
        self.encoders = {}
        self.feature_cols = [
            'venue', 'batting_team', 'bowling_team',
            'batter', 'bowler', 'over', 'innings', 'team_wicket'
        ]
        self.teams_per_year_dynamic = {}

    def load_and_train(self):
        inn1 = pd.read_csv('dataset/innings_1.csv')
        inn2 = pd.read_csv('dataset/innings_2.csv')
        df = pd.concat([inn1, inn2])
        df = df.loc[:, ~df.columns.duplicated()]

        if 'venue' not in df.columns:
            df['venue'] = df.get('city', 'Unknown')

        df['venue'] = df['venue'].astype(str)

        if 'season' in df.columns:
            df['year'] = df['season'].astype(str).str[:4].astype(int)
        else:
            df['year'] = pd.to_datetime(df['date']).dt.year

        df['team_wicket'] = df.get('team_wicket', 0)
        df.dropna(subset=['batter', 'bowler', 'runs_total', 'over'], inplace=True)

        for col in ['venue', 'batting_team', 'bowling_team', 'batter', 'bowler']:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            self.encoders[col] = le

        df['over'] = df['over'].astype(int)
        df['innings'] = df['innings'].astype(int)
        df['team_wicket'] = df['team_wicket'].astype(int)

        y_raw = np.where(df['is_wicket'] == 1, -1, df['runs_total'])
        self.target_encoder = LabelEncoder()
        y = self.target_encoder.fit_transform(y_raw)

        self.model.fit(df[self.feature_cols], y)
        self.df = df

        temp = df[['year', 'batting_team']]
        temp['team_name'] = self.encoders['batting_team'].inverse_transform(temp['batting_team'])
        for y in temp['year'].unique():
            self.teams_per_year_dynamic[y] = sorted(
                temp[temp['year'] == y]['team_name'].unique()
            )

    def get_squad(self, team_name, year):
        team_le = self.encoders['batting_team']
        if team_name not in team_le.classes_:
            return [], None, team_name

        team_id = team_le.transform([team_name])[0]
        subset = self.df[(self.df['year'] == year) & (self.df['batting_team'] == team_id)]

        if subset.empty:
            return [], team_id, team_name

        top_bats = subset.groupby('batter')['runs_total'].sum().nlargest(7).index.tolist()
        subset_bowl = self.df[(self.df['year'] == year) & (self.df['bowling_team'] == team_id)]
        top_bowls = subset_bowl[subset_bowl['is_wicket'] == 1] \
            .groupby('bowler')['is_wicket'].sum().nlargest(5).index.tolist()

        squad_ids = list(set(top_bats + top_bowls))[:11]
        squad_names = self.encoders['batter'].inverse_transform(squad_ids)
        return list(zip(squad_names, squad_ids)), team_id, team_name

    def predict_ball(self, input_vec):
        probs = self.model.predict_proba(input_vec.astype(np.float32))[0]
        idx = np.random.choice(len(probs), p=probs)
        return self.target_encoder.inverse_transform([idx])[0]


if __name__ == "__main__":
    sim = ContextAwareModel()
    sim.load_and_train()

    with open("context_simulator.pkl", "wb") as f:
        pickle.dump(sim, f)

    print("✅ Context-aware simulator saved.")


  inn1 = pd.read_csv('dataset/innings_1.csv')
  inn2 = pd.read_csv('dataset/innings_2.csv')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['team_name'] = self.encoders['batting_team'].inverse_transform(temp['batting_team'])


✅ Context-aware simulator saved.


In [22]:
import pandas as pd

inn1 = pd.read_csv("dataset/innings_1.csv")
inn2 = pd.read_csv("dataset/innings_2.csv")

# Ensure common match id
match_id_col = "match_id"  # adjust if named differently

# -------- First Innings --------
first = (
    inn1[inn1["innings"] == 1]
    .groupby(match_id_col)
    .agg(
        season=("year", "first"),
        team1=("batting_team", "first"),
        venue=("venue", "first"),
        score1=("runs_total", "sum"),
    )
    .reset_index()
)

# -------- Second Innings --------
second = (
    inn2[inn2["innings"] == 2]
    .groupby(match_id_col)
    .agg(
        team2=("batting_team", "first"),
        score2=("runs_total", "sum"),
    )
    .reset_index()
)

# -------- Merge --------
matches = first.merge(second, on=match_id_col)

# -------- Winner --------
matches["winner"] = matches.apply(
    lambda x: x.team1 if x.score1 > x.score2 else x.team2,
    axis=1
)

matches = matches[
    ["season", "team1", "team2", "venue", "winner", "score1", "score2"]
].sample(100)

matches.to_csv("dataset/match_level_data.csv", index=False)

print("✅ match_level_data.csv created")


  inn1 = pd.read_csv("dataset/innings_1.csv")


✅ match_level_data.csv created


  inn2 = pd.read_csv("dataset/innings_2.csv")


In [24]:
import pickle
import pandas as pd
import numpy as np
from collections import defaultdict

# ================= CONFIG =================
SIMULATIONS_PER_MATCH = 10
SIMULATOR_PATH = "context_simulator.pkl"
MATCH_DATA_PATH = "dataset/match_level_data.csv"
# ==========================================


# ---------- TEAM NORMALIZATION ----------
TEAM_TRANSLATION = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Kings XI Punjab': 'Punjab Kings',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Royal Challengers Bangalore': 'Royal Challengers Bengaluru',
    'Rising Pune Supergiants': 'Rising Pune Supergiant',
}

def normalize_team(name):
    return TEAM_TRANSLATION.get(name, name)


# ---------- VENUE NORMALIZATION ----------
VENUE_MAP = {
    "Mumbai": "Wankhede Stadium",
    "Bangalore": "M Chinnaswamy Stadium",
    "Chennai": "MA Chidambaram Stadium",
    "Kolkata": "Eden Gardens",
    "Delhi": "Arun Jaitley Stadium",
    "Abu Dhabi": "Abu Dhabi",
    "Dubai": "Dubai",
    "Sharjah": "Sharjah",
}

def normalize_venue(v):
    return VENUE_MAP.get(v, v)


# ---------- LOAD SIMULATOR ----------
with open(SIMULATOR_PATH, "rb") as f:
    simulator = pickle.load(f)


# ---------- MATCH SIMULATION ----------
def simulate_match(sim, team1, team2, year, venue, n_sim=100):

    # ---- SEASON SAFETY ----
    available_years = sorted(sim.teams_per_year_dynamic.keys())
    if year not in available_years:
        year = available_years[-1]

    squad1, id1, _ = sim.get_squad(team1, year)
    squad2, id2, _ = sim.get_squad(team2, year)

    if not squad1 or not squad2:
        return None

    venue = normalize_venue(venue)
    try:
        vid = sim.encoders['venue'].transform([venue])[0]
    except:
        vid = sim.encoders['venue'].transform(['Wankhede Stadium'])[0]

    wins_team1 = 0
    scores1, scores2 = [], []

    for _ in range(n_sim):

        # -------- INNINGS 1 --------
        r1, w1 = 0, 0
        st, nst = 0, 1
        bat1 = [x[1] for x in squad1]
        bowl2 = [x[1] for x in squad2]

        for over in range(20):
            if w1 >= 10:
                break
            bowler = bowl2[over % len(bowl2)]
            for _ in range(6):
                if w1 >= 10:
                    break
                inp = np.array([[vid, id1, id2, bat1[st], bowler, over, 1, w1]])
                out = sim.predict_ball(inp)
                if out == -1:
                    w1 += 1
                    st = max(st, nst) + 1
                    if st >= len(bat1):
                        break
                else:
                    r1 += out
                    if out % 2:
                        st, nst = nst, st
            st, nst = nst, st

        # -------- INNINGS 2 --------
        r2, w2 = 0, 0
        st, nst = 0, 1
        bat2 = [x[1] for x in squad2]
        bowl1 = [x[1] for x in squad1]

        for over in range(20):
            if w2 >= 10 or r2 > r1:
                break
            bowler = bowl1[over % len(bowl1)]
            for _ in range(6):
                if w2 >= 10 or r2 > r1:
                    break
                inp = np.array([[vid, id2, id1, bat2[st], bowler, over, 2, w2]])
                out = sim.predict_ball(inp)
                if out == -1:
                    w2 += 1
                    st = max(st, nst) + 1
                    if st >= len(bat2):
                        break
                else:
                    r2 += out
                    if out % 2:
                        st, nst = nst, st
            st, nst = nst, st

        scores1.append(r1)
        scores2.append(r2)

        if r1 > r2:
            wins_team1 += 1

    return {
        "prob_team1": wins_team1 / n_sim,
        "avg_score1": np.mean(scores1),
        "avg_score2": np.mean(scores2),
    }


# ================= LOAD MATCH DATA =================
matches = pd.read_csv(MATCH_DATA_PATH)

correct_winner = 0
total_matches = 0
score_errors = []
calibration_bins = defaultdict(list)
skipped = 0


# ================= EVALUATION LOOP =================
for _, m in matches.iterrows():

    team1 = normalize_team(m.team1)
    team2 = normalize_team(m.team2)
    actual_winner = normalize_team(m.winner)

    result = simulate_match(
        simulator,
        team1,
        team2,
        int(m.season),
        m.venue,
        SIMULATIONS_PER_MATCH
    )

    if result is None:
        skipped += 1
        continue

    total_matches += 1

    pred_prob = result["prob_team1"]
    pred_winner = team1 if pred_prob >= 0.5 else team2

    if pred_winner == actual_winner:
        correct_winner += 1

    score_errors.append(abs(result["avg_score1"] - m.score1))
    score_errors.append(abs(result["avg_score2"] - m.score2))

    bin_key = round(pred_prob, 1)
    calibration_bins[bin_key].append(1 if actual_winner == team1 else 0)


# ================= RESULTS =================
if total_matches == 0:
    raise RuntimeError("No matches evaluated — check team/season/venue alignment")

print("\n=== FANTASY SIMULATOR EVALUATION ===")
print(f"Matches evaluated : {total_matches}")
print(f"Matches skipped   : {skipped}")
print(f"Win Accuracy      : {correct_winner / total_matches:.3f}")
print(f"Score MAE         : {np.mean(score_errors):.2f} runs")

print("\n--- Probability Calibration ---")
for b in sorted(calibration_bins):
    actual = np.mean(calibration_bins[b])
    print(f"Predicted {b:.1f} → Actual {actual:.2f}")



=== FANTASY SIMULATOR EVALUATION ===
Matches evaluated : 100
Matches skipped   : 0
Win Accuracy      : 0.540
Score MAE         : 35.23 runs

--- Probability Calibration ---
Predicted 0.1 → Actual 0.00
Predicted 0.2 → Actual 0.50
Predicted 0.3 → Actual 0.00
Predicted 0.4 → Actual 0.62
Predicted 0.5 → Actual 0.60
Predicted 0.6 → Actual 0.29
Predicted 0.7 → Actual 0.53
Predicted 0.8 → Actual 0.71
Predicted 0.9 → Actual 0.62
Predicted 1.0 → Actual 0.50
