In [1]:
import json
import zipfile
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import dump, load
import os
import logging
from collections import defaultdict

# Setup logging
# Records at WARNING level and above are written to a log file (not to the
# notebook output); load_games uses this to report files it could not process.
logging.basicConfig(
    filename="data_loading_errors.log",
    level=logging.WARNING,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Paths
# Relative paths: resolved against the working directory the notebook runs in.
zip_path = "data/dota_games.zip"  # archive whose *.json members are read by load_games
heroes_json_path = "data/heroes.json"  # hero list read by load_hero_names
games_to_process = None  # Process all games; an int N limits loading to the first N JSON files


In [2]:
def load_hero_names(path):
    """Build a hero-id -> API-name lookup from a heroes JSON file.

    Parameters
    ----------
    path : str or path-like
        JSON file containing a list of hero objects; every object must carry
        an ``"id"`` and an ``"api_name"`` key.

    Returns
    -------
    dict
        Maps each hero's ``"id"`` to its ``"api_name"``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If a hero object lacks ``"id"`` or ``"api_name"``.
    """
    # JSON text is UTF-8; be explicit instead of depending on the platform's
    # default encoding (which differs between operating systems/locales).
    with open(path, "r", encoding="utf-8") as f:
        heroes = json.load(f)
    return {hero["id"]: hero["api_name"] for hero in heroes}

# Notebook-wide lookup (hero id -> api_name) built once from the heroes file.
hero_mapping = load_hero_names(heroes_json_path)
print("Loaded hero mappings:", len(hero_mapping))


Loaded hero mappings: 122


In [None]:
def _extract_game_rows(game_data):
    """Turn one parsed game into per-player feature rows plus matching labels.

    Raises KeyError/TypeError if the game lacks any field that is read here;
    nothing is returned for a partially readable game.
    """
    result = game_data["result"]
    label = 1 if result["radiant_win"] else 0

    # Game result features (repeated on every player row of this game)
    game_features = {
        "radiant_score": result["radiant_score"],
        "dire_score": result["dire_score"],
        "duration": result["duration"],
        "tower_status_radiant": result["tower_status_radiant"],
        "tower_status_dire": result["tower_status_dire"],
    }

    rows = []
    for player in result["players"]:
        # KDA metric; max(1, deaths) avoids dividing by zero for deathless players
        kda = (player["kills"] + player["assists"]) / max(1, player["deaths"])
        rows.append({
            **game_features,
            "hero_id": player["hero_id"],
            "kda": kda,
            "gpm": player["gold_per_min"],
            "xpm": player["xp_per_min"],
            "last_hits": player["last_hits"],
            "hero_damage": player["hero_damage"],
            "tower_damage": player["tower_damage"],
            "hero_healing": player["hero_healing"],
        })
    return rows, [label] * len(rows)


def load_games(zip_path, games_to_process=None):
    """Read game JSON files from a zip archive into feature rows and labels.

    Every ``*.json`` member of the archive is parsed as one game. Each player
    of a game yields one row combining the game-level result fields with that
    player's statistics; the label of every row is 1 when the game's
    ``radiant_win`` is truthy and 0 otherwise.

    Parameters
    ----------
    zip_path : str or path-like
        Path of the zip archive.
    games_to_process : int or None
        When truthy, only the first ``games_to_process`` JSON members (in
        archive order) are read; otherwise all of them are.

    Returns
    -------
    (list of dict, list of int)
        Feature rows and labels, always of equal length.

    Notes
    -----
    A file that cannot be read, parsed, or that lacks an expected field is
    logged as a warning and skipped as a whole. Rows are only committed after
    the entire game was extracted, so a failure part-way through a game can
    neither leave partial games behind nor desynchronise rows and labels.
    """
    X, y = [], []

    with zipfile.ZipFile(zip_path, "r") as zip_file:
        json_files = [name for name in zip_file.namelist() if name.endswith(".json")]
        if games_to_process:
            json_files = json_files[:games_to_process]

        for file_name in tqdm(json_files, desc="Loading Games"):
            try:
                with zip_file.open(file_name, "r") as f:
                    game_data = json.load(f)
                rows, labels = _extract_game_rows(game_data)
            except Exception as e:
                # Best-effort loading: record the problem and move on to the next file.
                logging.warning(f"Error processing file {file_name}: {e}")
                continue

            # Commit the game atomically, only once it was fully extracted.
            X.extend(rows)
            y.extend(labels)
    return X, y

# Parse the archive into per-player feature dicts (X_raw) and labels (y).
# Files that raise while being processed are reported in the log file, not here.
X_raw, y = load_games(zip_path, games_to_process)
print("Loaded game data!")


Loading Games:  49%|████████████████████████████▉                              | 1144853/2338043 [05:31<04:58, 3995.34it/s]

In [None]:
# One DataFrame row per feature dict; any missing value becomes 0.
df_X = pd.DataFrame(X_raw).fillna(0)
# Exclude hero_id for training. NOTE: X therefore has one column fewer than df_X,
# so df_X.columns does not line up one-to-one with the columns of X.
X = df_X.drop(columns=["hero_id"]).values
y = np.array(y)

print(f"Processed data shape: {X.shape}, Target shape: {y.shape}")


In [None]:
import joblib
from datetime import datetime

class ManualLogisticRegression:
    """Binary logistic regression fitted with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to the gradients on every epoch.
    epochs : int
        Number of gradient-descent iterations run by ``train``.
    """

    def __init__(self, learning_rate=0.01, epochs=500):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None  # coefficient vector; None until train() or load_model() sets it
        self.bias = 0
        self.loss_history = []  # one loss value per epoch of the most recent train() call

    @staticmethod
    def sigmoid(z):
        """Return the logistic function 1 / (1 + exp(-z)), element-wise.

        Only ``exp(-|z|)`` is ever evaluated, so inputs of large magnitude do
        not overflow (the naive formula overflows for very negative ``z``).
        Scalar input yields a scalar, array input an array of the same shape.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))  # always within (0, 1]
        out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        return out[()]  # unwraps a 0-d array to a scalar; no-op for real arrays

    def compute_loss(self, y_true, y_pred):
        """Return the mean binary cross-entropy of ``y_pred`` against ``y_true``.

        A small constant (1e-9) is added inside both logarithms so predictions
        of exactly 0 or 1 do not produce ``log(0)``.
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        return -np.mean(
            y_true * np.log(y_pred + 1e-9) + (1 - y_true) * np.log(1 - y_pred + 1e-9)
        )

    def train(self, X, y):
        """Fit weights and bias on ``X`` (n_samples x n_features) and 0/1 labels ``y``.

        Every call starts from zero weights, zero bias and an empty
        ``loss_history``. The loss stored for an epoch is computed from the
        predictions made before that epoch's parameter update; it is printed
        every 100 epochs.

        Raises
        ------
        ValueError
            If ``X`` contains no samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot train on an empty dataset.")

        self.weights = np.zeros(n_features)
        self.bias = 0
        # Reset so that a repeated train() call does not append to an older run's history.
        self.loss_history = []

        for epoch in tqdm(range(self.epochs), desc="Training Model"):
            # Compute predictions
            linear_pred = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(linear_pred)

            # Compute gradients
            error = y_pred - y
            dw = np.dot(X.T, error) / n_samples
            db = np.sum(error) / n_samples

            # Update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            # Compute loss
            loss = self.compute_loss(y, y_pred)
            self.loss_history.append(loss)

            # Print loss periodically
            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

    def predict_proba(self, X):
        """Return sigmoid(X . weights + bias), i.e. one probability per row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has no weights yet (neither trained nor loaded).
        """
        if self.weights is None:
            raise RuntimeError("Model has no weights; call train() or load_model() first.")
        linear_pred = np.dot(X, self.weights) + self.bias
        return self.sigmoid(linear_pred)

    def predict(self, X):
        """Return a list of 0/1 labels: 1 where the probability is above 0.5, else 0."""
        probabilities = np.atleast_1d(self.predict_proba(X))
        return (probabilities > 0.5).astype(int).tolist()

    def save_model(self, feature_columns, accuracy):
        """Save the model with epoch, loss, and accuracy details.

        The file name encodes the accuracy (two decimals), the configured
        number of epochs and a minute-resolution timestamp. The stored dict
        holds the weights, bias, ``feature_columns``, loss history, the
        hyperparameters and the given accuracy.

        Returns
        -------
        str
            Name of the file that was written (relative to the working directory).
        """
        timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
        filename = f"manual_logreg_acc{accuracy:.2f}_epochs{self.epochs}_{timestamp}.joblib"
        model_data = {
            "weights": self.weights,
            "bias": self.bias,
            "columns": feature_columns,
            "loss_history": self.loss_history,
            # Stored so a reloaded model reports how it was actually trained.
            "learning_rate": self.learning_rate,
            "epochs": self.epochs,
            "accuracy": accuracy,
        }
        joblib.dump(model_data, filename)
        print(f"Model saved as: {filename}")
        return filename

    @staticmethod
    def load_model(filename):
        """Load a saved model.

        Returns
        -------
        (ManualLogisticRegression, feature columns)
            The restored model and the ``"columns"`` entry saved with it.
            Files written without hyperparameters fall back to the
            constructor defaults.
        """
        # SECURITY: joblib files are pickle-based and can execute arbitrary code
        # on load; only open files that come from a trusted source.
        model_data = joblib.load(filename)
        model = ManualLogisticRegression()
        model.learning_rate = model_data.get("learning_rate", model.learning_rate)
        model.epochs = model_data.get("epochs", model.epochs)
        model.weights = model_data["weights"]
        model.bias = model_data["bias"]
        model.loss_history = model_data["loss_history"]
        print(f"Model loaded from: {filename}")
        return model, model_data["columns"]


In [None]:
def recommend_next_hero_with_roles(model, radiant_heroes, dire_heroes, feature_columns, hero_mapping, chosen_heroes=None):
    """Suggest, per role, the unpicked hero with the highest predicted Radiant win probability.

    Every hero of ``hero_mapping`` that is not already in either draft nor in
    ``chosen_heroes`` is tentatively added to the Radiant draft and scored
    with ``predict_win_probability``.

    Parameters
    ----------
    model
        Model forwarded to ``predict_win_probability``.
    radiant_heroes, dire_heroes : sequence
        Hero ids already picked by each side.
    feature_columns
        Feature columns forwarded to ``predict_win_probability``.
    hero_mapping : dict
        Hero id -> name; its keys define the candidate pool.
    chosen_heroes : set or None
        Hero ids to exclude. A set passed by the caller is updated in place
        with the heroes returned by this call; ``None`` (the default) starts
        from a fresh empty set on every call, so nothing leaks between calls.

    Returns
    -------
    dict
        Role name -> ``(hero_id, win_probability)`` for every role that has
        at least one candidate.
    """
    # A mutable default would be shared (and mutated) across calls.
    if chosen_heroes is None:
        chosen_heroes = set()

    roles = {"Hard Carry": [], "Core": [], "Support": []}
    hero_probabilities = []

    # Predict win probabilities for all available heroes
    available_heroes = set(hero_mapping.keys()) - set(radiant_heroes) - set(dire_heroes) - chosen_heroes
    base_radiant = list(radiant_heroes)  # accept any sequence, not only lists
    for hero_id in available_heroes:
        test_radiant = base_radiant + [hero_id]
        win_prob = predict_win_probability(model, test_radiant, dire_heroes, feature_columns)
        hero_probabilities.append((hero_id, win_prob))

    # Sort heroes by win probability (best first)
    hero_probabilities = sorted(hero_probabilities, key=lambda x: x[1], reverse=True)

    # Assign heroes to roles
    # NOTE(review): the role is derived from the substrings "carry"/"support" in the
    # mapped hero name; heroes whose name has neither land in "Core". Confirm that
    # the names in the heroes file actually carry role information.
    for hero_id, win_prob in hero_probabilities:
        hero_name = hero_mapping[hero_id].lower()
        if "carry" in hero_name:
            roles["Hard Carry"].append((hero_id, win_prob))
        elif "support" in hero_name:
            roles["Support"].append((hero_id, win_prob))
        else:
            roles["Core"].append((hero_id, win_prob))

    # Return best hero per role (lists are already sorted best-first)
    best_heroes = {}
    for role, heroes in roles.items():
        if heroes:
            best_heroes[role] = heroes[0]
            chosen_heroes.add(heroes[0][0])  # Add to chosen heroes

    return best_heroes


In [None]:
# Train the model
model = ManualLogisticRegression(learning_rate=0.01, epochs=500)
model.train(X, y)

# Calculate training accuracy
# (measured on the same rows the model was fitted on; there is no held-out split)
y_pred = model.predict(X)
accuracy = np.mean(y == y_pred)
print(f"Training Accuracy: {accuracy:.2%}")

# Save the model
# X was built from df_X without "hero_id", so the saved column list must leave it
# out as well; otherwise it is one entry longer than the weight vector and any
# input reindexed to it fails with a shape mismatch at prediction time.
model_feature_columns = df_X.columns.drop("hero_id")
model_filename = model.save_model(model_feature_columns, accuracy)


In [None]:
# Load the model
# Returns the restored model together with the feature columns saved alongside it.
loaded_model, feature_columns = ManualLogisticRegression.load_model(model_filename)

# Predict win probability for a new draft
# Example draft, three hero ids per side.
radiant_heroes = [7, 9, 44]
dire_heroes = [8, 4, 129]

def predict_win_probability(model, radiant_heroes, dire_heroes, feature_columns):
    """Return the model's probability for a draft encoded as signed hero indicators.

    The draft becomes a single row in which column ``hero_<id>`` is 1 for a
    Radiant hero and -1 for a Dire hero (a hero listed on both sides ends up
    as -1). The row is aligned to ``feature_columns``: columns missing from
    the draft are filled with 0 and draft columns not in ``feature_columns``
    are dropped.

    Parameters
    ----------
    model
        Object exposing ``predict_proba(2-D array)``.
    radiant_heroes, dire_heroes : iterable
        Hero ids picked by each side.
    feature_columns : sequence
        Column order the model expects.

    Returns
    -------
    The first (only) element returned by ``model.predict_proba``.

    Warns
    -----
    RuntimeWarning
        If none of the draft's ``hero_<id>`` columns occurs in
        ``feature_columns``; the model input is then all zeros and the result
        does not depend on the draft.
    """
    import warnings  # local import: not part of the notebook's import cell

    draft = {f"hero_{hero_id}": 1 for hero_id in radiant_heroes}
    draft.update({f"hero_{hero_id}": -1 for hero_id in dire_heroes})

    # Make a silent encoding/feature mismatch visible instead of returning a
    # number that looks like a prediction but ignores the draft entirely.
    if draft and not any(column in draft for column in feature_columns):
        warnings.warn(
            "None of the draft's hero_<id> columns are present in feature_columns; "
            "the prediction does not depend on the draft.",
            RuntimeWarning,
            stacklevel=2,
        )

    draft_df = pd.DataFrame([draft]).reindex(columns=feature_columns, fill_value=0)
    return model.predict_proba(draft_df.values)[0]

# Score the example draft with the reloaded model and show it as a percentage.
win_prob = predict_win_probability(loaded_model, radiant_heroes, dire_heroes, feature_columns)
print(f"Win Probability for Radiant: {win_prob:.2%}")
