In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pymongo
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["dota"]
matches_info = db["matches_info"]


In [3]:
all_hero_ids = sorted(matches_info.distinct("players.hero_id"))
print(f"Знайдено {len(all_hero_ids)} hero_id")


Знайдено 126 hero_id


In [4]:
def one_hot_heroes(hero_ids, prefix):
    vec = {f"{prefix}_{hid}": 0 for hid in all_hero_ids}
    for hid in hero_ids:
        if hid in all_hero_ids:
            vec[f"{prefix}_{hid}"] = 1
    return vec


In [5]:
player_rows = []
cursor = matches_info.find({}, {"players": 1, "radiant_win": 1})
for match in tqdm(cursor, total=matches_info.estimated_document_count(), desc="Матчі"):
    radiant_win = match.get("radiant_win", True)
    players = match.get("players", [])
    if len(players) != 10:
        continue
    radiant_team = [p["hero_id"] for p in players[:5]]
    dire_team = [p["hero_id"] for p in players[5:]]
    for p in players:
        is_radiant = p.get("isRadiant", True)
        team_heroes = radiant_team if is_radiant else dire_team
        enemy_heroes = dire_team if is_radiant else radiant_team
        win = int(is_radiant == radiant_win)
        row = {
            "hero_id": p.get("hero_id"),
            "kda": p.get("kda", 0),
            "kills_per_min": p.get("kills_per_min", 0),
            "assists": p.get("assists", 0),
            "gpm": p.get("gold_per_min", 0),
            "xpm": p.get("xp_per_min", 0),
            "hero_damage": p.get("hero_damage", 0),
            "tower_damage": p.get("tower_damage", 0),
            "healing": p.get("hero_healing", 0),
            "last_hits_per_min": p.get("benchmarks", {}).get("last_hits_per_min", {}).get("raw", 0),
            "win": win
        }
        row.update(one_hot_heroes(team_heroes, "ally"))
        row.update(one_hot_heroes(enemy_heroes, "enemy"))
        player_rows.append(row)
df = pd.DataFrame(player_rows).fillna(0)
df["kda"] = df["kda"].replace([np.inf, -np.inf], 0)
print("DataFrame shape:", df.shape)


Матчі: 100%|██████████| 44373/44373 [00:44<00:00, 994.72it/s] 


DataFrame shape: (443730, 263)


In [6]:
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

signal_cols = [
    "kda", "kills_per_min", "assists", "gpm", "xpm",
    "hero_damage", "tower_damage", "healing", "last_hits_per_min", "win"
]
winners_df = df[df['win'] == 1].copy()
winners_sample = winners_df.sample(12000, random_state=42)
features = signal_cols
X_signal = winners_sample[features]
scaler = StandardScaler()
X_signal_scaled = scaler.fit_transform(X_signal)

best_k, best_score = None, -1
for k in tqdm(range(3, 6), desc="Підбір k"):
    km = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(X_signal_scaled)
    score = silhouette_score(X_signal_scaled, km.labels_)
    print(f"k={k}: silhouette={score:.3f}")
    if score > best_score:
        best_k, best_score = k, score

print("Обрано k =", best_k)
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init='auto', verbose=2)
winners_sample["style_cluster"] = kmeans.fit_predict(X_signal_scaled)



Підбір k:  33%|███▎      | 1/3 [00:03<00:06,  3.43s/it]

k=3: silhouette=0.193


Підбір k:  67%|██████▋   | 2/3 [00:04<00:02,  2.20s/it]

k=4: silhouette=0.176


Підбір k: 100%|██████████| 3/3 [00:05<00:00,  1.97s/it]

k=5: silhouette=0.183
Обрано k = 3
Initialization complete
Iteration 0, inertia 90810.79410010354.
Iteration 1, inertia 67190.71198768646.
Iteration 2, inertia 65758.34880592828.
Iteration 3, inertia 65506.40363012799.
Iteration 4, inertia 65405.76389157839.
Iteration 5, inertia 65347.26296689242.
Iteration 6, inertia 65312.98779505821.
Iteration 7, inertia 65297.52737818563.
Iteration 8, inertia 65283.975053727685.
Iteration 9, inertia 65274.205214796544.
Iteration 10, inertia 65268.48049280576.
Iteration 11, inertia 65263.59069999053.
Iteration 12, inertia 65259.42395933603.
Iteration 13, inertia 65256.21141244875.
Iteration 14, inertia 65254.530470232225.
Iteration 15, inertia 65253.10091173496.
Iteration 16, inertia 65252.53710327183.
Converged at iteration 16: center shift 5.367927484530389e-05 within tolerance 8.999999999999833e-05.





In [8]:
scaler_signal = StandardScaler()
z = pd.DataFrame(scaler_signal.fit_transform(winners_sample[signal_cols]), columns=signal_cols)
z["cluster"] = winners_sample["style_cluster"].values
centers = z.groupby("cluster").mean()

def label_rule(r):
    if r["kills_per_min"] > 0.7 and r["hero_damage"] > 0.7 and r["gpm"] > 0.7:
        return "aggressive"
    if r["assists"] > 0.35 and r["healing"] > 0.15:
        return "supporting"
    if r["last_hits_per_min"] > 0.25 and r["gpm"] > 0.15:
        return "balanced"

cluster_to_label = centers.apply(label_rule, axis=1).to_dict()
print("Мапа кластерів:", cluster_to_label)
centers


Мапа кластерів: {0: 'aggressive', 1: 'supporting', 2: 'balanced'}


Unnamed: 0_level_0,kda,kills_per_min,assists,gpm,xpm,hero_damage,tower_damage,healing,last_hits_per_min,win
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.592458,1.056073,-0.410192,1.363745,1.19705,1.057633,1.190579,-0.143072,1.146266,0.0
1,-0.210526,-0.627412,0.448123,-0.931479,-0.715056,-0.568124,-0.660073,0.232455,-0.913147,0.0
2,-0.110285,0.092088,-0.269813,0.257766,0.108834,0.023284,0.049721,-0.181295,0.365737,0.0


In [9]:
joblib.dump(scaler, "scaler_final_ready.joblib")
joblib.dump(kmeans, "kmeans_model_final_ready.joblib")


['kmeans_model_final_ready.joblib']

In [12]:
summary = df.groupby("style_cluster")[signal_cols + ["win"]].mean().round(2)
display(summary)


KeyError: 'style_cluster'

In [10]:
def get_style_for_hero(hero_stats):
    Xrow = pd.DataFrame([hero_stats])[features]
    pred = kmeans.predict(scaler.transform(Xrow))[0]
    return cluster_to_label.get(pred, f"cluster_{pred}")

In [13]:
radiant_ids = [109, 106, 97, 26, 10]
dire_ids = [39, 25, 6, 38, 71]
results = {}
for h in radiant_ids:
    hero_stats = winners_sample[winners_sample["hero_id"] == h][features].mean().to_dict()
    style = get_style_for_hero(hero_stats)
    results[h] = style

print("Рекомендований стиль для кожного героя Radiant:")
print(results)


Рекомендований стиль для кожного героя Radiant:
{109: 'aggressive', 106: 'balanced', 97: 'balanced', 26: 'supporting', 10: 'aggressive'}
