In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Let pandas display up to 100 rows and 100 columns before truncating output.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
# Apply matplotlib's 'fivethirtyeight' style sheet to every figure drawn below.
plt.style.use('fivethirtyeight')

In [None]:
# Load the match data from a path relative to the working directory,
# then print the column names, dtypes and non-null counts.
data = pd.read_csv('./data/lol/high_diamond_ranked_10min.csv')
data.info()

### 数据分析

共有9879条数据, 并没有缺失数据, 并且变量类型全是数值型变量, 包括连续型数值变量以及离散型数值变量.

蓝红方的胜率基本五五开, 前两列分别是 gameId (对局编号) 和 blueWins (蓝色方是否胜利).

后面还有38个特征, 每队有19个特征, 特征类型是一样的.

- blueWardsPlaced, blueWardsDestroyed, 插眼和反眼

- blueFirstBlood 蓝色方是否拿了一血

- blueKills, blueDeaths, blueAssists 蓝色方的击杀, 死亡, 助攻, 能换算为KDA

- blueEliteMonsters, blueDragons, blueHeralds 蓝色方精英怪, 小龙, 先锋击杀数

- blueTowersDestroyed 蓝色方推塔数

- blueTotalGold 蓝色方经济

- blueAvgLevel 蓝色方平均等级

- blueTotalMinionsKilled 蓝色方击杀小兵数

- blueTotalJungleMinionsKilled 蓝色方野区野怪击杀数

- blueGoldDiff, blueExperienceDiff, blueCSPerMin, blueGoldPerMin 经济差, 经验差, 分均补刀, 分均经济

根据以上数据, 我从以下几个方向对游戏走向进行分析:

- **视野**

- **一血率**

- **KDA战损比**

- **重要资源掌控以及野区对位情况**

- **对线情况**

- **双方经济以及经验数据分析**

- **双方经验差**

#### 视野数据的分析

In [None]:
# Basic range check of the blue side's ward count: mean (2 decimals), min, max.
blue_wards = data['blueWardsPlaced']
print('mean :', round(blue_wards.mean(), 2))
print('min :', blue_wards.min())
print('max :', blue_wards.max())

竟然有一局游戏, 蓝色方10分钟插了250个眼, 不太科学...

数据中应该有离群点, 我们试着找一下

In [None]:
# Number of matches in which either side placed 50 or more wards.
ward_outlier_mask = (data['blueWardsPlaced'] >= 50) | (data['redWardsPlaced'] >= 50)
int(ward_outlier_mask.sum())

有1142个对局前10分钟, 有一方插到了50个眼, 显然这个数据不太正确. 

因此对于视野数据, 我们先把这些离群点给剔除, 再进行分析.

In [None]:
# Index labels of the matches in which either side placed 50 or more wards.
drop_index = data.index[(data['blueWardsPlaced'] >= 50) | (data['redWardsPlaced'] >= 50)]

In [None]:
# New frame without the ward-count outlier rows; `data` itself is left untouched.
data_v1 = data.drop(drop_index, axis=0)

求出双方的插眼情况的比值

In [None]:
# Blue-to-red ratio of wards placed; both counts are shifted by 1 before dividing.
blue_wards_plus_one = data_v1['blueWardsPlaced'].add(1)
red_wards_plus_one = data_v1['redWardsPlaced'].add(1)
data_v1['divWardsPlaced'] = blue_wards_plus_one.div(red_wards_plus_one)

In [None]:
# Overlay the ward-ratio histograms of blue-win and red-win matches.
plt.figure(figsize=(12, 5))
for outcome, shade, tag in ((1, 'blue', 'blue_wins'), (0, 'red', 'red_wins')):
    ratio_for_outcome = data_v1.loc[data_v1['blueWins'] == outcome, 'divWardsPlaced']
    ratio_for_outcome.hist(bins=50, color=shade, label=tag, alpha=0.6)
plt.legend()
plt.show()

根据红蓝双方的插眼反眼数, 构造一个新的变量出来.

视野压制 = (蓝色方插眼数 - 红色方反眼数 + 1) / (红色方插眼数 - 蓝色方反眼数 + 1)

In [None]:
# Vision suppression, as defined in the text above:
#   (blue wards placed - wards destroyed by red + 1)
#   / (red wards placed - wards destroyed by blue + 1)
# The previous expression subtracted redWardsPlaced twice and never used
# redWardsDestroyed, so it did not measure the wards each side kept alive.
# NOTE(review): assumes a side never destroys more wards than the opponent
# placed, so the denominator stays positive - confirm against the data.
blue_surviving_wards = data_v1['blueWardsPlaced'] - data_v1['redWardsDestroyed']
red_surviving_wards = data_v1['redWardsPlaced'] - data_v1['blueWardsDestroyed']
data_v1['VisionSuppression'] = (blue_surviving_wards + 1) / (red_surviving_wards + 1)

In [None]:
# Overlay the VisionSuppression histograms of blue-win and red-win matches.
plt.figure(figsize=(12, 5))
for outcome, shade, tag in ((1, 'blue', 'blue_wins'), (0, 'red', 'red_wins')):
    suppression_for_outcome = data_v1.loc[data_v1['blueWins'] == outcome, 'VisionSuppression']
    suppression_for_outcome.hist(bins=100, color=shade, label=tag, alpha=0.6)
plt.legend()
plt.show()

暂时看不出和胜负有很大的关联性, 那么这个数据就暂时放一边吧

#### 一血率与获胜的关系

In [None]:
# Display the first rows of the full (unfiltered) data set.
data.head()

In [None]:
# Overlay, per match outcome, how many matches had blueFirstBlood == 0 / 1.
# value_counts() orders its result by frequency, so the two outcome groups can
# list the categories in a different order; a bar plot places bars by position,
# so both series are reindexed onto one sorted category list first. Without
# this, bars drawn at the same x position may describe different categories.
plt.figure(figsize=(8, 8))
first_blood_levels = sorted(data['blueFirstBlood'].unique())
for outcome, color, label in ((1, 'blue', 'blue_wins'), (0, 'red', 'red_wins')):
    counts = data.loc[data.blueWins == outcome, 'blueFirstBlood'].value_counts()
    counts.reindex(first_blood_levels, fill_value=0).plot(kind='bar', color=color, label=label, alpha=0.6)
plt.legend()

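蓝色方不拿一血的胜率竟然比拿一血的胜率要高. (注意: 这一结论只有在两组柱状图的类别顺序一致时才成立; `value_counts()` 默认按频数降序排列, 两组的类别顺序可能不同, 叠加绘图前应先按类别对齐, 否则从图上读出的结论可能是相反的.)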

#### KDA对获胜的影响

In [None]:
# Team KDA per side: (kills + assists) / (deaths + 1).
for side in ('blue', 'red'):
    takedowns = data[side + 'Kills'] + data[side + 'Assists']
    data[side + 'KDA'] = takedowns / (data[side + 'Deaths'] + 1)

In [None]:
# Split violin plots of team KDA grouped by first blood, coloured by outcome.
# x and y are passed by keyword: current seaborn releases accept only `data`
# positionally, so the old positional call raises a TypeError there.
f, ax = plt.subplots(1, 2, figsize=(18, 8))

sns.violinplot(x="blueFirstBlood", y="blueKDA", hue="blueWins", data=data, split=True, ax=ax[0])
ax[0].set_title('blueFirstBlood & blueKDA vs blueWins')
ax[0].set_yticks(range(0, 20, 10))

sns.violinplot(x="redFirstBlood", y="redKDA", hue="blueWins", data=data, split=True, ax=ax[1])
ax[1].set_title('redFirstBlood & redKDA vs blueWins')
ax[1].set_yticks(range(0, 20, 10))

plt.show()

可以看到的是, 不管前期有没有拿一血, 获胜方的前10分钟,队伍KDA都比较高

#### 野区的情况

首先查看重要资源掌控情况与获胜的关系.

先查看这三个特征的分布情况

blueEliteMonsters, blueDragons, blueHeralds

查看这三个变量与获胜之间的关系.

In [None]:
# For each blue-side objective column, overlay the per-value match counts of
# blue wins and red wins. value_counts() orders by frequency, which can differ
# between the two groups, so both series are reindexed onto the same sorted
# category list; otherwise bars sharing an x position could describe different
# values and the tick labels would only match the last series drawn.
plt.figure(figsize=(20, 5))
objective_panels = [
    ('blueEliteMonsters', 'EliteMonsters'),
    ('blueDragons', 'Dragons'),
    ('blueHeralds', 'Heralds'),
]
for position, (column, title) in enumerate(objective_panels, start=1):
    plt.subplot(1, 3, position)
    categories = sorted(data[column].unique())
    for outcome, color, label in ((1, 'blue', 'blueWins'), (0, 'red', 'redWins')):
        counts = data.loc[data.blueWins == outcome, column].value_counts()
        counts.reindex(categories, fill_value=0).plot(kind='bar', color=color, label=label, alpha=0.5)
    plt.legend()
    plt.title(title)
plt.show()

将这三个变量与红色方的对应3个变量作差

In [None]:
# Blue-minus-red difference for each objective count.
# The 'diffHearlds' spelling is kept as-is because other cells read that column.
for diff_column, objective in (
        ('diffEliteMonsters', 'EliteMonsters'),
        ('diffDragons', 'Dragons'),
        ('diffHearlds', 'Heralds')):
    data[diff_column] = data['blue' + objective] - data['red' + objective]

In [None]:
# For each objective-difference column, overlay the per-value match counts of
# blue wins and red wins. The two groups are reindexed onto one sorted list of
# all observed values: value_counts() orders by frequency and may even miss
# values in one group, so plotting the raw results would put different values
# at the same x position.
plt.figure(figsize=(20, 5))
diff_columns = ['diffEliteMonsters', 'diffDragons', 'diffHearlds']
for position, column in enumerate(diff_columns, start=1):
    plt.subplot(1, 3, position)
    categories = sorted(data[column].unique())
    for outcome, color, label in ((1, 'blue', 'blueWins'), (0, 'red', 'redWins')):
        counts = data.loc[data.blueWins == outcome, column].value_counts()
        counts.reindex(categories, fill_value=0).plot(kind='bar', color=color, label=label, alpha=0.5)
    plt.legend()
    plt.title(column)
plt.show()

野区对位情况

blueTotalJungleMinionsKilled

redTotalJungleMinionsKilled

我们将两个变量相减得到

diffTotalJungleMinionsKilled

In [None]:
# Blue-minus-red difference in jungle monsters killed.
data['diffTotalJungleMinionsKilled'] = data['blueTotalJungleMinionsKilled'].sub(
    data['redTotalJungleMinionsKilled'])

In [None]:
# Overlay the jungle-difference histograms of blue wins and red wins.
# The labels were passed before but no legend was drawn, so the two colours
# were unexplained on the figure; the legend is now rendered.
for outcome, color, label in ((1, 'blue', 'blueWins'), (0, 'red', 'redWins')):
    data.loc[data.blueWins == outcome, 'diffTotalJungleMinionsKilled'].hist(
        bins=40, color=color, label=label, alpha=0.5)
plt.legend()
plt.show()

可以看到的是, 野区优势对获胜也有一定的影响

#### 对线对获胜的影响

红蓝双方推塔数以及双方的补刀差

首先看双方的补刀差

In [None]:
# Blue-minus-red difference in lane minions killed.
data['diffTotalMinionsKilled'] = data['blueTotalMinionsKilled'].sub(data['redTotalMinionsKilled'])

In [None]:
# Overlay the minion-difference histograms of blue wins and red wins.
# The labels were passed before but no legend was drawn, so the two colours
# were unexplained on the figure; the legend is now rendered.
for outcome, color, label in ((1, 'blue', 'blueWins'), (0, 'red', 'redWins')):
    data.loc[data.blueWins == outcome, 'diffTotalMinionsKilled'].hist(
        bins=30, color=color, label=label, alpha=0.5)
plt.legend()
plt.show()

红蓝双方推塔数对获胜的影响

In [None]:
# Per side, overlay the counts of towers destroyed for blue wins and red wins.
# Both outcome groups are reindexed onto the same sorted list of tower counts:
# value_counts() orders by frequency and can omit values in one group, which
# would otherwise put different tower counts at the same x position.
plt.figure(figsize=(16, 5))
tower_panels = [('blueTowersDestroyed', 'Blue_Tower'), ('redTowersDestroyed', 'Red_Tower')]
for position, (column, title) in enumerate(tower_panels, start=1):
    plt.subplot(1, 2, position)
    categories = sorted(data[column].unique())
    for outcome, color, label in ((1, 'blue', 'blueWins'), (0, 'red', 'redWins')):
        counts = data.loc[data.blueWins == outcome, column].value_counts()
        counts.reindex(categories, fill_value=0).plot(kind='bar', color=color, label=label, alpha=0.5)
    plt.legend()
    plt.title(title)

In [None]:
# Blue-minus-red difference in towers destroyed.
data['diffTowersDestroyed'] = data['blueTowersDestroyed'].sub(data['redTowersDestroyed'])

In [None]:
# Overlay the tower-difference counts of blue wins and red wins.
# Both groups are reindexed onto one sorted list of all observed differences,
# because value_counts() orders by frequency (and may lack some values in one
# group), which would misalign the overlaid bars.
tower_diff_levels = sorted(data['diffTowersDestroyed'].unique())
for outcome, color, label in ((1, 'blue', 'blueWins'), (0, 'red', 'redWins')):
    counts = data.loc[data.blueWins == outcome, 'diffTowersDestroyed'].value_counts()
    counts.reindex(tower_diff_levels, fill_value=0).plot(kind='bar', color=color, label=label, alpha=0.5)
plt.legend()
plt.title('diffTower')

在前10分钟是否推掉对面防御塔是十分关键的.

#### 经济以及经验对获胜的影响

查看经济差以及经验差

In [None]:
# Side-by-side histograms of the gold and experience differences, each split
# into blue-win and red-win matches.
plt.figure(figsize=(16, 5))
for panel, feature in enumerate(('blueGoldDiff', 'blueExperienceDiff'), start=1):
    plt.subplot(1, 2, panel)
    for outcome, shade, tag in ((1, 'blue', 'blueWins'), (0, 'red', 'redWins')):
        data.loc[data['blueWins'] == outcome, feature].hist(bins=50, color=shade, label=tag, alpha=0.5)
    plt.legend()
    plt.title(feature)
plt.show()

#### 组合成最后的数据集

保留 : blueWins, blueFirstBlood, redFirstBlood, blueKDA, redKDA, diffEliteMonsters, diffDragons, diffHeralds (代码中该列名拼写为 diffHearlds), diffTowersDestroyed, blueGoldDiff, blueExperienceDiff, diffTotalJungleMinionsKilled, diffTotalMinionsKilled
(gameId 只是对局编号, 不作为特征, 因此没有放入最终数据集.)

In [None]:
# Display the first rows again, now including the derived columns added above.
data.head()

In [None]:
# Target column first, followed by the engineered features used for modelling.
choose_cols = [
    'blueWins',
    'blueFirstBlood',
    'redFirstBlood',
    'blueKDA',
    'redKDA',
    'diffEliteMonsters',
    'diffDragons',
    'diffHearlds',
    'diffTowersDestroyed',
    'blueGoldDiff',
    'blueExperienceDiff',
    'diffTotalJungleMinionsKilled',
    'diffTotalMinionsKilled',
]
# Independent copy, so later edits to data_final never write back into `data`.
data_final = data.loc[:, choose_cols].copy()

In [None]:
# Display the first rows of the modelling data set.
data_final.head()

#### 变量之间的相关性

In [None]:
# Pairwise Pearson correlation (the pandas default) between all selected columns.
corr_matrix = data_final.corr(method='pearson')

In [None]:
# Annotated heatmap of the correlation matrix on a 15x15 inch figure.
corr_fig, corr_ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="YlGnBu", ax=corr_ax)

In [None]:
# Horizontal bars: correlation of every column with the blueWins target.
corr_matrix['blueWins'].plot.barh()

### 训练

首先, 划分训练集和测试集 8:2

In [None]:
from sklearn.model_selection import train_test_split

# Features are every selected column except the target.
X = data_final.drop(columns=['blueWins'])
y = data_final['blueWins']

# 80/20 split. A fixed random_state makes the split (and therefore every score
# reported below) reproducible between runs; stratify=y keeps the win/loss
# proportion the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

标准化

In [None]:
# Columns passed to the StandardScaler in the next cell; the remaining feature
# columns are left unscaled.
sc_cols = ['blueKDA', 'redKDA', 'blueGoldDiff', 'blueExperienceDiff', 'diffTotalJungleMinionsKilled', 'diffTotalMinionsKilled']

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# The frames returned by train_test_split are selections of X; assigning
# columns on them directly triggers pandas' SettingWithCopyWarning. Work on
# explicit copies so the assignment is unambiguous and X stays untouched.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training part only, then apply the same transformation
# to the test part, so no test-set statistics leak into the scaling.
X_train[sc_cols] = sc.fit_transform(X_train[sc_cols])
X_test[sc_cols] = sc.transform(X_test[sc_cols])

In [None]:
# Display the first rows of the training features after scaling.
X_train.head()

看一下所有分类器的baseline

In [None]:
class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and summarise the per-split scores.

    Parameters
    ----------
    models : dict
        Maps an estimator name to an (unfitted) estimator object.
    params : dict
        Maps the same names to the parameter grid searched for that estimator.
        Every key of ``models`` must also be a key of ``params``.

    Raises
    ------
    ValueError
        If some estimator in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        missing_params = list(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # name -> fitted GridSearchCV object
        self.grid_searches = {}
        # name -> {'score': best mean CV score, 'params': best parameter set}
        self.best = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Cross-validate every parameter combination of every estimator.

        The keyword arguments are forwarded unchanged to ``GridSearchCV``.
        Results are stored in ``self.grid_searches`` and ``self.best``.
        """
        # Imported locally: nothing else in this file brings GridSearchCV into
        # scope, so the class would otherwise fail with a NameError here.
        from sklearn.model_selection import GridSearchCV

        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.best[key] = {'score': gs.best_score_, 'params': gs.best_params_}
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter set) with split statistics.

        Each row holds the estimator name, the min / mean / max / std of the
        test score over the CV splits, and the parameter values. Rows are
        sorted by ``sort_by`` in descending order. An empty frame with the
        fixed columns is returned when nothing has been fitted yet.
        """
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']

        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            print(k)
            results = gs.cv_results_
            params = results['params']

            # Discover the split columns from cv_results_ itself instead of
            # range(gs.cv): `cv` may be None or a splitter object, not an int.
            scores = []
            split = 0
            while "split{}_test_score".format(split) in results:
                r = np.asarray(results["split{}_test_score".format(split)])
                scores.append(r.reshape(len(params), 1))
                split += 1
            if not scores:
                raise ValueError("No per-split test scores found for estimator: %s" % k)

            # One row per parameter set, one column per split.
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        if not rows:
            return pd.DataFrame(columns=columns)

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Fixed statistic columns first, then whatever parameter columns exist.
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

    def best_params(self):
        """Return the dict of best score / best parameters per estimator."""
        return self.best

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

# Baseline candidates: every classifier is created with its default settings.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'GradientBoost': GradientBoostingClassifier(),
    'RandomForest': RandomForestClassifier(),
    'ETC': ExtraTreesClassifier(),
    'Bag': BaggingClassifier(),
    'XGBoost': XGBClassifier(),
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(),
    'NB': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'NN': MLPClassifier(),
}

# An empty grid per model: only the default configuration is cross-validated.
params = {model_name: {} for model_name in models}


In [None]:
# Cross-validate every baseline model with 5 folds on the training data, then
# display the per-model score summary sorted by mean score.
Helper = EstimatorSelectionHelper(models, params)
Helper.fit(X_train, y_train, cv=5)
Helper.score_summary(sort_by='mean_score')

逻辑回归的效果最好, 那么我们将逻辑回归单独拿出来进行超参数微调 (fine-tune).

In [None]:
# Fresh logistic-regression estimator to be tuned on its own.
lr = LogisticRegression()

# Search grid: regularisation strength C and penalty type, with the
# 'liblinear' solver as the only solver candidate.
params = dict(
    C=[0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

In [None]:
# GridSearchCV was used without ever being imported in this notebook, which
# raises a NameError; import it here where it is needed.
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over the logistic-regression grid.
gs = GridSearchCV(lr, params, cv=5)
gs.fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the grid search.
gs.best_params_

In [None]:
# Configure the standalone estimator with the parameters the search selected.
lr.set_params(**gs.best_params_)

In [None]:
# Fit the configured logistic regression on the full training set.
lr.fit(X_train, y_train)

In [None]:
# Display the fitted coefficients (one per feature column of X_train).
lr.coef_

In [None]:
# Predict the outcome label for every row of the held-out test set.
y_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Text report of the test-set predictions against the true labels.
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# Display the feature names, in the column order the model was trained on.
X_train.columns

In [None]:
# Empty DataFrame whose columns are the feature names.
# NOTE(review): no visible cell fills or reads this frame afterwards - confirm
# whether it is still needed.
model_importance = pd.DataFrame(columns = X_train.columns)

In [None]:
# Map each feature name to its fitted logistic-regression coefficient.
data_dict = dict(zip(X_train.columns, lr.coef_[0]))

In [None]:
# Display the feature-name -> coefficient mapping.
data_dict