# 利用SGD分类器和回归方法对星际争霸2玩家段位预测

## 一、利用SGD分类器进行预测

In [1]:
import IPython
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import pydot
import pyparsing

In [2]:
star_craft_csv = pd.read_csv('SkillCraft.csv')
star_craft_csv

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed
0,52,5,27,10,3000,143.7180,0.003515,0.000220,7,0.000110,0.000392,0.004849,32.6677,40.8673,4.7508,28,0.001397,6,0.000000,0.000000
1,55,5,23,10,5000,129.2322,0.003304,0.000259,4,0.000294,0.000432,0.004307,32.9194,42.3454,4.8434,22,0.001194,5,0.000000,0.000208
2,56,4,30,10,200,69.9612,0.001101,0.000336,4,0.000294,0.000461,0.002926,44.6475,75.3548,4.0430,22,0.000745,6,0.000000,0.000189
3,57,3,19,20,400,107.6016,0.001034,0.000213,1,0.000053,0.000543,0.003783,29.2203,53.7352,4.9155,19,0.000426,7,0.000000,0.000384
4,58,3,32,10,500,122.8908,0.001136,0.000327,2,0.000000,0.001329,0.002368,22.6885,62.0813,9.3740,15,0.001174,4,0.000000,0.000019
5,60,2,27,6,70,44.4570,0.000978,0.000255,2,0.000000,0.000000,0.002425,76.4405,98.7719,3.0965,16,0.000372,6,0.000000,0.000000
6,61,1,21,8,240,46.9962,0.000820,0.000169,6,0.000000,0.000045,0.001988,94.0227,90.5311,4.1017,15,0.000573,5,0.000000,0.000000
7,72,7,17,42,10000,212.6022,0.009040,0.000676,6,0.001164,0.001253,0.004952,24.6117,41.7671,6.6104,45,0.002277,9,0.000129,0.000249
8,77,4,20,14,2708,117.4884,0.002944,0.000527,2,0.000019,0.000414,0.005399,52.0140,46.4321,3.3746,29,0.001035,7,0.000273,0.000470
9,81,4,18,24,800,155.9856,0.005054,0.000524,8,0.000025,0.000399,0.003569,24.4632,52.1538,6.5664,27,0.001310,6,0.000000,0.000000


In [3]:
import csv

with open('SkillCraft.csv', 'r') as csvfile:
    sc_reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    
    sc_X, sc_y = [], []

    for row in sc_reader:
        sc_X.append(row)
        sc_y.append(row[1]) # The target value is "level"
        
    feature_names = np.array(sc_X[0])

    sc_X = np.array(sc_X[1:])
    sc_y = np.array(sc_y[1:])

print(feature_names)

['GameID' 'LeagueIndex' 'Age' 'HoursPerWeek' 'TotalHours' 'APM'
 'SelectByHotkeys' 'AssignToHotkeys' 'UniqueHotkeys' 'MinimapAttacks'
 'MinimapRightClicks' 'NumberOfPACs' 'GapBetweenPACs' 'ActionLatency'
 'ActionsInPAC' 'TotalMapExplored' 'WorkersMade' 'UniqueUnitsMade'
 'ComplexUnitsMade' 'ComplexAbilitiesUsed']


In [4]:
print(sc_X)

[['52' '5' '27' ... '6' '0' '0']
 ['55' '5' '23' ... '5' '0' '0.00020757']
 ['56' '4' '30' ... '6' '0' '0.00018876']
 ...
 ['9265' '4' '21' ... '7' '0' '0']
 ['9270' '3' '20' ... '5' '0' '0']
 ['9271' '4' '22' ... '8' '0' '5.3891e-05']]


In [5]:
print(sc_y)

['5' '5' '4' ... '4' '3' '4']


In [6]:
print (feature_names, sc_X[0], sc_y[0])

['GameID' 'LeagueIndex' 'Age' 'HoursPerWeek' 'TotalHours' 'APM'
 'SelectByHotkeys' 'AssignToHotkeys' 'UniqueHotkeys' 'MinimapAttacks'
 'MinimapRightClicks' 'NumberOfPACs' 'GapBetweenPACs' 'ActionLatency'
 'ActionsInPAC' 'TotalMapExplored' 'WorkersMade' 'UniqueUnitsMade'
 'ComplexUnitsMade' 'ComplexAbilitiesUsed'] ['52' '5' '27' '10' '3000' '143.718' '0.0035151591' '0.0002196974' '7'
 '0.0001098487' '0.0003923169' '0.0048490365' '32.6677' '40.8673' '4.7508'
 '28' '0.0013966' '6' '0' '0'] 5


In [7]:
# 保留主要信息

sc_X = sc_X[:, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
feature_names = feature_names[[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

print (feature_names)

['HoursPerWeek' 'TotalHours' 'APM' 'SelectByHotkeys' 'AssignToHotkeys'
 'UniqueHotkeys' 'MinimapAttacks' 'MinimapRightClicks' 'NumberOfPACs'
 'GapBetweenPACs' 'ActionLatency' 'ActionsInPAC' 'TotalMapExplored'
 'WorkersMade' 'UniqueUnitsMade' 'ComplexUnitsMade' 'ComplexAbilitiesUsed']


In [8]:
print (sc_X[:20], sc_y[:20])

[['10' '3000' '143.718' '0.0035151591' '0.0002196974' '7' '0.0001098487'
  '0.0003923169' '0.0048490365' '32.6677' '40.8673' '4.7508' '28'
  '0.0013966' '6' '0' '0']
 ['10' '5000' '129.2322' '0.0033038124' '0.0002594617' '4' '0.0002940566'
  '0.0004324362' '0.0043070643' '32.9194' '42.3454' '4.8434' '22'
  '0.0011935' '5' '0' '0.00020757']
 ['10' '200' '69.9612' '0.0011010906' '0.0003355705' '4' '0.0002936242'
  '0.0004614094' '0.002925755' '44.6475' '75.3548' '4.043' '22'
  '0.00074455' '6' '0' '0.00018876']
 ['20' '400' '107.6016' '0.0010335422' '0.0002131015' '1'
  '5.32753697310659e-05' '0.0005434088' '0.0037825513' '29.2203'
  '53.7352' '4.9155' '19' '0.0004262' '7' '0' '0.00038358']
 ['10' '500' '122.8908' '0.0011360136' '0.0003273259' '2' '0'
  '0.0013285582' '0.0023682994' '22.6885' '62.0813' '9.374' '15'
  '0.0011745' '4' '0' '1.9254e-05']
 ['6' '70' '44.457' '0.0009783903' '0.0002552323' '2' '0' '0'
  '0.0024247065' '76.4405' '98.7719' '3.0965' '16' '0.00037221' '6' '0'
  '0'

In [9]:
print(sc_X.shape, sc_y.shape)

(3337, 17) (3337,)


In [10]:
# 导入分解数据模块
from sklearn.model_selection import train_test_split

In [11]:
X, y = sc_X, sc_y

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
print (X_train.shape, y_train.shape)

(2502, 17) (2502,)


In [13]:
# 导入标准化函数
from sklearn.preprocessing import StandardScaler

In [14]:
# 特征标准化
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)



In [15]:
#colors = ['red', 'greenyellow', 'blue']
#for i in range(len(colors)):
#    px = X_train[:, 0][y_train == i]
#    py = X_train[:, 1][y_train == i]
#    plt.scatter(px, py, c=colors[i])
#
#plt.legend(feature_names)
#plt.xlabel('APM')
#plt.ylabel('ACTIONINPAC')

### 检验模型

In [16]:
# 创建对象 线性模型分类器 linear model classifier
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()

In [17]:
# 拟合（训练）分类器
clf.fit(X_train, y_train) # 注意参数设置警告



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [18]:
# 输出 学习得到的系数
print (clf.coef_)
print (clf.intercept_)

[[  1.42121633 -17.69282171  -7.59665013 -14.50698881  -3.24884569
   -1.16018359  -6.89178067  -1.80477967  -0.46017644  -1.3953837
   -2.0991645    1.34651679   3.41202542  -4.55221129   1.12612344
   -1.63867048  -7.24311426]
 [ -0.69590593  -7.21173598  -3.61556418  -2.47282751  -4.37822818
   -3.69149325  -5.90126729  -1.67310211  -3.48161168  -5.69229102
   -3.69466173  -1.26917671  -0.33156062  -0.69322553  -0.72622165
    2.31201711  -2.87004794]
 [ -2.05173022  -0.79058735  -7.57386304  -1.49650571   1.022582
   -3.79696449  -3.75423654   0.34455436  -0.4604971    1.41461922
    0.98769344  -0.90563266   0.80431787  -2.61954402  -0.28025117
   -3.81940794  -1.64289052]
 [ -2.37825156  -2.79729244  -2.13632425  -2.43411968   1.61272433
    0.31641371  -2.71415047   1.94479955  -4.48069657  -3.51745726
   -6.63854961  -0.21299897   1.02940235  -1.168498     1.19490654
    0.33536574   1.72864254]
 [ -3.19104394   3.21269362  -2.16982299   0.5959808    0.20776751
    0.63282741  

In [19]:
from sklearn import metrics

In [20]:
# 根据模型得到的预测值
y_train_pred = clf.predict(X_train)

# 与人工值比较，并输出得分
train_score = metrics.accuracy_score(y_train, y_train_pred)
train_score

0.3405275779376499

In [21]:
# 测试集也要标准化
X_test = scaler.transform(X_test)

# 预测
y_pred = clf.predict(X_test)

# 与人工值比较，并输出得分
test_score = metrics.accuracy_score(y_test, y_pred)
test_score



0.3221556886227545

####  K-折交叉检验分类器

In [22]:
# 导入函数等
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline

In [23]:
# 创建管线实现的复合估计器
clf = Pipeline([
        ('scaler', StandardScaler()),
        ('linear_model', SGDClassifier())
])

In [24]:
# 创建 k-折交叉验证迭代器，取 k=5
cv = KFold( 5,shuffle = True, random_state=33)

In [25]:
# 得分
cross_scores = cross_val_score(clf, X, y, cv=cv)
print(cross_scores)



[0.34281437 0.32035928 0.24137931 0.34932534 0.30284858]




#### 交叉验证精度的均值和标准差

In [26]:
from scipy.stats import sem

def mean_score(scores):
    """Print the empirical mean score and standard error of the mean."""
    return ("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), sem(scores))

print(mean_score(cross_scores))

Mean score: 0.311 (+/-0.019)


## 二、利用回归方法预测

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

In [28]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2502, 17), (835, 17), (2502,), (835,))

In [29]:
# 转换为二维数组
y_train = y_train.reshape(-1,1) # 1 列
y_test  = y_test.reshape(-1,1)  # 1 列

In [30]:
# 导入标准化器
from sklearn.preprocessing import StandardScaler

In [31]:
scalerX = StandardScaler().fit(X_train)
scalery = StandardScaler().fit(y_train)



In [32]:
# 训练样本集、训练目标集 标准化
X_train = scalerX.transform(X_train)
y_train = scalery.transform(y_train)



In [33]:
# 测试样本集、测试目标集 标准化
X_test = scalerX.transform(X_test)
y_test = scalery.transform(y_test)



In [34]:
# 转为一维数组
y_train = y_train.reshape(-1,)
y_test  = y_test.reshape(-1,)

### 建立线性回归模型

In [35]:
from sklearn.model_selection import cross_val_score, KFold

In [36]:
def train_and_evaluate(clf, X_train, y_train):
    clf.fit(X_train, y_train)  # 训练
    print ("在训练集上，决定系数：",clf.score(X_train, y_train))

    # 创建 K-折
    cv = KFold(5, shuffle=True, random_state=33)                    # ms
    #cv = KFold(X_train.shape[0], 5, shuffle=True, random_state=33) # cv

    scores = cross_val_score(clf, X_train, y_train, cv=cv)
    print ("使用 K-折交叉验证的 平均决定系数:",np.mean(scores))

In [37]:
from sklearn import linear_model

In [38]:
clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None, max_iter=5, tol=None, random_state=42)
train_and_evaluate(clf_sgd, X_train, y_train)

在训练集上，决定系数： 0.5487036800304088
使用 K-折交叉验证的 平均决定系数: 0.5364564611616339




In [39]:
print(clf_sgd.coef_, "\n")
print(clf_sgd.intercept_)

[ 5.53071819e-02  8.89282321e-02  5.67015529e-02  7.59342924e-02
  1.26975424e-01  4.09743965e-02  1.14582249e-01  3.06114561e-05
  1.50272566e-01 -1.34786295e-01 -2.26921720e-01  1.34353551e-02
 -1.82452041e-02  6.67159026e-02 -2.35380351e-02  1.24703502e-02
  6.46602644e-03] 

[-0.01251698]


In [40]:
clf_sgd_l2 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', max_iter=5, tol=None, random_state=42)
train_and_evaluate(clf_sgd_l2, X_train, y_train)



在训练集上，决定系数： 0.54870292835222
使用 K-折交叉验证的 平均决定系数: 0.5364582747942068




####  比较两次结果得分


- 几乎没有变化
- 说明对于本例
    - <font color="red">线性回归模型效果一般</font>

### 将支持向量机 SVM 用于回归

In [41]:
# 导入模块 svm
from sklearn import svm

In [42]:
clf_svr = svm.SVR(kernel='linear')
train_and_evaluate(clf_svr, X_train, y_train)

在训练集上，决定系数： 0.5345100139319723
使用 K-折交叉验证的 平均决定系数: 0.5168210574881118


In [43]:
clf_svr_poly = svm.SVR(kernel='poly')
train_and_evaluate(clf_svr_poly, X_train, y_train)

在训练集上，决定系数： 0.5508667566932626




使用 K-折交叉验证的 平均决定系数: -2.923700255810707


In [44]:
clf_svr_rbf = svm.SVR(kernel='rbf')
train_and_evaluate(clf_svr_rbf, X_train, y_train)

在训练集上，决定系数： 0.6950436293055504




使用 K-折交叉验证的 平均决定系数: 0.5629450366107253




### 16.5.6 将`极端随机森林`用于回归

In [45]:
# 导入 ensemble
from sklearn import ensemble

In [46]:
clf_et=ensemble.ExtraTreesRegressor(n_estimators=10, random_state=33)

In [47]:
train_and_evaluate(clf_et, X_train, y_train)

在训练集上，决定系数： 1.0
使用 K-折交叉验证的 平均决定系数: 0.5417181367312038


In [48]:
# 特征重要性
feature_importance = zip(clf_et.feature_importances_,  feature_names)
print (sorted(feature_importance, key=lambda x: x[0], reverse=True))

[(0.22309520429558863, 'ActionLatency'), (0.14373317556149065, 'APM'), (0.0889273754687495, 'GapBetweenPACs'), (0.08594488294261401, 'NumberOfPACs'), (0.07819917222530212, 'TotalHours'), (0.0704764435790732, 'AssignToHotkeys'), (0.055455961285549346, 'SelectByHotkeys'), (0.036470361878926126, 'MinimapAttacks'), (0.030180315046350548, 'WorkersMade'), (0.027545910378133615, 'HoursPerWeek'), (0.026560303494042897, 'UniqueUnitsMade'), (0.025460246660333762, 'UniqueHotkeys'), (0.024035810840080964, 'TotalMapExplored'), (0.023491586192466374, 'ActionsInPAC'), (0.023376515817469946, 'MinimapRightClicks'), (0.022416450785676497, 'ComplexAbilitiesUsed'), (0.014630283548151782, 'ComplexUnitsMade')]


In [49]:
from sklearn import metrics
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True, show_r2_score=False):
    y_pred=clf.predict(X)   
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format(metrics.accuracy_score(y,y_pred)),"\n")

    if show_classification_report:
        print ("分类报告")
        print (metrics.classification_report(y,y_pred),"\n")
        
    if show_confusion_matrix:
        print ("混淆矩阵")
        print (metrics.confusion_matrix(y,y_pred),"\n")
        
    if show_r2_score:
        print ("决定系数:{0:.3f}".format(metrics.r2_score(y,y_pred)),"\n")

In [50]:
measure_performance(X_test, y_test,
                    clf_et,
                    show_accuracy=False,
                    show_classification_report=False,
                    show_confusion_matrix=False,
                    show_r2_score=True)

决定系数:0.523 

