In [44]:
#データの読込
import numpy as np
import pandas as pd
import codecs

# Load the raw player table once. The file is decoded as Shift-JIS with the
# "ignore" error handler, so undecodable bytes are dropped silently instead of
# raising.
with codecs.open("input_data.csv", "r", "Shift-JIS", "ignore") as file:
    df = pd.read_csv(file)

# Independent copy of the raw table (previously obtained by opening and parsing
# the same file a second time). It stays unmodified so the original "position"
# column can be re-attached after preprocessing.
df2 = df.copy()

0         PG
1         PG
2         SF
3      PG/SG
4         SF
5         SF
6      SF/PF
7         PG
8         SG
9      PG/SG
10        PG
11        PF
12        SG
13     PG/SG
14        PG
15        PG
16        SG
17        SF
18        PF
19        PG
20        PG
21        SG
22     SF/PF
23      PF/C
24        SG
25        SG
26        PG
27        PG
28        SG
29        SF
       ...  
472    SF/PF
473     PF/C
474       SG
475       PG
476    PG/SG
477    SG/SF
478       PG
479       PG
480    SG/SF
481       PF
482       PG
483       PF
484    SG/SF
485    SG/SF
486       SF
487    SF/PF
488    SF/PF
489        G
490       PF
491       PF
492        C
493       SG
494    SF/PF
495       SG
496    PG/SG
497       PG
498       SG
499    SG/SF
500    SF/PF
501       SF
Name: position, dtype: object

In [40]:
# Keep only the players whose playing time is at or above the median.
# The selection starts from the untouched raw copy `df2` (read from the same
# file as `df`), so re-running this cell always gives the same result instead
# of filtering an already-filtered frame and failing on the missing column.
median = df2["プレイタイム"].median()
df = df2.loc[df2["プレイタイム"] >= median]

# Remove the position labels for now. drop() returns a new frame, so `df2`
# keeps its "position" column.
df = df.drop("position", axis=1)

# Standardize every remaining column: subtract the column mean and divide by
# the column standard deviation. Any NaN in the result (e.g. 0/0 for a
# constant column) is replaced by 0.
df_st = df.apply(lambda x: (x - x.mean()) / x.std(), axis=0).fillna(0)

# Put back the unscaled player ID and 6man columns, and re-attach the position
# labels. These assignments align on the original row index, which is why the
# index is only reset afterwards.
df_st["選手ID"] = df["選手ID"]
df_st["6man"] = df["6man"]
df_st["position"] = df2["position"]

# Renumber the rows from 0.
df_st = df_st.reset_index(drop=True)

In [41]:
# Write the standardized table to CSV.
# The encoding is stated explicitly so that it matches the Shift-JIS decoding
# applied when "clustering_data.csv" is read back. Relying on the pandas
# default (UTF-8 in current versions) garbles the Japanese column names on
# reload.
df_st.to_csv("clustering_data.csv", encoding="Shift-JIS")

In [43]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Reload the standardized data from disk.
# NOTE(review): the file is decoded as Shift-JIS with errors ignored; this has
# to match the encoding the file was written with -- confirm.
with codecs.open("clustering_data.csv", "r", "Shift-JIS", "ignore") as file:
    playerdf = pd.read_csv(file)

# Remove player ID, 6man and position so they are not used as features.
del playerdf["選手ID"]
del playerdf["6man"]
del playerdf["position"]

# Convert the frame to a NumPy array. DataFrame.as_matrix() was removed in
# pandas 1.0; .values is the equivalent that exists in every pandas version.
player = playerdf.values

# Drop column 0, which holds the row numbers saved with the CSV.
player = np.delete(player, 0, 1)

# Elbow method: record the within-cluster sum of squares (inertia) for
# k = 1..20. A fixed random_state makes the curve reproducible between runs.
distortions = []

for i in range(1, 21):
    km = KMeans(n_clusters=i, random_state=0).fit(player)
    distortions.append(km.inertia_)

# Plot the distortion curve on a fresh figure so nothing left over from an
# earlier plot ends up in the saved image.
plt.figure()
plt.plot(range(1, 21), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')

# Save the figure as an image.
plt.savefig("clustering_k.png")

In [20]:
# Cluster with k = 5, the value treated as optimal here.
# `km` holds the cluster label of every row (the result of fit_predict), not
# the fitted model. A fixed random_state keeps the labels reproducible.
km = KMeans(n_clusters=5, random_state=0).fit_predict(player)

# Reduce the features to two principal components for plotting.
pca = PCA(n_components=2)
player_r = pca.fit_transform(player)

# Colour for each cluster label, indexed by label 0..4.
cluster_colors = ['red', 'green', 'Cyan', 'orange', 'pink']
point_colors = [cluster_colors[label] for label in km]

# Draw all points with a single scatter call instead of one call per point.
plt.figure()
plt.scatter(player_r[:, 0], player_r[:, 1], c=point_colors)

plt.title("player clustering")

# Highlight selected rows with a large blue star.
# NOTE(review): these are hard-coded row indices, presumably the 6man players;
# they go stale if the input data or the filtering changes -- verify.
sixman = [22, 28, 31, 32, 52, 85, 158, 165, 172, 181, 217, 223]
for row in sixman:
    plt.plot(player_r[row, 0], player_r[row, 1], color="blue", marker="*", markersize=15)

# Save the figure as an image.
plt.savefig("clustering_output.png")

In [6]:
# Discard the row-number column that came back from the CSV, then append the
# cluster label, the player ID and the 6man column (in that order) to the
# feature table.
playerdf.pop("Unnamed: 0")
for column_name, column_values in (
    ("clusterid", km),
    ("選手ID", df_st["選手ID"]),
    ("6man", df_st["6man"]),
):
    playerdf[column_name] = column_values

# Write the combined table to CSV.
playerdf.to_csv("clustering_result_data.csv")

In [7]:
# Table used for the per-cluster summaries: everything except player ID and
# 6man. It is built as a new frame because plain assignment
# (`player_c = playerdf`) only aliases the same object, so deleting columns
# through `player_c` would also remove them from `playerdf` and raise a
# KeyError when the cell is run again.
player_c = playerdf.drop(["選手ID", "6man"], axis=1)

In [8]:
# Characteristics of each cluster: the column-wise mean over the rows that
# carry each cluster id 0..4.
cluster_means = [
    player_c[player_c['clusterid'] == cluster_id].mean()
    for cluster_id in range(5)
]
c0, c1, c2, c3, c4 = cluster_means

# Write one CSV per cluster profile.
output_paths = (
    "cluster0.csv",
    "cluster1.csv",
    "cluster2.csv",
    "cluster3.csv",
    "cluster4.csv",
)
for profile, out_path in zip(cluster_means, output_paths):
    profile.to_csv(out_path)