In [2]:
#データの読込
import numpy as np
import pandas as pd
import codecs

# Read the Shift-JIS encoded input twice (undecodable bytes are ignored):
# `df` is the working copy that later cells filter and transform, while
# `df2` stays untouched as the raw reference frame.
loaded_frames = []
for _ in range(2):
    with codecs.open("input_data.csv", "r", "Shift-JIS", "ignore") as handle:
        loaded_frames.append(pd.read_csv(handle))
df, df2 = loaded_frames

In [3]:
# Keep only the players in the upper half by playing time.
# The selection is derived from the untouched raw frame `df2` instead of
# from `df` itself, so re-running this cell gives the same result rather than
# filtering an already-filtered frame and failing on the missing column.
median = df2["プレイタイム"].median()
df = df2.loc[df2["プレイタイム"] >= median]

# Temporarily drop the position labels. `drop` returns a new frame, so the
# raw data is never mutated.
df = df.drop("position", axis=1)

# Standardize every remaining column: (value - column mean) / column std.
# NaN results (e.g. 0/0 for a constant column) are replaced by 0.
df_st = df.apply(lambda x: (x-x.mean())/x.std(), axis=0).fillna(0)

# Restore the un-standardized player ID and 6man columns, and re-attach the
# position labels from the raw frame. The assignments align on the row index,
# which at this point is still the original index of the input file.
df_st["選手ID"] = df["選手ID"]
df_st["6man"] = df["6man"]
df_st["position"] = df2["position"]

# Renumber the rows from 0
df_st = df_st.reset_index(drop = True)

In [4]:
# CSV output.
# The file is read back later with codecs.open(..., "Shift-JIS", "ignore"),
# so it is written explicitly as Shift-JIS. Relying on the default output
# encoding garbles the Japanese column names on read-back whenever that
# default is not Shift-JIS compatible.
df_st.to_csv("clustering_data.csv", encoding="Shift-JIS")

In [5]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

with codecs.open("clustering_data.csv","r","Shift-JIS","ignore") as file:
    playerdf = pd.read_csv(file)

# Remove player ID, 6man and position so that only the feature columns
# (plus the saved row-number column) remain.
playerdf = playerdf.drop(["選手ID", "6man", "position"], axis=1)

# Convert the DataFrame to an ndarray.
# `.values` is used because DataFrame.as_matrix() was deprecated and then
# removed in pandas 1.0.
player = playerdf.values

# Drop column 0, which holds the row numbers written out by to_csv
player = np.delete(player,0,1)

# Search for a suitable number of clusters (elbow method): fit k-means for
# k = 1..20 and record the within-cluster sum of squared errors (inertia).
# random_state is fixed so the curve is reproducible between runs.
cluster_counts = range(1,21)
distortions = []

for n_clusters in cluster_counts:
    fitted = KMeans(n_clusters=n_clusters, random_state=0).fit(player)
    distortions.append(fitted.inertia_)

# Plot the within-cluster sum of squared errors against k
plt.plot(cluster_counts,distortions,marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')

# Save the figure as an image
plt.savefig("clustering_k.png")

In [9]:
# Cluster with the number of clusters that is taken to be optimal here.
# Note: despite its name, `km` holds the per-player cluster labels returned
# by fit_predict, not the estimator. random_state is fixed so that the
# cluster ids are reproducible between runs.
km = KMeans(n_clusters=5, random_state=0).fit_predict(player)

# Reduce the features to two dimensions with PCA for plotting
pca = PCA(n_components=2)
player_r = pca.fit_transform(player)

# Position label of every player (row order matches `player_r`)
dfpos = df_st["position"]

# Single source of truth for the position -> colour mapping; used for both
# the scatter points and the star markers below.
position_colors = {
    "PG": "red",
    "G": "orange",
    "PG/SG": "yellow",
    "SG": "greenyellow",
    "G/F": "green",
    "SG/SF": "darkseagreen",
    "SF": "skyblue",
    "F": "royalblue",
    "SF/PF": "blue",
    "PF": "mediumblue",
    "F/C": "darkblue",
    "PF/C": "blueviolet",
    "C": "purple",
}

# Plot the result as a scatter chart, coloured by position.
# Players whose position is missing from the table map to NaN and are not
# drawn. All remaining points are drawn with one scatter call instead of
# one call per player.
point_colors = dfpos.map(position_colors)
drawn = point_colors.notnull().values

plt.figure()
if drawn.any():
    plt.scatter(player_r[drawn, 0], player_r[drawn, 1],
                c=point_colors[drawn].tolist())

plt.title("player clustering")
plt.xlim([-7,15])
plt.ylim([-6,7])

# Hard-coded row numbers of the players to highlight with a star.
# NOTE(review): presumably these are the sixth-man players; consider deriving
# them from df_st["6man"] once the encoding of that column is confirmed.
sixman = [22, 28, 31, 32, 52, 85, 158, 165, 172, 181, 217, 223]
for row in sixman:
    star_color = position_colors.get(dfpos[row])
    if star_color is not None:
        plt.plot(player_r[row,0], player_r[row,1], color = star_color, marker = "*", markersize = 15)

# Save the figure as an image
plt.savefig("clustering_output.png")

In [6]:
# Add the cluster number, player ID and 6man columns to the player data.
# The ID and 6man assignments align on the row index.
playerdf["clusterid"] = km
playerdf["選手ID"] = df_st["選手ID"]
playerdf["6man"] = df_st["6man"]

# Remove the saved row-number column. errors="ignore" keeps the cell
# re-runnable: a second run no longer fails because the column is already gone.
playerdf = playerdf.drop("Unnamed: 0", axis=1, errors="ignore")

# CSV output
playerdf.to_csv("clustering_result_data.csv")

In [7]:
# Feature columns plus clusterid only, without player ID and 6man.
# `drop` returns a new frame, so `playerdf` keeps its columns. Plain
# assignment would only alias `playerdf`, so deleting columns would also strip
# them from `playerdf` and make this cell fail with a KeyError on re-run.
player_c = playerdf.drop(["選手ID", "6man"], axis=1)

In [8]:
# Characteristics of each cluster: the column-wise mean over the rows that
# belong to cluster 0..4.
c0, c1, c2, c3, c4 = [
    player_c[player_c['clusterid']==label].mean() for label in range(5)
]

# Write one CSV per cluster: cluster0.csv ... cluster4.csv
for label, profile in enumerate([c0, c1, c2, c3, c4]):
    profile.to_csv("cluster" + str(label) + ".csv")