In [1]:
#データの読込
import numpy as np
import pandas as pd
import codecs

# Read the raw CSV. The file is decoded as Shift-JIS and any byte sequence that
# cannot be decoded is dropped silently (errors="ignore").
with codecs.open("input_data.csv", mode="r", encoding="Shift-JIS", errors="ignore") as file:
    df = pd.read_csv(file)

In [2]:
# Keep only the rows whose playing time ("プレイタイム") is at or above the median,
# i.e. the upper half by playing time.
# The filtered rows get their own name instead of overwriting `df`: rebinding
# `df` here made the cell non-idempotent (every re-run filtered the already
# filtered frame again with a new median).
median = df["プレイタイム"].median()
df_top = df.loc[df["プレイタイム"] >= median]

# Standardise every column to zero mean / unit standard deviation (z-score).
# A constant column has std 0 and yields NaN, which is replaced with 0.
df_st = df_top.apply(lambda col: (col - col.mean()) / col.std(), axis=0).fillna(0)

# The player ID ("選手ID") and the "6man" column must not be scaled, so their
# original values are copied back over the standardised ones.
# (Assignment aligns on the index, so this has to happen before the reset below.)
df_st["選手ID"] = df_top["選手ID"]
df_st["6man"] = df_top["6man"]

# Renumber the rows 0..n-1 now that half of them have been removed.
df_st = df_st.reset_index(drop=True)

df_st

Unnamed: 0,選手ID,スターティングフラグ,プレイタイム,得点,3P成功,3P試投,2P成功,2P試投,ダンク,フリースロー成功,...,アシスト,スティール,ブロックショット,被ブロックショット,ファストブレイクポイント,ポイントフロムターンオーバー,2Pインサイドポイント,セカンドチャンスポイント,6man,period
0,8446,-0.647309,-0.639868,-0.772244,-0.636158,-0.587716,-0.408355,-0.476046,-0.486037,-0.677279,...,0.488694,-0.193628,-0.734868,-1.123465,-1.016639,-0.884619,-0.870567,-0.187231,0,-0.882702
1,8447,-0.736174,-0.795194,-0.450415,0.314616,0.415048,-0.558849,-0.613210,-0.486037,-0.677279,...,0.734581,1.857230,-0.373591,-0.650169,0.876977,0.598468,-0.434321,-0.545651,0,-0.619608
2,8448,0.152475,-0.436682,-0.676114,-0.147189,-0.196638,-0.472852,-0.666882,-0.425100,-1.011226,...,-0.969062,-0.098239,-0.614443,-0.650169,0.738420,1.022208,-0.292835,-0.250482,0,-0.586721
3,8450,1.041123,1.290049,0.038599,0.396111,0.826182,-0.354607,-0.225573,-0.486037,0.553055,...,1.472241,3.812698,-0.253166,1.526992,1.107906,2.251051,-0.104188,-0.229398,0,0.860298
4,8452,0.656042,1.322012,1.171272,0.042966,-0.066278,1.236328,0.961192,2.987356,0.799121,...,1.033158,1.571064,1.392652,0.864378,1.385021,1.700190,1.310666,1.647032,0,0.794524
5,8454,-1.387850,-1.541013,-1.520394,-0.690488,-0.507494,-1.289819,-1.346738,-0.486037,-0.835464,...,-0.529979,-0.909043,-0.734868,-0.934147,-1.109010,-0.969367,-1.047424,-1.220323,0,-1.310231
6,8457,-1.121255,-1.096401,-0.926890,-0.446003,-0.607771,-0.784590,-1.018737,-0.486037,-0.466364,...,0.154991,0.426399,-0.815152,-0.934147,2.216365,0.089981,-0.516854,-0.925154,0,0.169675
7,8458,-0.943525,-1.601658,-1.114972,-1.125127,-1.119180,-0.548099,-0.344846,-0.486037,-0.712431,...,-0.969062,-1.147515,-0.333450,-0.366192,-1.016639,-0.969367,-0.811615,-0.714319,0,-2.724363
8,8459,0.596799,0.543365,-0.818220,0.694925,0.585518,-1.268320,-1.328847,-0.486037,-0.642126,...,-1.214948,-0.384405,-0.574301,-0.555510,-1.201382,-1.054115,-1.141748,-1.051655,0,0.794524
9,8460,-0.943525,-1.042840,-1.236181,-1.016468,-0.898572,-0.752341,-0.702664,-0.486037,-0.765160,...,-0.459725,-0.289016,-0.775010,-0.934147,-0.554781,-0.672749,-0.941310,-0.861903,0,0.564316


In [3]:
# Write the standardised table to CSV (the row index is written as the first,
# unnamed column).
# The encoding is stated explicitly because this file is read back as Shift-JIS
# further down; relying on the writer's default made the Japanese column names
# depend on the platform and could corrupt them on reload.
df_st.to_csv("clustering_data.csv", encoding="Shift-JIS")

In [4]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Reload the standardised table that was written to disk.
with codecs.open("clustering_data.csv","r","Shift-JIS","ignore") as file:
    playerdf = pd.read_csv(file)

# Remove the player ID and the "6man" column so that they are not used as
# clustering features.
del playerdf["選手ID"]
del playerdf["6man"]

# Build the feature matrix as a NumPy array.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0.
# The first column is skipped: it is the row index saved by to_csv
# ("Unnamed: 0"). It is left in `playerdf` itself.
player = playerdf.iloc[:, 1:].values

# Elbow method: fit k-means for k = 1..20 and record the within-cluster sum of
# squared distances (inertia) for each k.
# A fixed random_state makes the curve reproducible across runs.
distortions = []

for i in range(1, 21):
    km = KMeans(n_clusters=i, random_state=0).fit(player)
    distortions.append(km.inertia_)

# Plot inertia against the number of clusters.
plt.plot(range(1, 21), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')

# Save the figure as an image.
plt.savefig("clustering_k.png")

In [20]:
# Cluster the players with k = 5, the value taken as the working choice of
# cluster count. `km` holds one cluster label (0-4) per row of `player`.
# A fixed random_state keeps the cluster ids, and therefore the colours,
# stable between runs.
km = KMeans(n_clusters=5, random_state=0).fit_predict(player)

# Project the feature matrix onto its first two principal components so the
# clusters can be drawn in 2-D.
pca = PCA(n_components=2)
player_r = pca.fit_transform(player)

# Scatter plot of all players, coloured by cluster label.
# One vectorised scatter call replaces the original per-point calls; the
# label -> colour mapping is unchanged.
cluster_colors = ['red', 'green', 'Cyan', 'orange', 'pink']
plt.figure()
plt.scatter(player_r[:, 0], player_r[:, 1], c=[cluster_colors[label] for label in km])

plt.title("player clustering")

# Row positions to highlight with a large blue star.
# NOTE(review): presumably these are the rows flagged in the "6man" column;
# the list is hard-coded, so verify it against the current data.
sixman = [22, 28, 31, 32, 52, 85, 158, 165, 172, 181, 217, 223]
plt.plot(player_r[sixman, 0], player_r[sixman, 1],
         color="blue", marker="*", markersize=15, linestyle="None")

# Save the figure as an image.
plt.savefig("clustering_output.png")

In [6]:
# Attach the cluster label to every player row, then restore the player ID.
playerdf["clusterid"] = km
playerdf["選手ID"] = df_st["選手ID"]

# Drop the saved row-index column.
playerdf.drop("Unnamed: 0", axis=1, inplace=True)

# Restore the "6man" column as the last column.
playerdf["6man"] = df_st["6man"]

# Write the labelled table to CSV.
result_path = "clustering_result_data.csv"
playerdf.to_csv(result_path)

In [7]:
# Table used for the per-cluster summary: everything except the player ID and
# the "6man" column.
# `drop` returns a new frame. The previous `player_c = playerdf` only created an
# alias, so deleting columns through it also removed them from `playerdf` and
# made this cell fail with KeyError when run a second time.
player_c = playerdf.drop(["選手ID", "6man"], axis=1)

In [8]:
# Profile of each cluster: the column-wise mean of the rows carrying that
# cluster label, one Series per cluster id 0-4.
cluster_profiles = [
    player_c[player_c['clusterid'] == cluster_id].mean()
    for cluster_id in range(5)
]
c0, c1, c2, c3, c4 = cluster_profiles

# Write each profile to its own CSV file.
profile_files = [
    "cluster0.csv",
    "cluster1.csv",
    "cluster2.csv",
    "cluster3.csv",
    "cluster4.csv",
]
for profile, profile_file in zip(cluster_profiles, profile_files):
    profile.to_csv(profile_file)