# K-means 法の実験

In [None]:
import numpy as np
import numpy.matlib
import matplotlib.pyplot as plt
from matplotlib import font_manager
from sklearn.cluster import KMeans
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
# %matplotlib　notebook
%matplotlib　inline

In [None]:
# Generate dummy student scores. Columns: attendance, assignment and exam points.
np.random.seed(1)  # fixed seed so the dummy data is reproducible
# Each group is drawn from a 3-D normal distribution (mean, covariance, count)
# and rounded to one decimal place.
# NOTE(review): the covariance of the first group was written as
# [[3,0.5,1],[0.5,3,0.5],[0.5,0.5,3]], which is not symmetric (entry [0][2] is 1
# while [2][0] is 0.5). A covariance matrix must be symmetric, so it is
# symmetrised to 0.5 here -- confirm that 0.5 (and not 1) was the intended value.
data1 = np.round(np.random.multivariate_normal([35,28,35],[[3,0.5,0.5],[0.5,3,0.5],[0.5,0.5,3]],10),1)  # good at everything
data2 = np.round(np.random.multivariate_normal([30,26,25],[[3,0.5,0],[0.5,5,0],[0,0,4]], 20),1)  # good attendance and assignments, mediocre exam
data3 = np.round(np.random.multivariate_normal([25,20,20],[[5,0,0],[0,5,0],[0,0,4]], 20),1)  # average
data4 = np.round(np.random.multivariate_normal([35,8,15],[[3,0,0],[0,4,0],[0,0,5]], 10),1)  # attendance only
data5 = np.round(np.random.multivariate_normal([8,8,8],[[2,1,1],[1,3,1],[1,1,2]], 10),1)  # poor at everything
# Stack the groups row-wise and build the matching rank labels (A..E).
data = np.r_[data1,data2,data3,data4,data5]
rank = np.r_[np.array(['A']*10),np.array(['B']*20),np.array(['C']*20),np.array(['D']*10),np.array(['E']*10)]
df = pd.DataFrame(data,columns=['Shusseki','Kadai','Siken'])
df['Rank']=rank
# Shuffle the rows so the groups are no longer in generation order, then renumber them.
df= df.reindex(np.random.permutation(df.index)).reset_index(drop=True)
df.head(10)  # show only the first 10 rows

# ２次元散布図のプロット

In [None]:
# Create a new figure holding a 2x2 grid of panels.
fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(2,2,1)
ax2 = fig.add_subplot(2,2,2)
ax3 = fig.add_subplot(2,2,3)
ax4 = fig.add_subplot(2,2,4)
# Top-left: histogram of the exam scores.
ax1.hist(df['Siken'],bins=20,color='k',alpha=0.3)
# Remaining panels: pairwise scatter plots of the three score columns,
# each labelled with the column names it shows.
for panel, (xcol, ycol) in zip(
    (ax2, ax3, ax4),
    (('Shusseki', 'Siken'), ('Shusseki', 'Kadai'), ('Kadai', 'Siken')),
):
    panel.scatter(df[xcol], df[ycol])
    panel.set_xlabel(xcol)
    panel.set_ylabel(ycol)
# Save before showing: after plt.show() the figure may already be released,
# in which case a later plt.savefig() writes an empty image.
fig.savefig("image.png")
plt.show()

In [None]:
# Column values as plain NumPy arrays (also used by later cells).
dx,dy,dz,rk = np.array(df['Shusseki']),np.array(df['Kadai']), np.array(df['Siken']),np.array(df['Rank'])
fig = plt.figure()
# Create the 3-D axes through the figure. Constructing Axes3D(fig) directly
# stopped attaching the axes to the figure in recent Matplotlib releases
# (deprecated since 3.4), which leaves the plot empty.
ax = fig.add_subplot(projection='3d')
ax.set_xlabel("Shusseki")
ax.set_ylabel("Kadai")
ax.set_zlabel("Shiken")
# Draw each preset rank in its own colour.
for rank_label, rank_color in (
    ('A', "green"),
    ('B', "blue"),
    ('C', "orange"),
    ('D', "purple"),
    ('E', "red"),
):
    in_rank = rk == rank_label
    ax.scatter3D(dx[in_rank], dy[in_rank], dz[in_rank], c=rank_color)
plt.show()

# 数値データ部分だけを抽出

In [None]:
# Keep only the three numeric score columns (everything except 'Rank').
# Selecting the columns directly keeps their float dtype; transposing the
# mixed-type frame twice would turn every column into object dtype.
data = df[['Shusseki', 'Kadai', 'Siken']]
data.head()

# K-means　法によるクラスタリング

In [None]:
def d(n: int, itr: int) -> KMeans:
    """Run K-means on the module-level ``data`` and show the clusters in 3-D.

    The fit uses random initialisation with a single run and a fixed
    ``random_state``, so calling this with increasing ``itr`` shows the same
    run at successive iteration limits.

    Side effects: prints the cluster centres (one row per coordinate, rounded
    to two decimals), displays a figure, and stores the fit in the global
    ``result`` that later cells read.

    Args:
        n: Number of clusters (passed as ``n_clusters``).
        itr: Upper bound on the number of iterations (passed as ``max_iter``).

    Returns:
        Whatever ``KMeans.fit`` returns for this run (also stored in ``result``).
    """
    global result  # later cells read the most recent fit through this name
    km = KMeans(n_clusters=n, init='random', n_init=1, max_iter=itr, random_state=1)
    result = km.fit(data)

    fig = plt.figure()
    # Create the 3-D axes through the figure. Constructing Axes3D(fig) directly
    # stopped attaching the axes to the figure in recent Matplotlib releases
    # (deprecated since 3.4), which leaves the plot empty.
    ax = fig.add_subplot(projection='3d')
    ax.set_xlabel("Shusseki")
    ax.set_ylabel("Kadai")
    ax.set_zlabel("Shiken")

    centers = km.cluster_centers_
    print(np.round(centers[:, :3].T, 2))
    ax.scatter3D(centers[:, 0], centers[:, 1], centers[:, 2], marker='X', c='black')

    # Plot exactly the points that were clustered, so the picture cannot get
    # out of sync with separately kept coordinate arrays.
    points = np.asarray(data, dtype=float)
    labels = result.labels_
    palette = ("green", "blue", "orange", "purple", "yellow", "red")
    for cluster in range(n):
        members = labels == cluster
        # Clusters beyond the fixed palette fall back to Matplotlib's default
        # colour instead of being left out of the plot.
        color = palette[cluster] if cluster < len(palette) else None
        ax.scatter3D(points[members, 0], points[members, 1], points[members, 2], c=color)
    plt.show()
    return result

## 2ランクに分けるなら

In [None]:
# Two clusters, at most 20 K-means iterations.
d(n=2, itr=20)

## 3クラスタに分類

In [None]:
# Three clusters, at most 100 K-means iterations.
d(n=3, itr=100)

## ４クラスタに分類

In [None]:
# Four clusters, at most 100 K-means iterations.
d(n=4, itr=100)

## ５クラスタに分類

In [None]:
# Five clusters, at most 100 K-means iterations.
d(n=5, itr=100)

# クラスタリング過程

In [None]:
# Re-run the 5-cluster fit with the iteration limit raised from 1 to 20,
# plotting the clusters obtained at each limit.
for iteration_limit in range(1, 21):
    d(n=5, itr=iteration_limit)

In [None]:
# Five clusters with up to 100 iterations; this call also refreshes the
# global `result` that d() stores on every run.
d(n=5, itr=100)

# 設定したランクと生成されたクラスタの比較

In [None]:
# New frame with the cluster index of the latest fit next to the preset 'Rank';
# `df` itself is left unchanged.
df2 = df.assign(AutoLabel=result.labels_)
df2