# kmeans.ipynb
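Performs k-means clustering of multidimensional vectors. The number of clusters is increased step by step 
(starting at 2, in steps of `clusterStep`, bounded by `clustersNumber`), and a separate clustering is performed 
for each number of clusters. The output is the input table with one additional column per number of clusters, 
holding the cluster label assigned to each row.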

In [1]:
# Parameters

# Alternative data set: uncomment this pair (and comment out the pair below)
# to process the "el" files instead.
#inputFile = 'el-diff-dbscan.tsv'
#outputFile = 'el-diff-clusters.tsv'

# Tab-separated table that is read as input.
inputFile = "ki-diff-dbscan.tsv"
# Tab-separated table that receives the input plus the cluster-label columns.
outputFile = "ki-diff-clusters.tsv"

# Upper bound on the number of clusters that is tried.
clustersNumber = 20
# Increment between two successive numbers of clusters.
clusterStep = 1

In [2]:
import pandas as pd
from sklearn.cluster import KMeans

In [3]:
# Load the tab-separated input table named by `inputFile` (read_table uses
# a tab as its default separator); the bare `data` on the last line renders
# the loaded table as the cell output.
data = pd.read_table(inputFile)
data

Unnamed: 0,pvform,lemma,pvv,mood,cau,pot,PV,CP_cnd,CP_imp,CP_ind,...,ÉRT,KÉNT,KOR,SZOR,NKÉNT,ADP,ADV,FROM,IN,TO
0,ki,ad,ki+ad,ind,-,+,-0.646132,-0.011485,0.002283,-0.015521,...,-0.002910,-0.006208,0.001593,0.000000,-0.001194,-0.046572,-0.048491,0.008707,-0.109183,0.195846
1,ki,ad,ki+ad,ind,-,-,-0.682139,0.003061,0.003940,-0.031520,...,0.010284,-0.004983,0.001106,0.000293,0.001097,-0.060067,-0.024275,0.007954,-0.154583,0.082377
2,ki,alakít,ki+alakít,ind,-,-,-0.805568,0.000722,0.001324,0.069808,...,-0.000077,-0.003117,-0.001809,-0.000145,-0.000649,-0.083350,-0.016421,-0.052144,-0.176042,-0.033550
3,ki,alakul,ki+alakul,ind,-,+,-0.529124,-0.006371,0.001324,-0.009999,...,-0.000725,0.002790,0.000717,0.000000,0.004789,0.264358,0.047572,-0.034485,0.115629,-0.007977
4,ki,alakul,ki+alakul,ind,-,-,-0.815422,-0.001371,-0.001357,-0.052024,...,-0.001153,-0.005363,-0.001899,0.000000,0.001351,0.247248,0.177070,-0.044549,0.046432,-0.019576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,ki,vált,ki+vált,ind,-,-,-0.853783,-0.004225,-0.000937,0.028687,...,0.002346,0.004731,-0.000140,0.000209,0.015431,-0.008965,0.080577,-0.033868,0.088586,0.178923
74,ki,vár,ki+vár,ind,-,-,-0.124369,0.003087,0.000477,-0.133268,...,0.000557,0.004865,0.001948,0.000000,0.001556,0.007535,-0.045685,0.122698,0.126199,0.164437
75,ki,vet,ki+vet,ind,-,-,-0.704976,-0.003374,-0.002327,-0.020252,...,-0.002110,-0.004526,-0.000046,0.000000,-0.008850,-0.004577,0.005294,-0.039473,-0.069410,0.093608
76,ki,visz,ki+visz,ind,-,-,-0.429151,-0.000944,0.000057,-0.035104,...,0.001402,-0.003614,-0.004179,0.000000,-0.001563,0.028151,0.038024,-0.174115,-0.004423,0.009901


In [4]:
# Getting the difference vectors: everything from column position 7 onwards
# is used as the feature matrix. For the input displayed above this skips the
# seven leading columns pvform, lemma, pvv, mood, cau, pot and PV.
# NOTE(review): the offset is positional -- confirm it for other input files.
# NOTE: the clustering cell below adds label columns to `data`; re-running
# this cell afterwards would pull those labels into `x`, so reload `data` first.
first_feature_column = 7
x = data.iloc[:, first_feature_column:]
x

Unnamed: 0,CP_cnd,CP_imp,CP_ind,HKM,inf,nom,acc,dat,BAN,ON,...,ÉRT,KÉNT,KOR,SZOR,NKÉNT,ADP,ADV,FROM,IN,TO
0,-0.011485,0.002283,-0.015521,0.089213,0.000696,0.063584,0.083833,0.119385,-0.049676,-0.048551,...,-0.002910,-0.006208,0.001593,0.000000,-0.001194,-0.046572,-0.048491,0.008707,-0.109183,0.195846
1,0.003061,0.003940,-0.031520,0.073813,0.012231,-0.022146,0.097859,0.126506,-0.089909,-0.073613,...,0.010284,-0.004983,0.001106,0.000293,0.001097,-0.060067,-0.024275,0.007954,-0.154583,0.082377
2,0.000722,0.001324,0.069808,-0.036497,0.001268,0.309068,-0.072415,-0.012997,-0.030790,-0.102532,...,-0.000077,-0.003117,-0.001809,-0.000145,-0.000649,-0.083350,-0.016421,-0.052144,-0.176042,-0.033550
3,-0.006371,0.001324,-0.009999,-0.000635,-0.001684,-0.082512,-0.000106,-0.000125,0.016273,-0.071056,...,-0.000725,0.002790,0.000717,0.000000,0.004789,0.264358,0.047572,-0.034485,0.115629,-0.007977
4,-0.001371,-0.001357,-0.052024,-0.045779,-0.000886,-0.052194,-0.002312,-0.007125,-0.072296,-0.069649,...,-0.001153,-0.005363,-0.001899,0.000000,0.001351,0.247248,0.177070,-0.044549,0.046432,-0.019576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,-0.004225,-0.000937,0.028687,-0.090633,0.000235,0.022743,-0.329410,-0.004875,-0.068547,0.160575,...,0.002346,0.004731,-0.000140,0.000209,0.015431,-0.008965,0.080577,-0.033868,0.088586,0.178923
74,0.003087,0.000477,-0.133268,0.044053,0.002078,0.062775,0.265578,-0.000542,0.055074,0.071990,...,0.000557,0.004865,0.001948,0.000000,0.001556,0.007535,-0.045685,0.122698,0.126199,0.164437
75,-0.003374,-0.002327,-0.020252,0.101386,-0.000191,-0.051201,-0.058149,0.210011,-0.030242,-0.032726,...,-0.002110,-0.004526,-0.000046,0.000000,-0.008850,-0.004577,0.005294,-0.039473,-0.069410,0.093608
76,-0.000944,0.000057,-0.035104,-0.003547,-0.006377,0.104916,0.064053,0.000636,-0.013936,-0.000398,...,0.001402,-0.003614,-0.004179,0.000000,-0.001563,0.028151,0.038024,-0.174115,-0.004423,0.009901


In [5]:
# Run k-means once for every number of clusters from 2 up to AND INCLUDING
# clustersNumber (the configured maximum), advancing by clusterStep, and store
# the labels of each run in a column of `data` named after the cluster count.
# The "+ 1" is required because range() excludes its stop value.
# NOTE: this adds columns to `data`; re-run the feature-selection cell only on
# a freshly loaded `data`, otherwise the label columns end up in `x`.
for knum in range(2, clustersNumber + 1, clusterStep):
    # random_state=0 makes the centroid initialisation deterministic.
    # NOTE(review): n_init is left at the library default, which differs
    # between scikit-learn versions -- pin it if results must match across
    # environments.
    kmeans = KMeans(n_clusters=knum, random_state=0)

    # fit_predict() fits the model and returns the label of every row in one
    # pass, so a separate fit() call beforehand would only repeat the work.
    data[str(knum)] = kmeans.fit_predict(x)

# Write the input table plus the cluster-label columns as TSV, without the
# DataFrame index.
data.to_csv(outputFile, sep="\t", index=False)