# DBScan.ipynb
Uses DBSCAN clustering to separate the outliers, which are treated as idiomatic constructions, from the clustered (compositional) constructions.

In [1]:
#parameters
# Verbal modifier that selects the dataset. Every file name below is derived
# from it, so switching to the other dataset ('el') is a single-line change
# and the three names can never get out of sync.
verbalModifier = 'ki'
inputFile = f'{verbalModifier}-diff.tsv'           # input file name (.tsv)
outputFile = f'{verbalModifier}-diff-dbscan.tsv'   # output: compositional constructions
outputFile2 = f'{verbalModifier}-diff-idioms.tsv'  # output: idiomatic constructions

In [2]:
import os

import pandas as pd
from sklearn.cluster import DBSCAN

In [3]:
# Load the tab-separated input table named by `inputFile`
# (read_table uses '\t' as its default separator).
data = pd.read_table(inputFile)
data

Unnamed: 0,pvform,lemma,pvv,mood,cau,pot,PV,CP_cnd,CP_imp,CP_ind,...,ÉRT,KÉNT,KOR,SZOR,NKÉNT,ADP,ADV,FROM,IN,TO
0,ki,ad,ki+ad,ind,-,+,-0.646132,-0.011485,0.002283,-0.015521,...,-0.002910,-0.006208,0.001593,0.000000,-0.001194,-0.046572,-0.048491,0.008707,-0.109183,0.195846
1,ki,ad,ki+ad,ind,-,-,-0.682139,0.003061,0.003940,-0.031520,...,0.010284,-0.004983,0.001106,0.000293,0.001097,-0.060067,-0.024275,0.007954,-0.154583,0.082377
2,ki,alakít,ki+alakít,ind,-,-,-0.805568,0.000722,0.001324,0.069808,...,-0.000077,-0.003117,-0.001809,-0.000145,-0.000649,-0.083350,-0.016421,-0.052144,-0.176042,-0.033550
3,ki,alakul,ki+alakul,ind,-,+,-0.529124,-0.006371,0.001324,-0.009999,...,-0.000725,0.002790,0.000717,0.000000,0.004789,0.264358,0.047572,-0.034485,0.115629,-0.007977
4,ki,alakul,ki+alakul,ind,-,-,-0.815422,-0.001371,-0.001357,-0.052024,...,-0.001153,-0.005363,-0.001899,0.000000,0.001351,0.247248,0.177070,-0.044549,0.046432,-0.019576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,ki,veszik,ki+veszik,ind,-,-,-0.168734,-0.012407,-0.008685,0.073525,...,-0.002481,-0.003722,-0.002481,0.000000,0.039756,0.191391,-0.107671,-0.420542,0.083396,0.401284
92,ki,vet,ki+vet,ind,-,-,-0.704976,-0.003374,-0.002327,-0.020252,...,-0.002110,-0.004526,-0.000046,0.000000,-0.008850,-0.004577,0.005294,-0.039473,-0.069410,0.093608
93,ki,visz,ki+visz,ind,-,-,-0.429151,-0.000944,0.000057,-0.035104,...,0.001402,-0.003614,-0.004179,0.000000,-0.001563,0.028151,0.038024,-0.174115,-0.004423,0.009901
94,ki,von,ki+von,ind,-,-,-0.490433,-0.003047,0.000509,0.014885,...,0.012652,-0.003047,-0.000789,0.000000,0.000926,0.093215,-0.051066,-0.783797,0.043289,0.378963


In [4]:
# Getting the difference vectors: the first six columns
# (pvform, lemma, pvv, mood, cau, pot) describe the construction, and the
# numeric vector components start at column position 6 ("PV").
firstFeatureCol = 6
x = data.iloc[:, firstFeatureCol:]
x

Unnamed: 0,PV,CP_cnd,CP_imp,CP_ind,HKM,inf,nom,acc,dat,BAN,...,ÉRT,KÉNT,KOR,SZOR,NKÉNT,ADP,ADV,FROM,IN,TO
0,-0.646132,-0.011485,0.002283,-0.015521,0.089213,0.000696,0.063584,0.083833,0.119385,-0.049676,...,-0.002910,-0.006208,0.001593,0.000000,-0.001194,-0.046572,-0.048491,0.008707,-0.109183,0.195846
1,-0.682139,0.003061,0.003940,-0.031520,0.073813,0.012231,-0.022146,0.097859,0.126506,-0.089909,...,0.010284,-0.004983,0.001106,0.000293,0.001097,-0.060067,-0.024275,0.007954,-0.154583,0.082377
2,-0.805568,0.000722,0.001324,0.069808,-0.036497,0.001268,0.309068,-0.072415,-0.012997,-0.030790,...,-0.000077,-0.003117,-0.001809,-0.000145,-0.000649,-0.083350,-0.016421,-0.052144,-0.176042,-0.033550
3,-0.529124,-0.006371,0.001324,-0.009999,-0.000635,-0.001684,-0.082512,-0.000106,-0.000125,0.016273,...,-0.000725,0.002790,0.000717,0.000000,0.004789,0.264358,0.047572,-0.034485,0.115629,-0.007977
4,-0.815422,-0.001371,-0.001357,-0.052024,-0.045779,-0.000886,-0.052194,-0.002312,-0.007125,-0.072296,...,-0.001153,-0.005363,-0.001899,0.000000,0.001351,0.247248,0.177070,-0.044549,0.046432,-0.019576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,-0.168734,-0.012407,-0.008685,0.073525,-0.031017,-0.004963,-0.103733,0.464937,0.034793,0.023681,...,-0.002481,-0.003722,-0.002481,0.000000,0.039756,0.191391,-0.107671,-0.420542,0.083396,0.401284
92,-0.704976,-0.003374,-0.002327,-0.020252,0.101386,-0.000191,-0.051201,-0.058149,0.210011,-0.030242,...,-0.002110,-0.004526,-0.000046,0.000000,-0.008850,-0.004577,0.005294,-0.039473,-0.069410,0.093608
93,-0.429151,-0.000944,0.000057,-0.035104,-0.003547,-0.006377,0.104916,0.064053,0.000636,-0.013936,...,0.001402,-0.003614,-0.004179,0.000000,-0.001563,0.028151,0.038024,-0.174115,-0.004423,0.009901
94,-0.490433,-0.003047,0.000509,0.014885,0.054230,0.001346,0.176474,-0.165672,0.011346,-0.085834,...,0.012652,-0.003047,-0.000789,0.000000,0.000926,0.093215,-0.051066,-0.783797,0.043289,0.378963


In [5]:
# Grid search over the two DBSCAN hyper-parameters; one report line per setting.
for M in (2, 3, 4):  # min_samples: samples needed in a neighbourhood for a core point
    for Rint in range(100, 200):
        R = Rint / 200  # eps: neighbourhood radius, scanned from 0.5 to 0.995 in steps of 0.005
        db = DBSCAN(eps=R, min_samples=M)
        db.fit(x)
        labels = db.labels_

        # The label -1 is what this notebook counts as an outlier.
        is_outlier = labels == -1
        n_outl_ = int(is_outlier.sum())  # number of outliers
        n_clusters_ = len(set(labels)) - int(is_outlier.any())  # clusters, not counting the -1 label

        # List the outlier constructions only when there are fewer than 20 of them;
        # otherwise the line ends right after the tab.
        outlier_names = list(data.loc[is_outlier, "pvv"]) if n_outl_ < 20 else ''
        print(f"R={R}\t M={M} \t clusters:{n_clusters_} \t outliers:{n_outl_}", outlier_names, sep='\t')

R=0.5	 M=2 	 clusters:3 	 outliers:27	
R=0.505	 M=2 	 clusters:3 	 outliers:26	
R=0.51	 M=2 	 clusters:3 	 outliers:26	
R=0.515	 M=2 	 clusters:3 	 outliers:26	
R=0.52	 M=2 	 clusters:3 	 outliers:25	
R=0.525	 M=2 	 clusters:4 	 outliers:21	
R=0.53	 M=2 	 clusters:3 	 outliers:20	
R=0.535	 M=2 	 clusters:3 	 outliers:20	
R=0.54	 M=2 	 clusters:3 	 outliers:18	['ki+bocsát', 'ki+derül', 'ki+egészül', 'ki+fejez', 'ki+indul', 'ki+ír', 'ki+kap', 'ki+köt', 'ki+merül', 'ki+néz', 'ki+nő', 'ki+számít', 'ki+tart', 'ki+terjed', 'ki+tesz', 'ki+tűnik', 'ki+veszik', 'ki+von']
R=0.545	 M=2 	 clusters:3 	 outliers:18	['ki+bocsát', 'ki+derül', 'ki+egészül', 'ki+fejez', 'ki+indul', 'ki+ír', 'ki+kap', 'ki+köt', 'ki+merül', 'ki+néz', 'ki+nő', 'ki+számít', 'ki+tart', 'ki+terjed', 'ki+tesz', 'ki+tűnik', 'ki+veszik', 'ki+von']
R=0.55	 M=2 	 clusters:3 	 outliers:17	['ki+bocsát', 'ki+derül', 'ki+egészül', 'ki+fejez', 'ki+indul', 'ki+ír', 'ki+kap', 'ki+köt', 'ki+merül', 'ki+néz', 'ki+számít', 'ki+tart', 'ki+te

In [6]:
# Settings (min_samples M, eps R) recorded as giving the best outliers-as-idioms
# result for each verbal modifier.
bestParams = {
    'el': (2, 0.55),
    'ki': (2, 0.54),
}
# The dataset is identified by the file-name prefix before the first '-'
# (e.g. 'ki-diff.tsv' -> 'ki'), so switching the input file also switches eps.
# Unrecognised prefixes fall back to the previously hard-coded 'ki' setting,
# which keeps custom file names behaving exactly as before.
modifier = os.path.basename(inputFile).split('-')[0]
M, R = bestParams.get(modifier, bestParams['ki'])
db = DBSCAN(eps=R, min_samples=M)
db.fit(x)
labels = db.labels_

# Rows labelled -1 (outliers) are saved as idiomatic constructions; all other
# rows are saved as compositional constructions. The row index is not written.
data[labels != -1].to_csv(outputFile, sep='\t', index=False)
data[labels == -1].to_csv(outputFile2, sep='\t', index=False)

In [7]:
# Display the rows labelled -1 (the outliers), i.e. the constructions
# treated as idioms.
print("idioms")
data.loc[labels == -1]

idioms


Unnamed: 0,pvform,lemma,pvv,mood,cau,pot,PV,CP_cnd,CP_imp,CP_ind,...,ÉRT,KÉNT,KOR,SZOR,NKÉNT,ADP,ADV,FROM,IN,TO
8,ki,bocsát,ki+bocsát,ind,-,-,-0.87073,0.001191,0.002302676,-0.081851,...,0.000985,-0.001381,-0.002281,0.0,-0.009374,-0.049106,0.017704,-0.020618,-0.143489,0.803199
11,ki,derül,ki+derül,ind,-,-,-0.305946,-0.009006,0.001895196,-0.070393,...,-0.000231,0.001895,-0.000955,-0.000129,0.000856,0.04285,0.107231,-0.268727,0.150619,0.571753
16,ki,egészül,ki+egészül,ind,-,-,-0.435667,-0.006,-0.001666667,0.12,...,-0.001,-0.005667,-0.003333,0.0,-0.002667,0.247,0.099,-0.031333,0.121,-0.043667
23,ki,fejez,ki+fejez,ind,-,+,-0.432273,-0.015312,-0.002355713,-0.179034,...,-0.005889,-0.014134,-0.001178,0.0,-0.001178,0.077503,0.478445,-0.027091,-0.073263,-0.023557
35,ki,indul,ki+indul,ind,-,-,-0.958697,0.000411,-0.0001518487,0.076356,...,0.028449,0.021456,0.023747,0.000158,0.009019,0.113728,0.13542,-0.346848,0.308444,0.158899
36,ki,ír,ki+ír,ind,-,-,-0.71435,0.013204,0.001859194,0.224929,...,-0.003386,0.001666,-0.003019,0.000148,-1.3e-05,-0.075274,-0.079066,0.035607,-0.1001,-0.317718
43,ki,kap,ki+kap,ind,-,-,-0.352919,0.001317,0.004162995,0.04266,...,0.022259,-0.010238,0.003362,0.0,0.004361,-0.000908,0.014521,-0.376457,-0.348851,-0.201697
48,ki,köt,ki+köt,ind,-,-,-0.66199,0.004212,-6.756574e-08,-0.019288,...,-0.001748,-0.007831,-0.006979,0.0,0.000871,-0.047404,-0.17372,0.005469,-0.525061,0.38838
57,ki,merül,ki+merül,ind,-,-,-0.462374,-0.005085,0.0008484093,0.004726,...,0.001859,0.006043,0.005424,0.0,0.000696,0.068333,0.011311,0.007297,-0.23188,0.674036
61,ki,néz,ki+néz,ind,-,-,-0.845098,-0.012997,0.002807477,0.030436,...,0.00014,-0.002201,-0.000467,-1.3e-05,0.000874,0.02208,-0.403235,0.002161,-0.062503,0.112358
