In [10]:
import os
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import decomposition, cluster

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import polyglot
import math
from polyglot.text import Text, Word

In [14]:

# A notebook kernel has no real ``__file__``; the current working directory is
# used in its place so the rest of the cell can be written like a script.
__file__ = os.getcwd()

# The data lives next to the working directory: <parent of cwd>/scripts/data/.
dirname = os.path.dirname(__file__)

# Path components are passed separately so the platform's own separator is
# used (a hard-coded 'scripts\\data\\...' only resolves on Windows).
input_file = os.path.join(dirname, 'scripts', 'data', 'dogodki50_strippedOnlySlov.csv')

# First CSV row holds the column names.
df = pd.read_csv(input_file, header=0)
original_headers = list(df.columns.values)

# Event descriptions ('opis') coerced to unicode strings for the vectorizer.
# NOTE(review): a missing description presumably becomes the literal text
# 'nan' after this cast -- confirm whether such rows should be dropped or
# blanked before vectorising.
data_opis_normalized = df['opis'].astype('U')



def get_top_keywords(data, clusters, labels, n_terms):
    """Print the highest-weighted terms of every cluster.

    The rows of ``data`` are grouped by their cluster label and averaged
    column-wise; for each cluster the ``n_terms`` columns with the largest
    mean are printed, in ascending order of mean weight.

    Parameters
    ----------
    data : array-like accepted by ``pd.DataFrame``
        One row per document, one column per term.
    clusters : array-like
        Cluster label of each row of ``data``.
    labels : sequence
        Term name for each column position of ``data``.
    n_terms : int
        Number of terms to print per cluster. Zero or a negative value
        prints no terms.

    Returns
    -------
    None
        The result is written to standard output, one line per cluster.
    """
    # A plain list of labels can be mistaken by ``groupby`` for a list of
    # column keys when its values match column names; an array is always
    # treated as one label per row.
    if isinstance(clusters, (list, tuple)):
        clusters = np.asarray(clusters)

    cluster_means = pd.DataFrame(data).groupby(clusters).mean()

    for cluster_id, mean_weights in cluster_means.iterrows():
        if n_terms > 0:
            top_positions = np.argsort(mean_weights.to_numpy())[-n_terms:]
        else:
            # ``[-0:]`` would select every term instead of none.
            top_positions = []
        print('Cluster {}: '.format(cluster_id) + ', '.join([labels[t] for t in top_positions]))

In [15]:


# --- TF-IDF representation of the event descriptions -----------------------
no_features = 2000
# Terms occurring in more than 95% of the documents or in fewer than two
# documents are discarded; at most `no_features` terms are kept.
# (stop_words='english' is intentionally not passed.)
vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95, min_df=2, max_features=no_features)
X_idf = vectorizer.fit_transform(data_opis_normalized)

# Alternative dimensionality reduction, currently disabled:
# agglo = cluster.FeatureAgglomeration(n_clusters=100)
# agglo.fit(X_idf.toarray())
# X_reduced = agglo.transform(X_idf.toarray())
# X_reduced = X_idf.toarray()

# --- Projection to two dimensions -------------------------------------------
# PCA is fitted on a dense copy of the TF-IDF matrix. `toarray()` yields a
# plain ndarray; `todense()` yields an `np.matrix`, which current scikit-learn
# estimators refuse. The seed keeps the projection reproducible when a
# randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=1)
X = pca.fit_transform(X_idf.toarray())

# --- K-means on the 2-D projection ------------------------------------------
n_clusters = 7

# A single k-means++ initialisation with a fixed seed (random_state=1), so
# the cluster numbering stays stable between runs.
clusterer = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1, random_state=1)
cluster_labels = clusterer.fit_predict(X)
centers = clusterer.cluster_centers_

def distance(p0, p1):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    p0, p1 : sequence of numbers
        Coordinates of the two points. Any number of dimensions is
        accepted, so the helper keeps working if the projection is changed
        from two components to more. Coordinates are paired positionally;
        if the sequences differ in length the surplus coordinates of the
        longer one are ignored.

    Returns
    -------
    float
        Square root of the sum of squared coordinate differences.
    """
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(p0, p1)))

# Attach the assigned cluster label to every event.
df['cluster'] = cluster_labels

# For every event, measure the distance between its projected coordinates
# and the centre of the cluster it was assigned to.
centerDistances = []
for i, event_coords in enumerate(X):
    assigned_center = centers[cluster_labels[i]]
    centerDistances.append(distance(assigned_center, event_coords))

df['center_distance'] = centerDistances

In [16]:
# For each cluster, show the first 20 of its events after ordering them by
# distance to the cluster centre (closest first).
for i in range(n_clusters):
    print('Cluster ', i, ':', sep='')
    loc_clusterDF = df[df['cluster'].eq(i)]
    loc_clusterDF_sorted = loc_clusterDF.sort_values(by=['cluster', 'center_distance'])
    display(loc_clusterDF_sorted.head(20))

Cluster 0:


Unnamed: 0,naziv,opis,datum_od,stevilo,sifra_prizorisca,cluster,center_distance
1800,Oddrsajmo v 2020,lahek reklo drsališče mesten trg slovenski kon...,2019-12-05 18:00:00,107,709926300000000.0,0,0.001788
180,"Kraljestvo Nabatejcev: Petra, Wadi Rum, Madain...",vabiti potopisen degustacija potovanje jordani...,2020-03-03 19:00:00,121,424706200000000.0,0,0.002393
5217,ROGLA Freestyle Week 2020,pripravljen roči freesyti week glede zima stra...,2020-03-19 09:00:00,611,162885200000000.0,0,0.002828
5760,PUSTOLANDIJA ★ Otroški pustni festival ★ Gospo...,pustolandija otroški pusten festival gospodars...,2020-02-22 09:00:00,1309,214671800000000.0,0,0.002973
1282,Mušje dirke v Levpi 2019,šktd levp vaba mušji dirka levpa nedelja avgus...,2019-08-04 16:00:00,210,1101644000000000.0,0,0.003169
3506,"Martinovanje v Ormožu 2019 - Sobota, 9.11",martinovanje ormož oktober končevati trgatev d...,2019-11-09 08:00:00,132,240564700000000.0,0,0.00317
1718,Astronomsko opazovanje neba v Mirnu,lani letos organizirati javen opazovanje nebo ...,2019-08-12 21:00:00,151,1388677000000000.0,0,0.003205
65,Lajbah praznuje,sobota februar potekel natanko leto odpreti vr...,2020-02-15 16:00:00,210,904388600000000.0,0,0.003778
295,4. RUNDA NLP Lige 2.0 - [Bolder SCENA],preostal sklepen dejanje letošnji nlp liga run...,2020-03-07 10:00:00,118,227403600000000.0,0,0.004016
4864,Preventivni dogodek Reši življenje,ozaveščanje združiti zavod reševalen pas agenc...,2019-08-10 08:00:00,115,265409200000000.0,0,0.004048


Cluster 1:


Unnamed: 0,naziv,opis,datum_od,stevilo,sifra_prizorisca,cluster,center_distance
3587,Čubed pod borovci - akustični večer,vabiti četrtek avgust borovec vas kubed kjer v...,2019-08-08 21:00:00,143,112043700000000.0,1,0.003692
2318,LUKA BASI,velik koncert dan žena krilo ljubezen možnost ...,2020-03-04 20:00:00,1359,112317000000000.0,1,0.003815
5867,Pearl Jam Tribute – Jeremy's Alive @Bunker,bunker bunker vračati peareti jama tribut jere...,2020-04-04 21:30:00,201,526641000000000.0,1,0.004314
1746,Glasbeni abonma HkC | Leon Firšt: Moj glasbeni...,četrtek naroden dom celj glasben abon hkca leo...,2019-09-19 19:30:00,168,193658600000000.0,1,0.004829
4728,Poletni glasbeni večeri v Hangarju: Jakob Kobal,jakobov glasba lebdeti umirjen sproščen zvok d...,2019-07-23 20:00:00,247,959448300000000.0,1,0.006165
7576,New Swing Quartet & Komorni orkester NOVA / 1....,new swing quarteti komoren orkester nov novole...,2020-01-01 20:00:00,106,507916000000000.0,1,0.007979
6017,"123 Raps VOL 2. (Mrigz'n'Ghet, Fankadelik, Mir...",glasben društvo raps vol ponoven poletje poslo...,2019-09-14 18:00:00,378,312072100000000.0,1,0.008002
3632,KOALA VOICE izštekani,lani kavka maj zažgati kakopaka zato letos zak...,2019-09-05 20:30:00,325,92869130000.0,1,0.008125
1636,"Rap Domačica: Puna kuča w / Emkej, Nite, Tropski",rapa domačica emkej urban večer hip hop scena ...,2019-10-11 23:00:00,436,640113400000000.0,1,0.008764
5541,Hindi Zahra // 26.9 // Musicology Barcaffè Ses...,slovenski zgodba serija koncert musicology bar...,2019-09-26 21:00:00,2158,43956320000.0,1,0.009604


Cluster 2:


Unnamed: 0,naziv,opis,datum_od,stevilo,sifra_prizorisca,cluster,center_distance
3184,Družbena odgovornost in oglaševanje,družben odgovornost razumeti primer oglaševanj...,2019-11-08 12:00:00,109,21735920000.0,2,0.002002
2235,JAVNI POZIV k prijavi razstavnih umetniških pr...,javen poziv layerjev hiša prijava razstaven pr...,2020-02-02 06:00:00,163,118918900000000.0,2,0.002746
1314,"Tančice intime 2019 | Dogodek, ki neguje odnose",tančica intima dogodek negovati partner znajti...,2019-10-19 18:00:00,257,130655400000000.0,2,0.003975
5380,Predavanje: Glive v dinarskih gozdovih - kakše...,fotografija koralast bradovec hericium corallo...,2019-10-30 18:00:00,178,764575700000000.0,2,0.004191
5910,Sašo Slaček: Temeljni pojmi socializma,predavanje koordinator programski skupina levi...,2019-10-09 18:00:00,134,703686200000000.0,2,0.004207
5475,Marko Kržan: Razredni boj,marka kržati razreden boj četrtek november pre...,2019-11-21 18:00:00,133,732009400000000.0,2,0.004706
4859,Prepih / Razstava Maja Hodošček,vabiti pogovor odprtje razstava maj hodošček j...,2019-06-27 19:00:00,107,599177800000000.0,2,0.004729
7330,Otrok v stiski in vloga pedagoških delavk_cev,vabiti gostujoč predavanje okvir projekt reagi...,2020-02-19 11:00:00,118,2527044000000000.0,2,0.005464
2547,"Težave pri učenju matematike-razumevanje, podp...",matematik predmet imeti družben kulturen prost...,2019-11-11 18:00:00,171,967780900000000.0,2,0.005792
5338,Dobra komunikacija - ključ do uspeha,začeti misel najprej pojav misel sled delovanj...,2019-09-12 18:00:00,252,754331600000000.0,2,0.00585


Cluster 3:


Unnamed: 0,naziv,opis,datum_od,stevilo,sifra_prizorisca,cluster,center_distance
2035,Kliše Halloween party - Tim Urbanya / Timo G /...,dober zabava halloween vrhunski elektronski gl...,2019-10-26 22:00:00,296,376823500000000.0,3,0.00227
2411,Best of Balkan@Top Six Club (petek 11.10),spomniti hit zid aspirin cren zlaten vreten pe...,2019-10-11 23:00:00,602,331835800000000.0,3,0.00401
1907,NEDA Ukraden + Mladi Gamsi // FAŠENK Lancova v...,fašenka lancov velik otvoritev petek nastopiti...,2020-02-14 20:00:00,805,171440700000000.0,3,0.005209
777,Rnb Mansion w. DJ Dey,pridružiti sam vrelišče rnbje kjer najti beati...,2019-11-09 23:00:00,176,482638700000000.0,3,0.006823
5182,Ritem Balkana I Sobota 7.December I Čili Club,znan besedilo plesen ritem kratek kriti vedno ...,2019-12-07 21:00:00,207,1461682000000000.0,3,0.007473
5159,SHAKE DAT with O3's,izdaja zdaj lep utečen cirkuški nanizanka ljub...,2020-01-18 23:00:00,532,237302900000000.0,3,0.00762
5471,Poizpitna HITčina 19/02,končevati naporen izpiten obdobje zato privošč...,2020-02-19 23:00:00,1289,237302900000000.0,3,0.008179
6959,YUGO Pop Rock w. DJ TINNY,yuga pop rocka sobota januar sobota januar nap...,2020-01-18 22:00:00,103,1253348000000000.0,3,0.0082
5723,El FUEGO,program latina flavored pop danec reggaeton la...,2019-09-28 23:00:00,395,237302900000000.0,3,0.008544
322,"Spring Fest - Luka Basi, Vigor, Nika Zorjan, P...",zabava noč ogrevan šotor hostes vip prostor od...,2020-04-24 20:00:00,1305,106178600000000.0,3,0.008968


Cluster 4:


Unnamed: 0,naziv,opis,datum_od,stevilo,sifra_prizorisca,cluster,center_distance
3387,Novo leto na Trgu francoske revolucije,december trg francoski revolucija večer barvat...,2019-12-30 19:00:00,801,418434800000000.0,4,0.001883
5744,Zimska pravljica v Višnji Gori,zimski pravljica višnji gora dodaten koncert v...,2019-12-22 20:00:00,261,115058700000000.0,4,0.002452
1190,Zeleno sonce 124: Croatian Beat,soulbrothra slipmati hrvatski funka delegacija...,2019-10-11 23:00:00,165,356008700000000.0,4,0.002586
6497,Christmas Salsastica Party,srečen sals party petek pričakovati odprt rok ...,2019-12-20 22:00:00,372,352458100000000.0,4,0.00276
3437,Vinske Brbončice & ROCK'n ROLL,rezervirati datum vinski brbončica tokrat pred...,2020-01-18 18:00:00,227,370775900000000.0,4,0.002964
4664,Aprilski Master Vinyl,aprilski master vinyti pričakovati ambasada ga...,2020-04-04 21:00:00,230,107877200000000.0,4,0.004395
4483,Danse Macabre // MC Pekarna,vesel december vrata mesec brezglav zlagati sr...,2019-12-21 22:00:00,138,615833100000000.0,4,0.004916
3235,Božično novoletni koncert Pihalnega orkestra V...,član pihalen orkestra vrhpolje tradicionalen p...,2019-12-21 19:00:00,103,112296300000000.0,4,0.005113
1254,"Dražen Zečić & Marko Škugor - Novo mesto, 20.1...",letošnji december posebej pravljičen vrač razp...,2019-12-20 19:00:00,2132,489098600000000.0,4,0.005531
6223,10. Salon Traminec,pripravljen vinskokulinaričen spektakel zdravi...,2019-08-26 17:00:00,732,193526600000000.0,4,0.005871


Cluster 5:


Unnamed: 0,naziv,opis,datum_od,stevilo,sifra_prizorisca,cluster,center_distance
1779,Muzikal Nune v akciji! | Studenec,poleten gledališče studenec predstava najti ko...,2019-08-23 20:30:00,225,203651300000000.0,5,0.00208
7172,Večer črnega humorja: Black magic,črn humor prihajati nov mesto skupaj neprimere...,2019-10-25 21:00:00,620,189590300000000.0,5,0.002511
945,Tin Vodopivec - Peti element | Klub Bazen Kranj,uspešen turneja rocka roll tina vodopivec vrač...,2019-12-07 20:00:00,360,100172100000000.0,5,0.002604
5200,Tin Vodopivec - Peti element,uspešen turneja rocka roll tina vodopivec vrač...,2019-12-27 20:00:00,106,534788300000000.0,5,0.002604
6681,Tin Vodopivec - Peti element,uspešen turneja rocka roll tina vodopivec vrač...,2020-02-08 20:00:00,263,155016900000000.0,5,0.002604
6785,Tin Vodopivec - Peti element,uspešen turneja rocka roll tina vodopivec vrač...,2020-02-01 20:00:00,123,111676000000000.0,5,0.002604
5278,Reporter Milan – Swingerji,predstava srečen konec država štajerski republ...,2019-08-16 20:00:00,377,518751000000000.0,5,0.003504
6509,Nika Gorič - sopran,zaslišati trajati vstopnica društvo arsan jadr...,2019-07-27 19:30:00,128,138277800000000.0,5,0.003671
1350,Pank tura z Esadom Babačićem (kolesarska),tokrat leninov park trg revolucija potekati ko...,2019-08-16 17:00:00,192,1377444000000000.0,5,0.004037
3086,"Gregor Strniša, Maruša Kink: ŽABE",premiera predstava ponovitev januar močvirje b...,2019-12-26 18:00:00,343,139278000000000.0,5,0.004982


Cluster 6:


Unnamed: 0,naziv,opis,datum_od,stevilo,sifra_prizorisca,cluster,center_distance
6679,Zmelkoow v Podzemlju Pece,vabiti tradicionalen koncert podzemlje pec org...,2019-11-30 17:00:00,202,672845500000000.0,6,0.003271
5272,Nočna smuka z apres ski zabavo za dame,nočen smukati smučišče košut jasa uniorček zab...,2020-03-07 17:00:00,179,171046800000.0,6,0.004658
6081,Spoznavni ŽUR Medicinske fakultete / 10.10. / ...,končen misliti nov študijski leto leto težek p...,2019-10-10 23:00:00,775,237302900000000.0,6,0.004899
7201,Veselica Bašelj - Zvita Feltna,turističen društvo bašelj predstavljati veseli...,2019-08-17 19:00:00,962,2864456000000000.0,6,0.006026
5410,Šagra Guoštovca 2019 & Šagra v Maškarah,petek malonogometen turnir otvoritev šagor obl...,2019-08-02 18:00:00,472,1602198000000000.0,6,0.006273
1956,SG FEST,esga festi vabiti tradicionalen glasben prired...,2019-09-20 20:00:00,1152,2561733000000000.0,6,0.006572
1957,SG FEST,esga festi vabiti tradicionalen glasben prired...,2019-09-20 20:00:00,929,2561733000000000.0,6,0.006572
3166,SHOD KAL // 2019,shod kal petek zvit felten zvit felten sobota ...,2019-08-23 21:00:00,1050,2932229000000000.0,6,0.006658
5488,JAN Plestenjak,žura zamuditi rezervirati sobota videti vrsta,2019-11-16 20:00:00,1009,2055271000000000.0,6,0.007073
4498,Z ansamblom Biseri v veseli december!,leto zavraški gasilec pripravljati pričakati v...,2019-11-23 20:00:00,512,110690400000000.0,6,0.007121
