In [30]:
import numpy as np
import pandas as pd

# TEST REGEL

In [31]:
data = 'the cat saw the dog'
max_clusters = 3

## Initialization

In [32]:
class cluster:
    def __init__(self, text):
        self.data = text.split()
        self.vocabulary = self.CreateVocabulary()
        
        
        self.clusterPerWord = self.InitializeClusters()
        self.wordsPerCluster = self.WordsPerCluster()
        
        self.cluster_list = self.ClusterList()
        self.no_cluster = self.cluster_list.size
    
    #The list of all cluster names (keys)
    def ClusterList(self):
        tmp = np.array(list(self.wordsPerCluster.keys()))
        self.no_cluster = tmp.size
        self.cluster_list = tmp
        return tmp

    
    # Vocabulary V
    def CreateVocabulary(self):
        self.vocabulary = np.unique(self.data)
        print(self.vocabulary)
        return self.vocabulary

    # Initiate the clusters (i.e. all words own cluster)
    # The cluster is a dict {word: cluster}
    def InitializeClusters(self):
        V = self.vocabulary.tolist()
        C = {v: V.index(v)+1 for v in V}
        C['START'] = 0
        return C
    
    # returns the size of the cluster to which wi belongs
    def n_wi(self, wi):
        ci = self.clusterPerWord[wi]
        C_values = list(self.clusterPerWord.values())
        return pd.Series(C_values).value_counts()[ci] 

    # no. times wi and wj are next to each other in the text
    def p_ci_cj(self, wi, wj):
        ci = self.clusterPerWord[wi]
        cj = self.clusterPerWord[wj]
        prev_word = 'START'
        count = 0
        for word in self.data:
            if self.clusterPerWord[word] == ci and self.clusterPerWord[prev_word] == cj:
                count += 1
            prev_word = word
        return count


    # measures how well clust distribution C, fits the sentence/text s
    def Quality(self):
        n_ = len(self.data)
        temp = 0
        for i in range(n_):
            wi = self.data[i]# wi: current word
            if i == 0:
                wj = 'START'
            else:
                wj = self.data[i-1] # wj: previous word

            temp_n =  self.p_ci_cj(wi, wj)/n_
            temp_m = (temp_n * n_) / (self.n_wi(wi) * self.n_wi(wj))
            temp += temp_n*np.log(temp_m)

            p_c = self.n_wi(wi)/n_
            p_c_prime = self.n_wi(wj)/n_
            temp_new = temp_n * np.log(temp_n/(p_c*p_c_prime))
        return temp_new
    
    # Calculates which words belong to what clusters. Returns a dict {cluster: words}
    def WordsPerCluster(self):
        C = self.clusterPerWord
        C_new = dict()
        for k,v in C.items():
            if v in C_new:
                C_new[v].append(k)
            else:
                C_new[v] = [k]
        self.wordsPerCluster = C_new
        return C_new
    
    def ClusterPerWord(self):
        C = self.wordsPerCluster
        c_tmp = {}
        for cluster, word in C.items():
            for i in word:
                c_tmp[i] = cluster
        self.clusterPerWord = c_tmp
        
    
    # Change one cluster to the other
    def ChangeCluster(self, cluster_from, cluster_to):
        cluster_from_values = self.wordsPerCluster[cluster_from]
        cluster_to_values = self.wordsPerCluster[cluster_to]
        new_values = np.ravel(cluster_from_values + cluster_to_values).tolist()
        
        self.wordsPerCluster.pop(cluster_from)
        self.wordsPerCluster.pop(cluster_to)
        
        
        self.wordsPerCluster[cluster_to] = new_values
        self.ClusterPerWord()
        self.WordsPerCluster()
        self.ClusterList()
        return
        


c = cluster(data)
print("eerste")
print(c.Quality())
print(c.clusterPerWord)
print(c.wordsPerCluster)
print(c.WordsPerCluster())
print(c.wordsPerCluster)
c.Quality()
c.ClusterPerWord()


['cat' 'dog' 'saw' 'the']
eerste
0.3218875824868201
{'cat': 1, 'dog': 2, 'saw': 3, 'the': 4, 'START': 0}
{1: ['cat'], 2: ['dog'], 3: ['saw'], 4: ['the'], 0: ['START']}
{1: ['cat'], 2: ['dog'], 3: ['saw'], 4: ['the'], 0: ['START']}
{1: ['cat'], 2: ['dog'], 3: ['saw'], 4: ['the'], 0: ['START']}


## Make perfect clusters
Naively, brute force

In [37]:
import copy
import time
data = "the dog saw the cat and the dog saw the man the dog and the cat are owned by the man"

best_cluster = cluster(data)
max_clusters = 3
prev_no_cluster = best_cluster.no_cluster + 1

count_iter = 0
current_clusters = best_cluster.no_cluster

print(prev_no_cluster, current_clusters)
T = time.time()
while current_clusters > max_clusters:
    tmp_cluster = copy.deepcopy(best_cluster)
    current_cluster_list = best_cluster.cluster_list
    iter_best = 0
    print(type(current_cluster_list))
    for from_c in current_cluster_list:
        for to_c in current_cluster_list:
            if (from_c != to_c):
                current_cluster = copy.deepcopy(tmp_cluster)
                current_cluster.ChangeCluster(from_c, to_c)
                if current_cluster.Quality() > iter_best:
                    best_cluster = current_cluster
                    iter_best = current_cluster.Quality()
               
    best_cluster.ClusterList()
    count_iter += 1
    
    prev_no_cluster = current_clusters
    current_clusters = best_cluster.no_cluster
    print(f"New clusters: {best_cluster.wordsPerCluster} - score: {round(best_cluster.Quality(),2)}" )
        

print(f"Best cluster: {best_cluster.wordsPerCluster}")
print(time.time() - T)
# c.Quality()
# c.ChangeCluster(1,2)
# c.Quality()
# print(c.wordsPerCluster)

# c.ChangeCluster(2,3)
# c.Quality()
# print(c.wordsPerCluster)



['and' 'are' 'by' 'cat' 'dog' 'man' 'owned' 'saw' 'the']
11 10
<class 'numpy.ndarray'>
New clusters: {1: ['and'], 2: ['are'], 3: ['by'], 4: ['cat'], 7: ['owned'], 8: ['saw'], 9: ['the'], 0: ['START'], 6: ['dog', 'man']} - score: 0.94
<class 'numpy.ndarray'>
New clusters: {1: ['and'], 2: ['are'], 3: ['by'], 7: ['owned'], 8: ['saw'], 9: ['the'], 0: ['START'], 6: ['cat', 'dog', 'man']} - score: 1.3
<class 'numpy.ndarray'>
New clusters: {3: ['by'], 7: ['owned'], 8: ['saw'], 9: ['the'], 0: ['START'], 6: ['cat', 'dog', 'man'], 2: ['and', 'are']} - score: 1.3
<class 'numpy.ndarray'>
New clusters: {8: ['saw'], 9: ['the'], 0: ['START'], 6: ['cat', 'dog', 'man'], 2: ['and', 'are'], 7: ['by', 'owned']} - score: 1.3
<class 'numpy.ndarray'>
New clusters: {9: ['the'], 6: ['cat', 'dog', 'man'], 2: ['and', 'are'], 7: ['by', 'owned'], 0: ['saw', 'START']} - score: 1.3
<class 'numpy.ndarray'>
New clusters: {9: ['the'], 6: ['cat', 'dog', 'man'], 0: ['saw', 'START'], 7: ['and', 'are', 'by', 'owned']} - sc

In [279]:
best_cluster.wordsPerCluster

{3: ['saw'], 4: ['the'], 0: ['START'], 2: ['cat', 'dog']}

In [161]:
# returns the size of the cluster to which wi belongs
def n_wi(wi, C_):
    ci = C_[wi]
    C_values = list(C_.values())
    return pd.Series(C_values).value_counts()[ci] 

# no. times wi and wj are next to each other in the text
def p_ci_cj(wi, wj, sentence, C_):
    ci = C_[wi]
    cj = C_[wj]
    prev_word = 'START'
    count = 0
    for word in sentence:
        if C_[word] == ci and C_[prev_word] == cj:
            count += 1
        prev_word = word
    return count


# measures how well clust distribution C, fits the sentence/text s
def Quality_new(C_, text):
    n_ = len(text)
    temp = 0
    for i in range(n_):
        wi = text[i]# wi: current word
        if i == 0:
            wj = 'START'
        else:
            wj = text[i-1] # wj: previous word
        
        temp_n =  p_ci_cj(wi, wj, text, C_)/n_
        temp_m = (temp_n * n_) / (n_wi(wi, C_) * n_wi(wj, C_))
        temp += temp_n*np.log(temp_m)
        
        p_c = n_wi(wi, C_)/n_
        p_c_prime = n_wi(wj, C_)/n_
        temp_new = temp_n * np.log(temp_n/(p_c*p_c_prime))
    return temp_new
        
s = 'the dog saw the cat'
Quality_new(C, s.split())

    

NameError: name 'C' is not defined

In [None]:
C2 = C.copy()
C2['cat'] = 2
C['cat'] = 40
print(f"C2: {C2} \n C: {C}")

s = 'the dog saw the cat'
print(f"C2: {Quality_new(C2, s.split())}")
print(f"C: {Quality_new(C, s.split())}")

## Selecting iteratively the clusters until max clustersize k = x


In [None]:
# max cluster size k
k = 4

### Naively
Greedily trying all possible combinations of clusters

In [None]:
# Returns the maximum value of the cluster 

def GetClusters(C_):
    C_values = C_.values()
    return list(set(list(C_values)))


clusters = GetClusters(C)

for i in C.keys():
    for j in C.keys():
        C