In [1]:
import spacy

nlp = spacy.load('en_core_web_md')




In [66]:
import numpy as np

from sklearn.cluster import KMeans


In [95]:
# answers to holidays
docs = [
    'I went to France to visit Paris',
    'New York City was crowded',
    'sightseeing in Bali',
    'swimming in the tropical oceans',
    'swimming in Bali',
    'raining in Paris',
    'visiting monuments in Paris',
    'San Francisco',
    'San Francisco',
    'flew to Berlin',
    'enjoyed visiting eiffel tower in paris',
    'Paris, city of love',
    'Flying is fun',
    'Paris is crowded',
    'France'
]


In [96]:
# Remove stop words

def stop_word_stripper(line):
    stop_words = [w.strip('\n').lower() for w in open('stop_words.txt').readlines()]
    pos_stopper = ['PUNCT', 'SYM']
    return ' '.join([token.text for token in line if str(token).lower() not in stop_words and token.pos_  not in pos_stopper])

stripped_docs = [] #spacy object
copy_docs = [] # strings
for d in docs:
    stripped_docs.append(nlp(stop_word_stripper(nlp(d))))
    copy_docs.append(stop_word_stripper(nlp(d)))
    
print('stripped docs', stripped_docs)

stripped docs [went France visit Paris, New York City crowded, sightseeing Bali, swimming tropical oceans, swimming Bali, raining Paris, visiting monuments Paris, San Francisco, San Francisco, flew Berlin, enjoyed visiting eiffel tower paris, Paris city love, Flying fun, Paris crowded, France]


In [97]:
# parse through to get entities 
kw_freq = {}

for i in range(len(stripped_docs)):
    line = stripped_docs[i]
    for e in line.ents:
        copy_docs[i] = copy_docs[i].replace(e.text, '').strip()
        if e.text in kw_freq:
            kw_freq[e.text] += 1
        else:
            kw_freq[e.text] = 1

print(kw_freq, copy_docs)

{'France': 2, 'Paris': 5, 'New York City': 1, 'Bali': 2, 'San Francisco': 2, 'Berlin': 1} ['went  visit', 'crowded', 'sightseeing', 'swimming tropical oceans', 'swimming', 'raining', 'visiting monuments', '', '', 'flew', 'enjoyed visiting eiffel tower paris', 'city love', 'Flying fun', 'crowded', '']


In [98]:
# get lemma keywords 
# join the rest of the words together: 

corpus = nlp(' '.join(copy_docs))

MIN_CHARACTERS = 3

for token in corpus:
    if len(token.lemma_) > MIN_CHARACTERS:
        if token.lemma_ in kw_freq:
            kw_freq[token.lemma_] += 1
        else:
            kw_freq[token.lemma_] = 1

print(kw_freq)

{'France': 2, 'Paris': 5, 'New York City': 1, 'Bali': 2, 'San Francisco': 2, 'Berlin': 1, 'visit': 3, 'crowded': 1, 'sightseeing': 1, 'swim': 2, 'tropical': 1, 'ocean': 1, 'rain': 1, 'monument': 1, 'enjoy': 1, 'eiffel': 1, 'tower': 1, 'paris': 1, 'city': 1, 'love': 1, 'flying': 1, 'crowd': 1}


In [99]:
# proper casing

caseless_freq = {}
propercase_freq = {}

for kw, count in kw_freq.items():
    if kw in caseless_freq:
        caseless_freq[kw.lower()].append(count)
    else:
        caseless_freq[kw.lower()] = [count]

for kw, count in kw_freq.items():
    if count == max(caseless_freq[kw.lower()]):
        propercase_freq[kw] = sum(caseless_freq[kw.lower()])

print(propercase_freq)
        

{'France': 2, 'Paris': 6, 'New York City': 1, 'Bali': 2, 'San Francisco': 2, 'Berlin': 1, 'visit': 3, 'crowded': 1, 'sightseeing': 1, 'swim': 2, 'tropical': 1, 'ocean': 1, 'rain': 1, 'monument': 1, 'enjoy': 1, 'eiffel': 1, 'tower': 1, 'city': 1, 'love': 1, 'flying': 1, 'crowd': 1}


In [100]:
# semantic k means clustering

glove_vectors = []
labels_array = []

for kw, count in propercase_freq.items():
    labels_array.append(kw)
    glove_vectors.append(nlp(kw)[0].vector)

print(np.array(glove_vectors).shape, labels_array)

(21, 300) ['France', 'Paris', 'New York City', 'Bali', 'San Francisco', 'Berlin', 'visit', 'crowded', 'sightseeing', 'swim', 'tropical', 'ocean', 'rain', 'monument', 'enjoy', 'eiffel', 'tower', 'city', 'love', 'flying', 'crowd']


In [101]:
# k means clustering 

kmeans_model = KMeans(init='k-means++', n_clusters=len(labels_array)//5, n_init=5)
kmeans_model.fit(glove_vectors)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=5, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [102]:
cluster_labels    = kmeans_model.labels_
cluster_inertia   = kmeans_model.inertia_

print(cluster_labels)

clusters = {}
for i in range(len(labels_array)):
    if cluster_labels[i] not in clusters:
        clusters[cluster_labels[i]] = [labels_array[i]]
    else:
        clusters[cluster_labels[i]].append(labels_array[i])

print (clusters)

[1 1 0 1 1 1 0 0 0 2 2 2 2 3 0 3 3 0 0 3 0]
{1: ['France', 'Paris', 'Bali', 'San Francisco', 'Berlin'], 0: ['New York City', 'visit', 'crowded', 'sightseeing', 'enjoy', 'city', 'love', 'crowd'], 2: ['swim', 'tropical', 'ocean', 'rain'], 3: ['monument', 'eiffel', 'tower', 'flying']}


In [103]:
#distance matrix (len(cluster_labels)^2)

from scipy import spatial

n = len(labels_array)

similarity_matrix = np.zeros([n, n])

for i in range(n):
    for j in range(n):
        distance_matrix[i][j] = spatial.distance.cosine(glove_vectors[i], glove_vectors[j])


In [116]:
# assign max font size

def assign_font_size(propercase_freq, max_size, min_size):
    label_fs = {}
    sorted_tuples = [(k, propercase_freq[k]) for k in sorted(propercase_freq, key=propercase_freq.get, reverse=True)]
    min_count = sorted_tuples[-1][1]
    max_count = sorted_tuples[0][1]
    
    for kw, count in sorted_tuples:
        size = int((max_size - min_size)/(max_count - min_count)*count + min_size - (max_size - min_size)/(max_count - min_count)*min_count)
        label_fs[kw] = size
    
    return (label_fs)
        
assign_font_size(propercase_freq, 40, 18)

{'Bali': 22,
 'Berlin': 18,
 'France': 22,
 'New York City': 18,
 'Paris': 40,
 'San Francisco': 22,
 'city': 18,
 'crowd': 18,
 'crowded': 18,
 'eiffel': 18,
 'enjoy': 18,
 'flying': 18,
 'love': 18,
 'monument': 18,
 'ocean': 18,
 'rain': 18,
 'sightseeing': 18,
 'swim': 22,
 'tower': 18,
 'tropical': 18,
 'visit': 26}

In [None]:
# 

In [None]:
# Aspect ration of words : font_size (length) font_size*0.7(width)
# Aspect ration of SVG file is 16:9

# How to draw V1 
# Create a polygon with the number of vertices = number of clusters 
# here cluster size = 3 so a triangle (not ever going to exceed 5)
# 3 rectangles to fit within the first rectangle 

# in a 16:9

# Cluster one in rect 1 (y = 16, 9/4) (w: 8, l: 9/2)
# cluster Two in rect 2 (y = 16, 9/4*3) (w: 8, l: 9/2)
# Cluster three in rect 3 (y = 8, 9/4) Biggest cluster ? (w: 8, l: 9/2)

# Where to put the words 
# Start with the highest frequence with the biggest font : assign max font size before starting to draw
# If the next one is smaller in frequence, font size drops by 
# font size 35 to 18
# random choice where the word fits 
