In [1]:
import spacy

nlp = spacy.load('en_core_web_md')


In [2]:
import numpy as np

from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation


In [3]:
import random

from math import cos
from math import sin


In [71]:
docs = [x.strip('\n') for x in open('data/real6.txt').readlines()]

In [72]:
# Remove stop words

def stop_word_stripper(line):
    stop_words = [w.strip('\n').lower() for w in open('data/stop_words.txt').readlines()]
    pos_stopper = ['PUNCT', 'SYM']
    return ' '.join([token.text for token in line if str(token).lower() not in stop_words and token.pos_  not in pos_stopper])

stripped_docs = [] #spacy object
copy_docs = [] # strings
for d in docs:
    stripped_docs.append(nlp(stop_word_stripper(nlp(d))))
    copy_docs.append(stop_word_stripper(nlp(d)))
    
print('stripped docs', stripped_docs)

stripped docs [Value, , Feature Set Services, , Value, Service, Red Color, Msp, Protection, Support, Support, Value, Development, Total Security, Total security, Simple, Value, Useful, Performance, Value, Innovation, Intigration, Bundle, Total Security, Support, Totalsecurity, Integration, Simple, Features, Value, Value, Integration]


In [73]:
# parse through to get entities 
kw_freq = {}

for i in range(len(stripped_docs)):
    line = stripped_docs[i]
    for e in line.ents:
        copy_docs[i] = copy_docs[i].replace(e.text, '').strip()
        if e.text in kw_freq:
            kw_freq[e.text] += 1
        else:
            kw_freq[e.text] = 1

print(kw_freq, copy_docs)

{'Feature Set Services': 1, 'Red Color': 1, 'Msp': 1} ['Value', '', '', '', 'Value', 'Service', '', '', 'Protection', 'Support', 'Support', 'Value', 'Development', 'Total Security', 'Total security', 'Simple', 'Value', 'Useful', 'Performance', 'Value', 'Innovation', 'Intigration', 'Bundle', 'Total Security', 'Support', 'Totalsecurity', 'Integration', 'Simple', 'Features', 'Value', 'Value', 'Integration']


In [74]:
# get lemma keywords 
# join the rest of the words together: 
from hunspell import Hunspell
h = Hunspell();

corpus = nlp(' '.join(copy_docs))

MIN_CHARACTERS = 3

for token in corpus:
    if len(token.lemma_) >= MIN_CHARACTERS:
        word = token.lemma_
        if word.lower == word:
            if not h.spell(token.lemma_):
                if len(h.suggest(token.lemma_)) > 0:
                    word = h.suggest(token.lemma_)[0]
        if word in kw_freq:
            kw_freq[word] += 1
        else:
            kw_freq[word] = 1

print(kw_freq)

{'Feature Set Services': 1, 'Red Color': 1, 'Msp': 1, 'value': 7, '   ': 1, 'service': 1, 'protection': 1, 'support': 3, 'development': 1, 'total': 3, 'security': 3, 'simple': 2, 'useful': 1, 'performance': 1, 'innovation': 1, 'intigration': 1, 'bundle': 1, 'totalsecurity': 1, 'integration': 2, 'features': 1}


In [75]:
# proper casing

caseless_freq = {}
propercase_freq = {}

for kw, count in kw_freq.items():
    if kw in caseless_freq:
        caseless_freq[kw.lower()].append(count)
    else:
        caseless_freq[kw.lower()] = [count]

for kw, count in kw_freq.items():
    if count == max(caseless_freq[kw.lower()]):
        propercase_freq[kw] = sum(caseless_freq[kw.lower()])

print(propercase_freq)
        

{'Feature Set Services': 1, 'Red Color': 1, 'Msp': 1, 'value': 7, '   ': 1, 'service': 1, 'protection': 1, 'support': 3, 'development': 1, 'total': 3, 'security': 3, 'simple': 2, 'useful': 1, 'performance': 1, 'innovation': 1, 'intigration': 1, 'bundle': 1, 'totalsecurity': 1, 'integration': 2, 'features': 1}


In [76]:
glove_vectors = []
labels_array = []

for kw, count in propercase_freq.items():
    labels_array.append(kw)
    if nlp(kw)[0].vector.any() :
        glove_vectors.append(nlp(kw)[0].vector)
    else:
        glove_vectors.append(np.array([0]*300))
print(np.array(glove_vectors).shape, labels_array)

(20, 300) ['Feature Set Services', 'Red Color', 'Msp', 'value', '   ', 'service', 'protection', 'support', 'development', 'total', 'security', 'simple', 'useful', 'performance', 'innovation', 'intigration', 'bundle', 'totalsecurity', 'integration', 'features']


In [77]:
# AffinityPropagation clustering 

AffinityPropagation_model = AffinityPropagation()
AffinityPropagation_model.fit(glove_vectors)

cluster_labels    = AffinityPropagation_model.labels_

clusters = {}
kw_cluster = {}
for i in range(len(labels_array)):
    if cluster_labels[i] not in clusters:
        clusters[cluster_labels[i]] = [labels_array[i]]
    else:
        clusters[cluster_labels[i]].append(labels_array[i])
    kw_cluster[labels_array[i]] = cluster_labels[i]

print (kw_cluster)

{'Feature Set Services': 4, 'Red Color': 1, 'Msp': 0, 'value': 1, '   ': 1, 'service': 1, 'protection': 2, 'support': 1, 'development': 3, 'total': 1, 'security': 2, 'simple': 1, 'useful': 1, 'performance': 1, 'innovation': 3, 'intigration': 1, 'bundle': 1, 'totalsecurity': 1, 'integration': 3, 'features': 4}


In [78]:
'''
# k means clustering 

kmeans_model = KMeans(init='k-means++', n_clusters=4, n_init=5)
kmeans_model.fit(glove_vectors)


cluster_labels    = kmeans_model.labels_

clusters = {}
kw_cluster = {}
for i in range(len(labels_array)):
    if cluster_labels[i] not in clusters:
        clusters[cluster_labels[i]] = [labels_array[i]]
    else:
        clusters[cluster_labels[i]].append(labels_array[i])
    kw_cluster[labels_array[i]] = cluster_labels[i]

print (kw_cluster)
'''


"\n# k means clustering \n\nkmeans_model = KMeans(init='k-means++', n_clusters=4, n_init=5)\nkmeans_model.fit(glove_vectors)\n\n\ncluster_labels    = kmeans_model.labels_\n\nclusters = {}\nkw_cluster = {}\nfor i in range(len(labels_array)):\n    if cluster_labels[i] not in clusters:\n        clusters[cluster_labels[i]] = [labels_array[i]]\n    else:\n        clusters[cluster_labels[i]].append(labels_array[i])\n    kw_cluster[labels_array[i]] = cluster_labels[i]\n\nprint (kw_cluster)\n"

In [79]:
#distance matrix (len(cluster_labels)^2)

from scipy import spatial

n = len(labels_array)

distance_matrix = np.zeros([n, n])

for i in range(n):
    for j in range(n):
        distance_matrix[i][j] = spatial.distance.cosine(glove_vectors[i], glove_vectors[j])


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


In [80]:
# assign max font size

def assign_font_size(propercase_freq, max_size, min_size):
    label_fs = {}
    sorted_tuples = [(k, propercase_freq[k]) for k in sorted(propercase_freq, key=propercase_freq.get, reverse=True)]
    min_count = sorted_tuples[-1][1]
    max_count = sorted_tuples[0][1]
    
    for kw, count in sorted_tuples:
        if (max_count - min_count) == 0:
            size = int((max_size - min_size) / 2.0 + min_size)
        else:
            #size = int(min_size + (max_size - min_size) * (count * 1.0 / (max_count - min_count)) ** 0.8)
            size = int((max_size - min_size)/(max_count - min_count)*count + min_size - (max_size - min_size)/(max_count - min_count)*min_count)
        label_fs[kw] = size
    
    return (label_fs)
        
kw_fs = assign_font_size(propercase_freq, 80, 30) #keyword_font_size
print(kw_fs)

{'value': 80, 'support': 46, 'total': 46, 'security': 46, 'simple': 38, 'integration': 38, 'Feature Set Services': 30, 'Red Color': 30, 'Msp': 30, '   ': 30, 'service': 30, 'protection': 30, 'development': 30, 'useful': 30, 'performance': 30, 'innovation': 30, 'intigration': 30, 'bundle': 30, 'totalsecurity': 30, 'features': 30}


In [81]:
def max_dimensions(kw_fs):
    kw_dimensions = {}
    for kw, fs in kw_fs.items():
        kw_dimensions[kw] = (int(0.65*len(kw)*fs), fs) #x, y (i.e. width, height)
    return kw_dimensions

kw_max_dim = max_dimensions(kw_fs)
print(kw_max_dim)

{'value': (260, 80), 'support': (209, 46), 'total': (149, 46), 'security': (239, 46), 'simple': (148, 38), 'integration': (271, 38), 'Feature Set Services': (390, 30), 'Red Color': (175, 30), 'Msp': (58, 30), '   ': (58, 30), 'service': (136, 30), 'protection': (195, 30), 'development': (214, 30), 'useful': (117, 30), 'performance': (214, 30), 'innovation': (195, 30), 'intigration': (214, 30), 'bundle': (117, 30), 'totalsecurity': (253, 30), 'features': (156, 30)}


In [82]:
class Word:
    def __init__(self, word, size, font_size, cluster):
        self.word = word
        self.width = size["width"] #{width, height}
        self.height = size["height"]
        self.font_size = font_size
        self.cluster = cluster

In [85]:
class Cloud:
    def __init__(self, words=[], canvas_size={"x": 1920, "y": 1080}, filename='clouds.html'):
        self.words = words
        self.canvas = [] #{word, font_size, x, y, width, height, color, cluster} <== color to be added
        self.canvas_size = canvas_size
        self.clusters = self.generate_clusters() # {0 : cluster0, 1 : cluster1, ...etc}
        self.filename = filename
        self.colors = ["#6F694E", "#65D0B2", "#D8F546", "#FF724B", "#D6523E", "#B3F0E6", "#EAF380", "#A7328E", "#33DB45", "#EAEA45", "#63FFF3", "#7488AC", "#C0F8E1"]
        self.positions = []
        
    def generate_clusters(self):
        clusters = {}
        for w in self.words:
            if w.cluster in clusters:
                clusters[w.cluster].append(w)
            else:
                clusters[w.cluster] = [w]
        return clusters
    
    '''
    def choose_cluster_start(self):
        start_points = {}
        start_point = {}
        r = 0
        for i in range(len(self.clusters)):
            c = self.clusters[i]
            n = len(c)
            
            H = self.canvas_size["y"] #total height
            L = self.canvas_size["x"] #total length
            
            if i%2 == 0:
                y = random.randint(int(0.1*H), int(0.55*H))
            else:
                y = random.randint(int(0.55*H), int(0.9*H))
            x = random.randint(int(r*L), min(int((r+len(c)/len(self.words))*L), int(L*0.90)))
            
            r = min(0.85, r + len(c)/len(self.words))
            start_points[c[0].cluster] = {
                "x": x,
                "y": y
            }
        return start_points
    '''
        
    def create_cloud(self):
        
        # sort by cluster size
        # sort by max font-size
        cl_size = {}
        for c, words in self.clusters.items():
            if len(words) < 4:
                avg_size = sum([w.font_size for w in words])//len(words)
            else:
                avg_size = sum(sorted([w.font_size for w in words])[::-1][:4])/4
            cl_size[c] = avg_size*3 - len(words)
        sorted_clusters = sorted(cl_size, key=cl_size.get)[::-1]
        
        start_position = { "x": self.canvas_size["x"]//2, "y": self.canvas_size["y"]//2 }
        
        for i in range(len(sorted_clusters)):
            c = sorted_clusters[i]
            words = self.clusters[c]
            self.positions = self.spiral(start_position)
            
            for w in words:
                new_position = self.add_word_to_cloud(w) 
            
            max_left_cloud = min([c["x"] for c in self.canvas])
            max_right_cloud = max([c["x"] for c in self.canvas])
            shift = 30
            if i%2 == 0:
                if new_position["x"] < self.canvas_size["x"]//2: 
                    start_position = { "x" : min(self.canvas_size["x"]//2 + new_position["x"], max_right_cloud + shift), "y": new_position["y"] }
                if new_position["x"] > self.canvas_size["x"]//2: 
                    start_position = { "x" : max(self.canvas_size["x"] - new_position["x"], max_left_cloud - shift), "y": new_position["y"] }
            #else:
            start_position = new_position
        
        self.center_cloud()
        
    def draw_cloud_to_svg(self):
        f = open(self.filename, 'w')
        f.write('<svg viewbox="0 0 {} {}" style="background: black">'.format(self.canvas_size["x"], self.canvas_size["y"]))
        for w in self.canvas:
           

            #f.write(' <rect x="{}" y="{}" width="{}" height="{}"/>'.format( w["x"], w["y"], w["width"], w["height"]))
            f.write('<text x="{}" y="{}" font-family="Verdana" font-size="{}" fill="{}">'.format(w["x"], w["y"], w["font_size"], w["color"]))
            f.write(w["word"])
            f.write('</text>\n')
        f.write('</svg>')
        f.close()
        
        
    def add_word_to_cloud(self, word): # word class Word
        center = {"x": self.canvas_size["x"] // 2, "y": self.canvas_size["y"] // 2}
        for p in self.positions:
            if p["x"] < center["x"]:
                if not self.verify_overlap( word, {"x": p["x"] - word.width, "y": p["y"]} ):
                    self.canvas.append({
                        "word": word.word,
                        "x": p["x"] - word.width,
                        "y": p["y"],
                        "width": word.width,
                        "height": word.height,
                        "font_size": word.font_size,
                        "color": self.colors[word.cluster],
                        "cluster": word.cluster
                    })
                    self.positions.remove(p)
                    return p
            else:
                if not self.verify_overlap( word, {"x": p["x"], "y": p["y"]} ):
                    self.canvas.append({
                        "word": word.word,
                        "x": p["x"],
                        "y": p["y"],
                        "width": word.width,
                        "height": word.height,
                        "font_size": word.font_size,
                        "color": self.colors[word.cluster],
                        "cluster": word.cluster
                    })
                    self.positions.remove(p)
                    return p

        return self.positions[-1]
            

    def rect_intersection(self, r1, r2):
        p1 = {}
        p1["x"] = r1["x"]
        p1["y"] = r1["y"] - r1["height"]

        p2 = {}
        p2["x"] = r1["x"] + r1["width"]
        p2["y"] = r1["y"]

        p3 = {}
        p3["x"] = r2["x"]
        p3["y"] = r2["y"] - r2["height"]

        p4 = {}
        p4["x"] = r2["x"] + r2["width"]
        p4["y"] = r2["y"]

        return not(p2["y"] < p3["y"] or p1["y"] > p4["y"] or p2["x"] < p3["x"] or p1["x"] > p4["x"])

    
    def verify_overlap(self, word, position): # true if overlaps, false if not
        new_rect = {
            "x": position["x"],
            "y": position["y"],
            "width": word.width,
            "height": word.height
        }
        for filled_rect in self.canvas:
            if self.rect_intersection(filled_rect, new_rect):
                return True
        #verify out of bound of rectangle:
        if new_rect["x"] < 0 or new_rect["x"] + new_rect["width"] > self.canvas_size["x"] or new_rect["y"] > 1080 or new_rect["y"]- new_rect["height"] < 0:
            return True
        return False
    

    def spiral(self, start_point): # returns an [] with positions to test 
        points = [start_point]
        # x = (a + b*theta)cos(theta)
        # y = (a + b*theta)sin(theta)

        # b = a final - a ini / 2 pi n  n=number of turns
        a_ini = 0
        # a_final = self.canvas_size["x"]*len(self.clusters[cluster])/len(self.words) #spiral radius 
        a_final = self.canvas_size["x"] #spiral radius 

        b = (a_final - a_ini)/(2*3.14159*(self.canvas_size["x"]/70))

        thetas = [ (self.canvas_size["y"]/10 * 2)/1000 *x for x in range(1000)]
        for i in thetas: #1000 points
            x = ( a_ini + b*i + cos(i)*b/10)*cos(i) + start_point["x"]
            y = ( a_ini + b*i + cos(i)*b/10)*sin(i) + start_point["y"]
            points.append({"x": x, "y": y})

        return points
    
    def center_cloud(self):
        xs = [c["x"] for c in self.canvas]
        ys = [c["y"] for c in self.canvas]
        
        x_min = min(xs)
        x_max = max(xs) # ! not real max, real max need word width
        
        y_min = min(ys)
        y_max = max(ys)
        
        shift_x = x_min - (self.canvas_size["x"] - (x_max - x_min))//2
        shift_y = y_min - (self.canvas_size["y"] - (y_max - y_min))//2
        
        for c in self.canvas:
            c["x"] -= shift_x
            c["y"] -= shift_y
        
        
    '''
    def compress(self):
        # pull words towards the one zith the most occurence
        # create line 
        # test positions along that line 
        sizes = []
        for w in self.canvas:
            sizes.append(w["font_size"])
        central_word = self.canvas[sizes.index(max(sizes))]
        
        for w in self.canvas:
            if w["cluster"] != central_word["cluster"]:
                # sort tham by distance 
                pos_central_word = np.array([central_word["x"], central_word["y"]])
                pos_w = np.array([w["x"], w["y"]])
                dist = numpy.sqrt(numpy.sum((pos_central_word - pos_w)**2))
                
                # draw line 
                # inch closer 
                coeff = central_word["y"] - w["y"] / central_word["x"] - w["x"]
                coordiantes = [{"x": central_word["x"] + (central_word["x"] - w["x"])/100 * i, "y": central_word["y"] + coeff* (central_word["x"] - w["x"])/100 * i } for i in range(100)]
                for c in coordinates:
                    for word in self.words:
                        if self.verify_overlap(word, c):
                            break
    '''             

In [86]:
words = []
for kw, d in kw_max_dim.items():
    words.append(Word(kw, {"width": d[0], "height": d[1]}, kw_fs[kw], kw_cluster[kw]))

cloud = Cloud(words=words)

cloud.create_cloud()

cloud.draw_cloud_to_svg()
#cloud.compress()

#print(cloud.canvas)
'''
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10,10))

ax = fig.add_subplot(111)
fig.subplots_adjust(top=0.85)

for w in cloud.canvas:
    ax.text(w["x"], w["y"], w["word"], fontsize=w["font_size"]//3)

ax.axis([0, 1920, 0, 1080])

'''
#plt.show()


'\nimport matplotlib.pyplot as plt\n\nfig = plt.figure(figsize=(10,10))\n\nax = fig.add_subplot(111)\nfig.subplots_adjust(top=0.85)\n\nfor w in cloud.canvas:\n    ax.text(w["x"], w["y"], w["word"], fontsize=w["font_size"]//3)\n\nax.axis([0, 1920, 0, 1080])\n\n'

In [37]:
class Vertex:
    def __init__(self, node):
        self.id = node # we have a dict {id : { word, weight } }
        self.adjacent = {}

    def __str__(self):
        return str(self.id) + ' adjacent: ' + str([x.id for x in self.adjacent])

    def add_neighbor(self, neighbor, weight=0):
        self.adjacent[neighbor] = weight

    def get_connections(self):
        return self.adjacent.keys()  

    def get_id(self):
        return self.id

    def get_weight(self, neighbor):
        return self.adjacent[neighbor]


In [38]:
class Graph:
    def __init__(self):
        self.vert_dict = {}
        self.num_vertices = 0

    def __iter__(self):
        return iter(self.vert_dict.values())

    def add_vertex(self, node):
        self.num_vertices = self.num_vertices + 1
        new_vertex = Vertex(node)
        self.vert_dict[node] = new_vertex
        return new_vertex

    def get_vertex(self, n):
        if n in self.vert_dict:
            return self.vert_dict[n]
        else:
            return None

    def add_edge(self, frm, to, cost = 0):
        if frm not in self.vert_dict:
            self.add_vertex(frm)
        if to not in self.vert_dict:
            self.add_vertex(to)

        self.vert_dict[frm].add_neighbor(self.vert_dict[to], cost)
        self.vert_dict[to].add_neighbor(self.vert_dict[frm], cost)

    def get_vertices(self):
        return self.vert_dict.keys()
    

In [312]:
'''
Star Forest Clustering and putting together
'''

def StarForestAlgo(g):
    '''
    g = similarity graph
    '''
    stars = []
    while True:
        usedVertices = []
        bestCenter = findBestCenter(g, usedVertices)

        if bestCenter is None:
            break; 
        
        star, usedVertices = createGraphStar(g, bestCenter, usedVertices) # graph, vertice, [vertices]
        print(usedVertices)
        stars.append(star)
        
        
    return stars


def findBestCenter(g, usedVertices): # graph, [vertices]
    best_sum = 0
    best_center = None
    for v in g.get_vertices():
        if v not in usedVertices:
            sum = getSumOfConnectedEdges(g, v, usedVertices)
            if sum > best_sum:
                best_center = v
    return best_center


def getSumOfConnectedEdges(g, v, usedVertices):
    sum = 0
    connections = g.get_vertex(v).get_connections()
    for c in connections:
        if c not in usedVertices:
            sum += g.get_vertex(v).get_weight(c)
    return sum
    

def createGraphStar(g, bestCenter, usedVertices):
    star = Graph()
    for v in g.get_vertex(bestCenter).get_connections():
        if v not in usedVertices and g.get_vertex(bestCenter) != v:
            star.add_edge(bestCenter, v, g.get_vertex(bestCenter).get_weight(v))
            print(v)
            usedVertices.append(v)
    return star, usedVertices



g = Graph()

g.add_vertex('a')
g.add_vertex('b')
g.add_vertex('c')
g.add_vertex('d')
g.add_vertex('e')
g.add_vertex('f')

g.add_edge('a', 'b', 7)  
g.add_edge('a', 'c', 9)
g.add_edge('a', 'f', 14)
g.add_edge('b', 'c', 10)
g.add_edge('b', 'd', 15)
g.add_edge('c', 'd', 11)
g.add_edge('c', 'f', 2)
g.add_edge('d', 'e', 6)
g.add_edge('e', 'f', 9)

#StarForestAlgo(g)


In [None]:
# Aspect ration of words : font_size (length) font_size*0.7(width)
# Aspect ration of SVG file is 16:9

# How to draw V1 
# Create a polygon with the number of vertices = number of clusters 
# here cluster size = 3 so a triangle (not ever going to exceed 5)
# 3 rectangles to fit within the first rectangle 

# in a 16:9

# Cluster one in rect 1 (y = 16, 9/4) (w: 8, l: 9/2)
# cluster Two in rect 2 (y = 16, 9/4*3) (w: 8, l: 9/2)
# Cluster three in rect 3 (y = 8, 9/4) Biggest cluster ? (w: 8, l: 9/2)

# Where to put the words 
# Start with the highest frequence with the biggest font : assign max font size before starting to draw
# If the next one is smaller in frequence, font size drops by 
# font size 35 to 18
# random choice where the word fits 

In [None]:
'''
    def seam_carving(self):
        board = self.make_board()
        sparse = 0
        while sparse < 500:
            print(sparse)
            sparse += 1
            self.find_and_remove_path(board)
            board = self.make_board()
            
    def find_and_remove_path(self, board):
        v_path = []
        h_path = []
        sparse = True
        for y in range(self.canvas_size["y"]):
            pt = {}
            pt["y"] = y
            pt["x"] = np.argmin(board[y])
            v_path.append(pt) # first step 
            
        for x in range(self.canvas_size["x"]):
            pt = {}
            pt["x"] = x
            pt["y"] = np.argmin(board[:,x])
            h_path.append(pt) # first step 
        
        for p in v_path + h_path:
            for w in self.canvas:
                if w["x"] > p["x"]:
                    w["x"]-= 1
                if w["y"] > p["y"]:
                    w["y"] -= 1
                    
        board = self.make_board()
        return sparse

    
    def make_board(self):
        cv = self.canvas
        # map canvas to a 1920 1080 matrix 
        board = np.zeros(shape=(1080, 1920)) #lines, columns
        for w in cv:
            for i in range(int(w["y"]) - int(w["height"]), int(w["y"])+1):
                board[i][int(w["x"]) : int(w["x"]) + int(w["width"]) +1 ] = 1
        return board
'''