In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import gensim
import os

from networkx.drawing.nx_agraph import graphviz_layout
from chinese_whispers import chinese_whispers, aggregate_clusters
from gensim.models.poincare import PoincareModel
from nltk.corpus import wordnet as wn

### Construct the NetworkX graph
The taxonomy is read from a tab-separated CSV file.

In [None]:
def display_taxonomy(graph):
    """Draw the taxonomy as a left-to-right hierarchy and show it.

    Node positions are computed by Graphviz's ``dot`` program with rank
    direction ``LR``; the graph is then drawn with labels and arrows on a
    48x144 inch figure (figure number 3).

    Args:
        graph: the networkx graph to draw.
    """
    node_positions = graphviz_layout(graph, prog='dot', args="-Grankdir=LR")
    plt.figure(num=3, figsize=(48, 144))
    nx.draw(graph, pos=node_positions, with_labels=True, arrows=True)
    plt.show()

In [None]:
# Path of the tab-separated taxonomy file to refine; it is reused at the end of
# the notebook to derive the name of the output file.
input_path = '../taxi_output/simple_full/science_en.csv-relations.csv-taxo-knn1.csv'

In [None]:
# Read the taxonomy as a dataframe.
# The file is tab-separated and has no header row; only columns 1 and 2 are
# kept (column 0 is skipped) and named hyponym / hypernym.
# `input_path` is used instead of repeating the literal so that the file that
# is read and the output name derived from `input_path` cannot drift apart.
df = pd.read_csv(
    input_path,
    sep='\t',
    header=None,
    names=['hyponym', 'hypernym'],
    usecols=[1, 2],
)

In [None]:
# Construct the networkx graph: one directed edge hypernym -> hyponym per row.
G = nx.DiGraph()
for hypernym, hyponym in zip(df['hypernym'], df['hyponym']):
    # Compound terms become single tokens: if a term contains a blank, its
    # whitespace-separated parts are joined with underscores; other terms
    # are used unchanged.
    source = '_'.join(hypernym.split()) if ' ' in hypernym else hypernym
    target = '_'.join(hyponym.split()) if ' ' in hyponym else hyponym
    G.add_edge(source, target)

## Load Word Vectors in gensim

### If the pre-trained vectors are in '.vec' format, save them in a binary file
This needs to be done only once:  
Load the pre-trained vectors from the **'.vec'** file once and save them as **'.bin'**, so that subsequent loads are faster.

In [None]:
def load_vectors_old(path, mode='own', save_binary=False):
    """Load word vectors from ``path``.

    Args:
        path: location of the vectors on disk.
        mode: ``'own'`` loads vectors saved by gensim itself via
            ``KeyedVectors.load``. Any other value (documented as ``'fast'``,
            for pre-trained FastText embeddings) loads word2vec-format
            vectors: text format when the extension is ``.vec``, binary
            format otherwise.
        save_binary: when loading a ``.vec`` file, also write the vectors
            next to it as ``<path without extension>.bin``.

    Returns:
        The loaded gensim model. Vectors loaded in word2vec format are
        normalised in place (``init_sims(replace=True)``) before returning.
    """
    if mode == 'own':
        return gensim.models.KeyedVectors.load(path)

    stem, extension = os.path.splitext(path)
    is_text_format = extension == '.vec'
    model = gensim.models.KeyedVectors.load_word2vec_format(
        path, binary=not is_text_format, unicode_errors='ignore'
    )
    if is_text_format and save_binary:
        model.save_word2vec_format(stem + '.bin', binary=True)
    model.init_sims(replace=True)
    return model

In [None]:
def load_vectors(embedding, embedding_dir='/home/5aly/taxi/distributed_semantics/embeddings/'):
    """Load one of the known embedding models by name.

    Args:
        embedding: which embedding to load. Supported names:
            ``"wiki2M"`` (``crawl-300d-2M.vec``, word2vec text format),
            ``"wiki1M_subword"`` (``wiki-news-300d-1M-subword.vec``,
            word2vec text format), ``"own_w2v"`` (gensim ``KeyedVectors``
            saved as ``own_embeddings_w2v``) and ``"poincare"`` (gensim
            ``PoincareModel`` saved as ``embeddings_poincare_wordnet``).
        embedding_dir: directory that contains the embedding files. Defaults
            to the location this notebook was originally written for.

    Returns:
        The loaded model: a ``KeyedVectors`` instance for the first three
        names, a ``PoincareModel`` for ``"poincare"``.

    Raises:
        ValueError: if ``embedding`` is not one of the supported names.
    """
    if embedding == "wiki2M":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            os.path.join(embedding_dir, 'crawl-300d-2M.vec'), binary=False
        )
    elif embedding == "wiki1M_subword":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            os.path.join(embedding_dir, 'wiki-news-300d-1M-subword.vec'), binary=False
        )
    elif embedding == "own_w2v":
        model = gensim.models.KeyedVectors.load(os.path.join(embedding_dir, 'own_embeddings_w2v'))
    elif embedding == "poincare":
        model = PoincareModel.load(os.path.join(embedding_dir, 'embeddings_poincare_wordnet'))
    else:
        # Fail with a clear message instead of an UnboundLocalError on `model`.
        raise ValueError(
            "Unknown embedding %r; expected one of 'wiki2M', 'wiki1M_subword', "
            "'own_w2v' or 'poincare'" % (embedding,)
        )

    return model

In [None]:
# Name of the embedding to use; it selects the branch taken in load_vectors
# and in the clustering / similarity functions below.
embeddings = 'poincare'
# Despite its name, `w2v` holds whichever model was selected above.
w2v = load_vectors(embeddings)

# Improving Taxonomy with Distributional Semantics

Create a networkx graph for each node containing only its children. Draw edges among the children based on the similarity with one another using word vectors.

In [None]:
def create_children_clusters(w2v_model, graph, embedding, depth=100):
    """Build, for every node of the taxonomy, a similarity graph of its children.

    Each child (lower-cased) is linked to every sibling that appears among
    its ``depth`` most similar words in the embedding model.

    Args:
        w2v_model: the embedding model. For ``embedding == "poincare"`` it
            must expose ``kv.most_similar``; otherwise ``most_similar`` is
            called on the model itself.
        graph: directed taxonomy graph (edges point from parent to child).
        embedding: name of the embedding; only ``"poincare"`` is treated
            specially (children are looked up through their WordNet senses).
        depth: number of nearest neighbours requested per lookup.

    Returns:
        dict mapping every node of ``graph`` to an undirected ``nx.Graph``
        over its lower-cased children.
    """
    clustered_graph = {}
    for node in graph.nodes():
        children_graph = nx.Graph()
        clustered_graph[node] = children_graph
        successors = [s.lower() for s in graph.successors(node)]
        # Membership is tested once per returned neighbour, so use a set
        # instead of scanning the list each time.
        successor_set = set(successors)

        for successor in successors:
            word_in_vocab = False
            if embedding == "poincare":
                # Query the model once per WordNet sense of the child.
                for sense in wn.synsets(successor):
                    try:
                        for word, _ in w2v_model.kv.most_similar(sense.name(), topn=depth):
                            word_in_vocab = True
                            # Keep only the text before the first '.' of the
                            # returned key (e.g. 'dog.n.01' -> 'dog').
                            neighbour = word.split('.')[0].lower()
                            if neighbour in successor_set:
                                children_graph.add_edge(successor, neighbour)
                    except KeyError:
                        # This sense is not in the model; try the next one.
                        continue
            else:
                try:
                    for word, _ in w2v_model.most_similar(successor, topn=depth):
                        word_in_vocab = True
                        neighbour = word.lower()
                        if neighbour in successor_set:
                            children_graph.add_edge(successor, neighbour)
                except KeyError:
                    # Out-of-vocabulary child: handled by the fallback below.
                    pass

            if not word_in_vocab:
                # If the word is not in the vocabulary, fall back to the
                # substring-based method: keep the child (as an isolated
                # node) when its first or last term equals the parent.
                successor_terms = successor.split('_')
                if node in (successor_terms[0], successor_terms[-1]):
                    children_graph.add_node(successor)

    return clustered_graph

In [None]:
# Children-similarity graphs of the original taxonomy, keyed by parent node.
GC = create_children_clusters(w2v, G, embeddings)

In [None]:
# Lay out and draw the children-similarity graph of one example node
# ('engineering') as a quick visual check.
posI = graphviz_layout(GC['engineering'])
# plt.figure(2, figsize=(20, 20))
nx.draw(GC['engineering'], posI, with_labels=True, arrows=True)
plt.show()

## Implementing Chinese Whispers Algorithm

### Removal of smaller clusters
- For every node, cluster its children.
- Keep only the biggest cluster and detach the rest from the graph.  
- Store the removed clusters in a list.

In [None]:
def remove_clusters(model, nx_graph, embedding, depth=100, max_removed=10):
    """Detach the smallest children clusters of every node from the taxonomy.

    For each node, its children are clustered with Chinese Whispers. The
    largest cluster is always kept attached; of the remaining clusters, the
    ``max_removed`` smallest are detached by removing the edges from the
    node to their members.

    Args:
        model: embedding model, passed through to ``create_children_clusters``.
        nx_graph: directed taxonomy graph; it is modified in place.
        embedding: name of the embedding, passed through.
        depth: number of nearest neighbours used when building the clusters.
        max_removed: upper bound on the number of clusters detached per node.

    Returns:
        Tuple ``(nx_graph, removed_clusters)`` where ``removed_clusters`` is
        the list of detached clusters (each a collection of child names).
    """
    print('Removing small clusters..')
    g_clustered = create_children_clusters(model, nx_graph, embedding, depth)
    removed_clusters = []

    for node, graph in g_clustered.items():
        gc = chinese_whispers(graph, weighting='top', iterations=60)
        # Aggregate once and order by size; sorting on len() alone avoids
        # breaking ties by comparing the clusters themselves.
        clusters = sorted(aggregate_clusters(gc).values(), key=len)
        if not clusters:  # node without clustered children
            continue

        # The last entry is the biggest cluster and stays attached. Without
        # excluding it, nodes with at most `max_removed` clusters would lose
        # all of their children.
        for cluster in clusters[:-1][:max_removed]:
            removed_clusters.append(cluster)
            for item in cluster:
                # Cluster members are lower-cased child names; skip any that
                # do not correspond to an edge instead of raising.
                if nx_graph.has_edge(node, item):
                    nx_graph.remove_edge(node, item)

    return nx_graph, removed_clusters

In [None]:
# Work on a copy so the original taxonomy G stays available for comparison;
# remove_clusters edits the graph it receives in place and returns it.
G_improved = G.copy()
G_improved, removed_clusters = remove_clusters(w2v, G_improved, embeddings)

In [None]:
# Number of clusters that were detached from the taxonomy.
len(removed_clusters)

### Adding back the removed clusters
- Loop through all the removed clusters.
- For each removed cluster, find out the cluster in the graph that has the maximum similarity with it.

The similarity between two clusters is the average of the pairwise similarities of their elements, i.e. the average over all $N \times M$ pairs for clusters of sizes $N$ and $M$.

In [None]:
def _summed_similarity(w2v_model, first, second, embedding):
    """Return the similarity of two terms, summed over sense pairs for poincare.

    Terms (or senses) missing from the model contribute 0.
    """
    if embedding != "poincare":
        try:
            return w2v_model.similarity(first, second)
        except KeyError:  # skip the terms not in vocabulary
            return 0

    total = 0
    # The model is queried with sense names (the same form used for
    # most_similar lookups), not with Synset objects.
    second_names = [sense.name() for sense in wn.synsets(second)]
    for first_sense in wn.synsets(first):
        first_name = first_sense.name()
        for second_name in second_names:
            try:
                total += w2v_model.kv.similarity(first_name, second_name)
            except KeyError as err:
                # If the first sense itself is unknown, no pair with it can
                # succeed: move on to the next first sense. Otherwise only
                # this second sense is unknown, so keep going.
                if first_name in str(err):
                    break
    return total


def calculate_similarity(w2v_model, parent, family, cluster, embedding):
    """Score how well a detached cluster fits under a parent and one of its families.

    Args:
        w2v_model: embedding model; ``kv.similarity`` is used when
            ``embedding == "poincare"``, ``similarity`` otherwise.
        parent: candidate parent term.
        family: collection of terms already attached to ``parent``.
        cluster: collection of terms of the detached cluster.
        embedding: name of the embedding in use.

    Returns:
        The mean of two values: the summed parent-to-item similarity divided
        by ``len(cluster)``, and the summed family-to-cluster similarity
        divided by ``len(family) * len(cluster)``. An empty ``cluster``
        yields 0, and an empty ``family`` contributes 0 to the mean.
    """
    if not cluster:
        return 0

    # Similarity between the parent and a cluster
    parent_similarity = sum(
        _summed_similarity(w2v_model, parent, item, embedding) for item in cluster
    ) / len(cluster)

    # Similarity between a family and a cluster
    family_similarity = 0
    pair_count = len(family) * len(cluster)
    if pair_count:
        family_similarity = sum(
            _summed_similarity(w2v_model, f_item, c_item, embedding)
            for f_item in family
            for c_item in cluster
        ) / pair_count

    # Final score is the average of both the similarities
    return (parent_similarity + family_similarity) / 2

In [None]:
# Children-similarity graphs of the taxonomy after the small clusters were
# detached; their clusters are the candidate families for re-attachment.
GC_detached = create_children_clusters(w2v, G_improved, embeddings)

In [None]:
# Cluster every node's remaining children once, up front. GC_detached is not
# modified while clusters are re-attached, so repeating Chinese Whispers for
# every removed cluster only redoes the same work.
# NOTE(review): chinese_whispers is called without a seed, so the clustering
# is presumably randomized - confirm. If so, scoring all removed clusters
# against one clustering is also more consistent.
candidate_families = []
for node, graph in GC_detached.items():
    gc = chinese_whispers(graph, weighting='top', iterations=60)
    for family in aggregate_clusters(gc).values():
        candidate_families.append((node, family))

for cluster in removed_clusters:
    # Find the parent whose family is most similar to the removed cluster.
    # '' is a placeholder parent for clusters for which no score exceeds 0.
    max_score = 0
    max_score_node = ''
    for node, family in candidate_families:
        score = calculate_similarity(w2v, node, family, cluster, embeddings)
        if score > max_score:
            max_score = score
            max_score_node = node
    for item in cluster:
        G_improved.add_edge(max_score_node, item)

### Tuning the nodes and the edges

In [None]:
# '' is the placeholder parent used during re-attachment for clusters that
# matched no family; removing the node also removes the edges attached to it.
if '' in G_improved.nodes():
    G_improved.remove_node('')

In [None]:
# Nodes that currently act as a parent of at least one edge.
hypernyms = {parent for parent, _ in G_improved.edges()}
isolated_nodes = list(nx.isolates(G_improved))

for isolated_node in isolated_nodes:
    terms = isolated_node.split('_')
    # Substring-based re-attachment: prefer the last term of the compound,
    # then the first one; `hypernyms` is the snapshot taken above.
    new_parent = next(
        (term for term in (terms[-1], terms[0]) if term in hypernyms),
        None,
    )
    if new_parent is None:
        # No plausible parent: drop the node from the taxonomy.
        G_improved.remove_node(isolated_node)
    else:
        G_improved.add_edge(new_parent, isolated_node)

## Results visualization

### Clusters

In [None]:
def visualize_clusters(graph):
    """Cluster a graph with Chinese Whispers and draw it coloured by cluster.

    Args:
        graph: the graph to cluster and draw, e.g. one of the
            children-similarity graphs built for a taxonomy node.
    """
    graph_cluster = chinese_whispers(graph, weighting='top', iterations=60)

    # One colour value per node, derived from the node's cluster label.
    # `Graph.nodes[...]` replaces the `Graph.node[...]` attribute, which was
    # removed in networkx 2.4.
    # NOTE(review): assumes labels are non-zero numbers - confirm.
    colors = [1. / graph_cluster.nodes[node]['label'] for node in graph_cluster.nodes()]
    fig = plt.gcf()
    fig.set_size_inches(20, 20)
    nx.draw_networkx(graph_cluster, cmap=plt.get_cmap('jet'), node_color=colors, font_color='black')
    plt.show()

In [None]:
# Children-similarity graphs of the final taxonomy (after re-attachment and
# the handling of isolated nodes).
GC_improved = create_children_clusters(w2v, G_improved, embeddings)

In [None]:
# Taxonomy node whose children clusters are visualized in the cells below.
domain = 'engineering'

In [None]:
# Clusters of the domain's children in the original taxonomy
visualize_clusters(GC[domain])

In [None]:
# Clusters of the domain's children after detaching the small clusters
visualize_clusters(GC_detached[domain])

In [None]:
# Clusters of the domain's children after detaching and re-attaching the clusters
visualize_clusters(GC_improved[domain])

### Taxonomy

In [None]:
# View the original taxonomy (G was left untouched; all edits went to a copy)
display_taxonomy(G)

In [None]:
# View the modified taxonomy (small clusters detached and re-attached)
display_taxonomy(G_improved)

## Save the result

In [None]:
# One row per edge, stored as (hyponym, hypernym). Graph edges run
# hypernym -> hyponym, hence the swap; underscores are turned back into blanks.
edge_rows = [
    (hyponym.replace('_', ' '), hypernym.replace('_', ' '))
    for hypernym, hyponym in G_improved.edges()
]
df_improved = pd.DataFrame(edge_rows, columns=['hyponym', 'hypernym'])

In [None]:
# Derive the output file name from the input file name and the embedding in
# use, so results produced with different embeddings do not overwrite each
# other under a misleading '-poincare' suffix.
file_path = os.path.splitext(input_path)
output_dir = '../taxi_output/distributional_semantics/'
# Create the target directory if needed so the write below cannot fail on it.
os.makedirs(output_dir, exist_ok=True)
output_path = output_dir + os.path.basename(file_path[0]) + '-semantic-' + embeddings + file_path[1]
# Tab-separated, no header; the dataframe index is written as the first column.
df_improved.to_csv(output_path, sep='\t', header=False)