In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import gensim
import os

from networkx.drawing.nx_agraph import write_dot, graphviz_layout
from chinese_whispers import chinese_whispers, aggregate_clusters

### Construct the Networkx graph
The graph is built from a tab-separated CSV file of hyponym–hypernym relations.

In [2]:
def display_taxonomy(graph):
    """ Display the taxonomy in a hierarchical layout.

        Args:
            graph: the NetworkX graph to draw; it is handed to `graphviz_layout`
                   for positioning and to `nx.draw` for rendering.
    """
    # Lay out the graph that was passed in (not the notebook-level `G`),
    # using Graphviz 'dot' with ranks running left-to-right.
    pos = graphviz_layout(graph, prog='dot', args="-Grankdir=LR")
    plt.figure(3, figsize=(48, 144))
    nx.draw(graph, pos, with_labels=True, arrows=True)
    plt.show()

In [3]:
# Path of the input taxonomy file; the output file name is derived from it when the result is saved
input_path = '../taxi_output/simple_full/science_en.csv-relations.csv-taxo-knn1.csv'

In [4]:
# Read the taxonomy as a dataframe.
# The file is tab-separated and has no header row; only columns 1 and 2 are kept
# and named 'hyponym' and 'hypernym'.
# `input_path` is used instead of repeating the literal so that the file read
# here and the output name derived from `input_path` cannot drift apart.
df = pd.read_csv(
    input_path,
    sep='\t',
    header=None,
    names=['hyponym', 'hypernym'],
    usecols=[1, 2],
)

In [5]:
# Build the directed taxonomy graph: every edge points from hypernym to hyponym
G = nx.DiGraph()
for hypernym, hyponym in zip(list(df['hypernym']), list(df['hyponym'])):
    # Compound terms (those containing a blank) are collapsed into a single
    # token by joining their whitespace-separated parts with underscores
    source = '_'.join(hypernym.split()) if ' ' in hypernym else hypernym
    target = '_'.join(hyponym.split()) if ' ' in hyponym else hyponym
    G.add_edge(source, target)

## Load Word Vectors in gensim

### If the pre-trained vectors are in '.vec' format, save them in a binary file
This needs to be done only once:  
Load the pre-trained vectors in **'.vec'** format and save them in **'.bin'** format, so that subsequent loads are fast.

In [6]:
def load_vectors(path, mode='own', save_binary=False):
    """ Load word vectors.
        Mode Types:
            - 'fast': Load word vectors from pre-trained embeddings in FastText
            - 'own': Load word vectors from own embeddings

        Any mode other than 'own' is handled like 'fast'.

        To save the loaded vectors in binary format, set 'save_binary' to True.
        This only has an effect when `path` ends in '.vec'; the binary copy is
        written next to it with the extension replaced by '.bin'.

        Returns the loaded gensim model. For pre-trained vectors `init_sims(replace=True)`
        is called on the model before it is returned.
    """

    if mode == 'own':
        return gensim.models.KeyedVectors.load(path)

    root, extension = os.path.splitext(path)
    if extension == '.vec':  # for pre-trained vectors in '.vec' format
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=False, unicode_errors='ignore')
        if save_binary:
            # Save the model that was just loaded (not a notebook-level global)
            model.save_word2vec_format(root + '.bin', binary=True)
    else:  # for pre-trained vectors in '.bin' format
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, unicode_errors='ignore')
    model.init_sims(replace=True)

    return model

In [7]:
# Load the embeddings with the default mode ('own'); `w2v` is read as a global
# by create_children_clusters and calculate_cluster_similarity
w2v = load_vectors('embeddings/own_embeddings_w2v')

# Improving Taxonomy with Distributional Semantics

For each node, create a NetworkX graph containing only its children. Two children are connected by an edge when one of them appears among the other's most similar words according to the word vectors.

In [8]:
def create_children_clusters(graph):
    """ This function returns a dictionary where corresponding to each key(node) is a graph of its children.

        For every node of `graph` an undirected `nx.Graph` is built from the node's
        lower-cased successors:
          - an edge links a successor to each word returned by `w2v.most_similar`
            for it that (lower-cased) is itself one of the node's successors;
          - a successor for which the lookup raises KeyError is added as a bare node,
            but only when the parent node equals its first or last
            underscore-separated term.

        Relies on the notebook-level word-vector model `w2v`.
    """
    clustered_graph = {}
    for node in graph.nodes():
        children_graph = nx.Graph()
        clustered_graph[node] = children_graph

        # Keep the list for a stable iteration order and a set for fast membership tests
        successors = [s.lower() for s in graph.successors(node)]
        successor_set = set(successors)

        for successor in successors:
            try:
                for word, _ in w2v.most_similar(successor):
                    word = word.lower()
                    if word in successor_set:
                        children_graph.add_edge(successor, word)
            except KeyError:
                # The successor has no vector: fall back to matching the parent
                # against the first/last term of the compound
                successor_terms = successor.split('_')
                root_terms = [successor_terms[0], successor_terms[-1]]
                if node in root_terms:
                    children_graph.add_node(successor)

    return clustered_graph

In [18]:
# Map every node of the original taxonomy `G` to the graph built from its children
GC = create_children_clusters(G)

  # Remove the CWD from sys.path while we load stuff.
  if np.issubdtype(vec.dtype, np.int):


In [None]:
# Draw the children graph that was built for the 'engineering' node
engineering_children = GC['engineering']
posI = graphviz_layout(engineering_children)
# plt.figure(2, figsize=(20, 20))
nx.draw(engineering_children, pos=posI, with_labels=True, arrows=True)
plt.show()

## Implementing Chinese Whispers Algorithm

### Removal of smaller clusters
- For every node, cluster its children.
- Keep only the biggest cluster and detach the rest from the graph.  
- Store the removed clusters in a list.

In [19]:
# Work on a copy so that the original taxonomy `G` is left untouched
G_improved = G.copy()
removed_clusters = []

for parent, children_graph in GC.items():
    labelled = chinese_whispers(children_graph, weighting='top', iterations=60)
    try:
        clusters = aggregate_clusters(labelled)
        largest_size = max(len(members) for members in clusters.values())
    except ValueError:  # nothing to take the maximum of: this node has no clusters
        continue
    # Detach every cluster that is strictly smaller than the largest one
    for members in clusters.values():
        if len(members) >= largest_size:
            continue
        removed_clusters.append(members)
        for child in members:
            G_improved.remove_edge(parent, child)

In [None]:
# Number of clusters collected in `removed_clusters` by the removal step
len(removed_clusters)

### Adding back the removed clusters
- Loop through all the removed clusters.
- For each removed cluster, find out the cluster in the graph that has the maximum similarity with it.

The similarity between two clusters is the average of the pairwise similarities between their elements, i.e. the mean over all N×M pairs (pairs containing an out-of-vocabulary term are skipped).

In [11]:
def calculate_cluster_similarity(cluster_1, cluster_2):
    """ Return the mean pairwise similarity between the items of two clusters.

        Every (item of cluster_1, item of cluster_2) pair is scored with
        `w2v.similarity`; pairs for which that lookup raises KeyError are left
        out of the average. Returns 0 when no pair could be scored.
    """
    total = 0
    counted = 0
    for first in cluster_1:
        for second in cluster_2:
            try:
                pair_score = w2v.similarity(first, second)
            except KeyError:  # at least one term is not in the vocabulary
                continue
            total += pair_score
            counted += 1
    if counted == 0:
        return 0
    return total / counted

In [12]:
# Rebuild the per-node children graphs from `G_improved`, i.e. after the smaller clusters were detached
GC_detached = create_children_clusters(G_improved)

  # Remove the CWD from sys.path while we load stuff.
  if np.issubdtype(vec.dtype, np.int):


In [None]:
# Cluster the children of every node once, up front. The clustering does not
# depend on which removed cluster is being re-attached, so repeating it inside
# the loop below would redo the same work for every removed cluster.
detached_clusters = []
for node, graph in GC_detached.items():
    gc = chinese_whispers(graph, weighting='top', iterations=60)
    for cluster_new in aggregate_clusters(gc).values():
        detached_clusters.append((node, cluster_new))

# Attach every removed cluster to the node owning the most similar cluster
for cluster in removed_clusters:
    max_score = 0
    # '' acts as a placeholder parent when no candidate scores above 0
    max_score_node = ''
    for node, cluster_new in detached_clusters:
        score = calculate_cluster_similarity(cluster, cluster_new)
        if score > max_score:
            max_score = score
            max_score_node = node
    for item in cluster:
        G_improved.add_edge(max_score_node, item)

### Tuning the nodes and the edges

In [20]:
# Drop the empty-string node, which the re-attachment step uses as parent
# whenever no cluster scored above 0
if G_improved.has_node(''):
    G_improved.remove_node('')

In [21]:
# Every node that currently is the source of at least one edge
hypernyms = {parent for parent, _ in G_improved.edges()}
isolated_nodes = list(nx.isolates(G_improved))

for isolated_node in isolated_nodes:
    terms = isolated_node.split('_')
    head, tail = terms[0], terms[-1]
    # Prefer the last term of the compound as parent, then the first one
    if tail in hypernyms:
        parent = tail
    elif head in hypernyms:
        parent = head
    else:
        parent = None

    if parent is None:
        # No known hypernym matches: drop the node from the taxonomy
        G_improved.remove_node(isolated_node)
    else:
        G_improved.add_edge(parent, isolated_node)

## Results visualization

### Clusters

In [None]:
def visualize_clusters(graph):
    """ Clusterize the nodes of a particular domain in a given graph and draw the result.

        Args:
            graph: the graph handed to `chinese_whispers`; the graph returned by
                   that call is the one that gets drawn.

        Each node is coloured by the reciprocal of its 'label' attribute, mapped
        through the 'jet' colormap, on a 20x20 inch figure.
    """
    graph_cluster = chinese_whispers(graph, weighting='top', iterations=60)

    # Visualize the clustering of graph_cluster using NetworkX (requires matplotlib).
    # `Graph.nodes[...]` is used because the older `Graph.node` attribute was
    # removed from NetworkX.
    colors = [1. / graph_cluster.nodes[node]['label'] for node in graph_cluster.nodes()]
    fig = plt.gcf()
    fig.set_size_inches(20, 20)
    nx.draw_networkx(graph_cluster, cmap=plt.get_cmap('jet'), node_color=colors, font_color='black')
    plt.show()

In [None]:
# Per-node children graphs of the final taxonomy `G_improved`
GC_improved = create_children_clusters(G_improved)

In [None]:
# Node whose children graphs are visualized; used as key into GC, GC_detached and GC_improved
domain = 'physics'

In [None]:
# Clusters among the children of `domain` in the original taxonomy
visualize_clusters(GC[domain])

In [None]:
# Clusters among the children of `domain` after detaching the smaller clusters
visualize_clusters(GC_detached[domain])

In [None]:
# Clusters among the children of `domain` after detaching and re-attaching the clusters
visualize_clusters(GC_improved[domain])

### Taxonomy

In [None]:
# View the original taxonomy (the graph built from the input file)
display_taxonomy(G)

In [None]:
# View the modified taxonomy (the copy that the cluster removal / re-attachment steps edited)
display_taxonomy(G_improved)

## Save the result

In [22]:
# One row per edge of the improved taxonomy; edges are (hypernym, hyponym) pairs
edge_rows = list(G_improved.edges())
df_improved = pd.DataFrame(edge_rows, columns=['hypernym', 'hyponym'])
# Reverse the column order so that 'hyponym' comes first
df_improved = df_improved[df_improved.columns.tolist()[::-1]]

# Replace the underscores with blanks
for column in ('hyponym', 'hypernym'):
    df_improved[column] = df_improved[column].apply(lambda term: term.replace('_', ' '))

In [23]:
# Derive the output name from the input file: same base name and extension,
# with '-semantic-removal' inserted before the extension
file_path = os.path.splitext(input_path)
stem, extension = file_path
base_name = stem.split('/')[-1]
output_path = '../taxi_output/distributional_semantics/' + base_name + '-semantic-removal' + extension
# Tab-separated, no header row; the dataframe index is written as the first column
df_improved.to_csv(output_path, sep='\t', header=False)