In [1]:
import sys, os, subprocess, re
from functools import reduce

# Folder and file name of the sample data (not referenced by the cells shown below).
containing_folder, filename = "./SAMPLE_FILES/", "USES.txt"

# Flag handed to the `wn` command line tool as "-<flag>" by Sense.check_word.
# "hypen" selects the hypernym parsing in Sense; the alternative is "synsn".
wordnet_strategy = "hypen"

In [12]:
class Sense:
    """One sense of a word, parsed from a chunk of `wn` command line output.

    Two parsing strategies are supported, selected by ``wn_strategy``:

    * ``"synsn"`` -- synonym mode. Sets ``self.synonyms`` (a set of strings,
      e.g. ``{'jump', 'skip', 'run'}``) and ``self.categories`` (a list of the
      stripped text segments that follow each ``=>``).
    * any value starting with ``"hype"`` -- hypernym mode. Sets
      ``self.category`` (first name on the header line) and ``self.layers``::

          self.layers = [
              [..., ['whole', 'unit'], ['object', 'physical object'], ['entity']],
              [..., [...], [...]],
          ]

      i.e. a list of hypernym chains, each chain being a list of value sets
      (lists of names taken from one ``=>`` line).
    """

    def __init__(self, sense_string, wn_strategy="synsn"):
        """Parse ``sense_string`` according to ``wn_strategy``.

        Args:
            sense_string: the text belonging to a single sense.
            wn_strategy: ``"synsn"`` or a string starting with ``"hype"``.

        Raises:
            ValueError: if the strategy is unknown, or (hypernym mode) if
                ``sense_string`` contains no non-blank line.
        """
        self.wn_strategy = wn_strategy
        # The branches must be mutually exclusive: with two independent `if`s
        # the synonym strategy always fell through to the error branch.
        if wn_strategy == "synsn":
            self._parse_synonyms(sense_string)
        elif wn_strategy.startswith("hype"):
            self._parse_hypernyms(sense_string)
        else:
            raise ValueError("Unknown Wordnet Strategy:", wn_strategy)

    def _parse_synonyms(self, sense_string):
        """Fill ``synonyms`` and ``categories`` from ``'a, b, c => cat => ...'`` text."""
        head, *tail = sense_string.split("=>")
        self.categories = [segment.strip() for segment in tail]
        # Strip every synonym individually so that 'skip' (not ' skip') is
        # stored and membership tests behave as expected; drop empty entries.
        self.synonyms = {name.strip() for name in head.split(",") if name.strip()}

    def _parse_hypernyms(self, sense_string):
        """Fill ``category`` and ``layers`` from an indented ``=>`` hypernym listing."""
        lines = [line for line in sense_string.split("\n") if line.strip() != ""]
        if not lines:
            raise ValueError("Cannot parse a sense from an empty string")

        # The first non-blank line is the header naming the sense. It is taken
        # off explicitly: relying on its length exceeding the indentation of
        # the next line silently swallowed the first chain of short words.
        header, body = lines[0], lines[1:]
        self.category = Sense._split_values(header)[0]

        self.layers = []
        current_layer = []
        previous_nesting = None
        for line in body:
            nesting = Sense.GetNestingSize(line)
            # A drop in indentation means a new chain of hypernyms starts here.
            if previous_nesting is not None and previous_nesting > nesting:
                self.layers.append(current_layer)
                current_layer = []
            current_layer.append(Sense._split_values(line))
            previous_nesting = nesting
        if current_layer:
            self.layers.append(current_layer)

    @staticmethod
    def _split_values(line):
        """Return the comma separated names on ``line`` without the ``=>`` marker."""
        return re.split(",[ ]*", line.strip(" =>"))

    @staticmethod
    def GetNestingSize(line):
        """Return the number of characters that precede the first ``' =>'`` in ``line``.

        For a line without ``' =>'`` this is the length of the whole line.
        """
        return len(line.split(" =>")[0])

    @staticmethod
    def list_distance(a, b, return_match=False):
        """Return the smallest distance between any sense in ``a`` and any in ``b``.

        Args:
            a, b: iterables of Sense objects.
            return_match: when true, return the ``(distance, value_set)`` tuple
                of the closest pair instead of the bare distance.

        Returns:
            The minimum over all pairs, or ``None`` when no pair yields a
            distance (empty input, or no pair shares a value set).
        """
        distances = []
        for a_sense in a:
            for b_sense in b:
                found = a_sense.distance(b_sense, return_match)
                # distance() reports "no result" as None; such pairs cannot
                # be compared and are skipped.
                if found is not None:
                    distances.append(found)

        if not distances:
            return None
        if return_match:
            return min(distances, key=lambda match: match[0])
        return min(distances)

    def distance(self, other, return_match=False):
        """Return the distance to ``other`` via the closest shared value set.

        The distance of a shared value set is the sum of its positions inside
        the two chains it was found in; the smallest such sum wins, and on a
        tie the first one encountered is kept.

        Args:
            other: another hypernym-mode Sense.
            return_match: when true, return ``(distance, value_set)``.

        Returns:
            An int (or tuple, see ``return_match``); ``None`` if this sense is
            not in hypernym mode or the two senses share no value set.
        """
        if not self.wn_strategy.startswith("hype"):
            return None

        best = None
        for own_layer in self.layers:
            for i, own_values in enumerate(own_layer):
                for other_layer in other.layers:
                    for j, other_values in enumerate(other_layer):
                        if other_values != own_values:
                            continue
                        # Strict comparison keeps the first minimal match.
                        if best is None or i + j < best[0]:
                            best = (i + j, other_values)

        if best is None:
            return None
        return best if return_match else best[0]

    def __repr__(self):
        """Return a short, human readable description of the sense."""
        if self.wn_strategy == "synsn":
            return "Sense with type {} and synonyms {}".format(self.categories, self.synonyms)
        if self.wn_strategy.startswith("hype"):
            first_layer = self.layers[0] if self.layers else []
            heads = [values[0] for values in first_layer]
            # Abbreviate long chains to "first three, ..., last two"; chains too
            # short for that are shown in full instead of raising IndexError.
            if len(heads) >= 5:
                heads = heads[:3] + ["..."] + heads[-2:]
            return "Sense of {} with layer [{}]".format(self.category, ", ".join(heads))
        # __repr__ must return a string; None would make repr() raise TypeError.
        return "Sense with unknown strategy {!r}".format(self.wn_strategy)

    def __str__(self):
        return self.__repr__()

    def __contains__(self, other):
        """Return True if ``other`` is a synonym of this sense (synonym mode only)."""
        if self.wn_strategy == "synsn":
            return other in self.synonyms
        return False

    def __iter__(self):
        """Iterate over the synonyms; only synonym-mode senses define them."""
        return iter(list(self.synonyms))

    @staticmethod
    def check_word(word, strategy=None):
        """Run ``wn <word> -<strategy>`` and parse each reported sense.

        Args:
            word: the word to look up.
            strategy: flag passed to ``wn`` and to ``Sense``; defaults to the
                notebook-level ``wordnet_strategy``.

        Returns:
            A list of Sense objects, one per ``Sense <n>`` section in the output.

        Raises:
            FileNotFoundError: if the ``wn`` executable cannot be found.
        """
        if strategy is None:
            strategy = wordnet_strategy

        # An argument list without a shell keeps quotes, spaces or shell
        # metacharacters in `word` from being interpreted by a shell.
        # NOTE(review): the exit status of `wn` is not inspected, as before --
        # confirm what it signals before adding check=True.
        proc = subprocess.run(["wn", word, "-" + strategy], capture_output=True)
        output = proc.stdout.decode()

        # "[0-9]+" so that two-digit sense numbers do not leave a stray digit
        # at the start of the sense text. Everything before the first marker
        # is preamble and is dropped.
        sense_strings = re.split("Sense [0-9]+", output)[1:]
        print("Found", len(sense_strings), "Senses for", word)

        return [Sense(string, strategy) for string in sense_strings]

In [24]:
word_a = "elephant"
word_b = "computer"

# Get senses for each word
a = Sense.check_word(word_a)
b = Sense.check_word(word_b)
print(a)
print(b)

# Get Distance
# One pass over all sense pairs is enough: with return_match=True the result is
# a tuple (distance, matching phrase), and its first element is the same int
# that a second call without return_match would compute again.
detailed_distance = Sense.list_distance(a, b, return_match=True)
distance = detailed_distance[0]

print("\n==> Smallest Distance between {}, {}: {}".format(word_a, word_b, distance))
print("\tmatched on:", detailed_distance[1])


Found 2 Senses for elephant
Found 2 Senses for computer
[Sense of elephant with layer [proboscidean, placental, mammal, ..., physical entity, entity], Sense of elephant with layer [emblem, symbol, representational process, ..., abstraction, entity]]
[Sense of computer with layer [machine, device, instrumentality, ..., physical entity, entity], Sense of calculator with layer [expert, person, organism, ..., physical entity, entity]]

==> Smallest Distance between elephant, computer: 8
	matched on: ['organism', 'being']


### Hey-o Wait up
Notice that the second sense of *computer* is the old-school definition of a computer as a *person* who does calculations. I doubt any of our participants had this sense of a computer in their mind at any point. If that is the case, we can drop this sense and use only the machine sense of *computer*.

In [22]:
# Remove the human-computer sense (index 1 of the list printed for word_b above).
# NOTE: pop() mutates `b` in place, so re-running this cell removes another sense;
# the popped Sense is displayed as the cell output.
b.pop(1)

Sense of calculator with layer [expert, person, organism, ..., physical entity, entity]

In [23]:
# Recompute the closest pair now that `b` has been edited; the result is a
# (distance, matching phrase) tuple that is unpacked straight into the message.
detailed_distance = Sense.list_distance(a, b, return_match=True)
print(
    "==> Smallest Distance between '{}' and '{}': {}\n\tmatched on: {}".format(
        word_a, word_b, *detailed_distance
    )
)

==> Smallest Distance between 'elephant' and 'computer': 12
	matched on: ['whole', 'unit']


# Old Stuff
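This section uses a synonym strategy for finding distance. It was getting annoyingly complicated, so I switched over to hypernyms. Note that the cell below expects the `synsn` strategy: under the hypernym strategy configured at the top, `Sense` objects have no `synonyms` attribute, hence the `AttributeError` shown in its output.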

In [5]:
# Word pair for the synonym search: start word and the word to reach.
base, target = "computer", "processor"

def find_distance(base, target):
    """Count synonym-expansion steps until ``base`` and ``target`` meet.

    Both words are expanded into their synonym sets, and the two sets are then
    grown in alternation, one level per step, until they touch.

    Requires senses that carry a ``synonyms`` attribute (the "synsn"
    strategy); with hypernym senses the attribute lookup raises AttributeError.

    Args:
        base: the word to start from.
        target: the word to reach.

    Returns:
        The step counter at the moment an overlap is found, or ``None`` when a
        full round adds no new synonym to either side (search exhausted).
    """
    def to_synonyms(senses):
        # set().union(*...) works for zero, one or many senses. The previous
        # reduce() only worked for exactly two, because its accumulator turned
        # into a plain set that has no `.synonyms`.
        return set().union(*(sense.synonyms for sense in senses))

    print("Checking", base, target)
    # Look the target up once and reuse the result for printing and expansion.
    target_senses = Sense.check_word(target)
    print(target_senses)

    base_syn = to_synonyms(Sense.check_word(base))
    target_syn = to_synonyms(target_senses)
    inc = 0

    while True:
        print("Checking", base, target, "in\n\t", base_syn, "\n\t", target_syn)
        temp_target_syn = set()
        temp_base_syn = set()

        inc += 1
        # Step forward one level in base word
        for word in base_syn:
            temp = to_synonyms(Sense.check_word(word))
            # In-place update: set.union() returns a new set and its result
            # was previously thrown away.
            temp_base_syn |= temp
            if temp & target_syn or target in temp:
                return inc

        # Step forward one level in target word
        for word in target_syn:
            temp = to_synonyms(Sense.check_word(word))
            temp_target_syn |= temp
            if temp & base_syn or base in temp:
                return inc

        inc += 1
        # Check if new steps caused overlap
        if temp_target_syn & temp_base_syn:
            return inc

        # Nothing new on either side means further rounds would repeat this
        # one forever; report that no connection exists.
        if temp_base_syn <= base_syn and temp_target_syn <= target_syn:
            return None

        base_syn |= temp_base_syn
        target_syn |= temp_target_syn


# Run the synonym search for the word pair defined at the top of this cell; as the
# last expression of the cell, its return value is shown as the cell output.
find_distance(base, target)


Checking computer processor
Found 3 Senses for processor
[Sense of processor with layer [business, enterprise, organization, ..., abstraction, entity], Sense of processor with layer [worker, person, organism, ..., physical entity, entity], Sense of central processing unit with layer [electronic equipment, equipment, instrumentality, ..., physical entity, entity]]
Found 2 Senses for computer


AttributeError: 'Sense' object has no attribute 'synonyms'