In [None]:
class TextSimilarity:
    """
    Compare two search documents via binary word vectors over a corpus dictionary.

    The dictionary is the set of unique (lowercased, alphanumeric) words found in
    the corpus files. Each search document is turned into a vector of 1s and 0s
    with one entry per dictionary word; the distance measures are computed from
    these vectors (and, for Jaccard, from the raw word sets of the search docs).
    """

    def __init__(self, input_files, search_docs):
        """
        Build the dictionary, the search sets and vectors, and compute the measures.

        :param input_files: iterable of paths to the corpus documents
        :param search_docs: sequence of paths; the first two entries are compared
        :raises IndexError: if search_docs holds fewer than two entries

        After construction:
            dict         -> set of unique words of the corpus (create_dictionary)
            name1/name2  -> file names of the two search docs (text after the last '/')
            tuple_search -> tuple with the word sets of the two search docs
            list_vector  -> list with the two 0/1 vectors (create_vector)
            results      -> (dot product, euclidean distance, Jaccard, cosine)
        """
        self.corpus = input_files
        self.dict = self.create_dictionary()
        self.search_docs = search_docs
        self.name1 = self.search_docs[0].split('/')[-1]
        self.name2 = self.search_docs[1].split('/')[-1]

        self.tuple_search = self.get_set_search()
        self.list_vector = self.create_vector()

        self.results = self.dist_measures()

    @staticmethod
    def read_files(doc):
        """
        Read a text document and reduce it to lowercase alphanumeric words.

        Every whitespace character (line breaks, tabs, ...) becomes a single space
        so that neighbouring words are never fused; alphabetic and numeric
        characters are kept in lowercase; everything else is dropped.

        :param doc: path to a UTF-8 encoded text document
        :return: string containing alphanumeric words separated by spaces
        """
        with open(doc, 'r', encoding='utf-8') as fl:
            string_doc = fl.read()

        # join() avoids the quadratic cost of growing a string character by character
        return "".join(
            ' ' if char.isspace() else char.lower()
            for char in string_doc
            if char.isspace() or char.isalpha() or char.isnumeric()
        )

    def create_dictionary(self):
        """
        Collect the unique words of all corpus documents.

        Each document is split into words on its own, so the last word of one
        file can never merge with the first word of the next file.

        :return: set of unique words found in the documents listed in 'corpus'
        """
        words = set()
        for path in self.corpus:
            words.update(self.read_files(path).split())

        return words

    def get_set_search(self):
        """
        Read the first two search documents and split them into unique words.

        :return: tuple of two sets (one per search doc)
        """
        first, second = self.search_docs[0], self.search_docs[1]

        return set(self.read_files(first).split()), set(self.read_files(second).split())

    def create_vector(self):
        """
        Build one 0/1 vector per search document over the dictionary words.

        An entry is 1 if the dictionary word occurs in the search document's set
        and 0 otherwise, so every vector has as many elements as the dictionary.
        Both vectors iterate the same 'dict' set, so their positions line up.

        :return: list of NumPy arrays, one per set in 'tuple_search'
        """
        return [
            np.array([1 if word in doc_words else 0 for word in self.dict])
            for doc_words in self.tuple_search
        ]

    def dist_measures(self):
        """
        Calculate the distance measures for the two search documents.

            1. dot product of the two vectors
            2. euclidean distance between the two vectors (rounded to 4 digits)
            3. Jaccard similarity of the two search sets (rounded to 4 digits);
               note that this uses the raw word sets, not the dictionary vectors
            4. cosine similarity of the two vectors (rounded to 4 digits)

        Jaccard is reported as 0.0 when both search sets are empty and cosine as
        0.0 when either vector has length zero, instead of dividing by zero.

        :return: tuple (dot, ed, js, cs)
        """
        vec1, vec2 = self.list_vector[0], self.list_vector[1]
        set1, set2 = self.tuple_search[0], self.tuple_search[1]

        # dotproduct
        dot = np.dot(vec1, vec2)

        # euclidean distance, rounded to 4 digits
        diff = np.subtract(vec1, vec2)
        ed = round(np.dot(diff, diff) ** 0.5, 4)

        # Jaccard Similarity, rounded to 4 digits
        intersection = set1.intersection(set2)
        union = len(set1) + len(set2) - len(intersection)
        js = round(len(intersection) / union, 4) if union else 0.0

        # Cosine Similarity, rounded to 4 digits
        len_v1 = np.dot(vec1, vec1) ** 0.5
        len_v2 = np.dot(vec2, vec2) ** 0.5
        norm_product = len_v1 * len_v2
        cs = round(dot / norm_product, 4) if norm_product else 0.0

        return dot, ed, js, cs

In [None]:
# corpus documents used to build the dictionary
files = [
    'Supplementary Files/text1q3.txt',
    'Supplementary Files/text2q3.txt',
    'Supplementary Files/text3q3.txt',
]
# each search list pairs the search document with one document to compare it to
search_list1 = ['Supplementary Files/search_doc.txt', 'Supplementary Files/text1q3.txt']
search_list2 = ['Supplementary Files/search_doc.txt', 'Supplementary Files/text2q3.txt']
search_list3 = ['Supplementary Files/search_doc.txt', 'Supplementary Files/text3q3.txt']
# extra pair of test documents; not part of the comparisons built below
search_list4 = ['Supplementary Files/test', 'Supplementary Files/test2']
# one TextSimilarity instance per search list, collected in 'comparisons'
comparisons = [
    TextSimilarity(files, pair)
    for pair in (search_list1, search_list2, search_list3)
]

In [None]:
def compare_results(measures):
    """
    Rank the comparisons per distance measure and compute an aggregate score.

    For each of the four measures stored in 'results' (dot product, euclidean
    distance, Jaccard similarity, cosine similarity) a list of (value, name2)
    tuples is built and sorted best first: highest first, except for the
    euclidean distance where the lowest value is best. Each list is labelled
    by storing it in a single-key dictionary.

    The scoring adds up, for every comparison, its rank in each of the four
    lists (0 = best, so a lower score is better). The rank is the position of
    the first entry with the same value, so comparisons that tie on a measure
    receive the same rank regardless of their file names.

    :param measures: iterable of objects exposing 'results' (4 values) and 'name2'
    :return: tuple (dots_dict, eds_dict, jss_dict, css_dict, scoring) where the
        first four map a label to its sorted list and scoring maps name2 -> score
    """
    # (label, position in 'results', True if a higher value is better)
    specs = (('dots', 0, True),
             ('eds', 1, False),
             ('jss', 2, True),
             ('css', 3, True))

    labelled = []
    value_orders = []
    for label, pos, higher_is_better in specs:
        pairs = sorted(((m.results[pos], m.name2) for m in measures),
                       reverse=higher_is_better)
        labelled.append({label: pairs})
        # values only, in ranked order: index() then yields a tie-aware rank
        value_orders.append((pos, [value for value, _ in pairs]))

    scoring = {}
    for m in measures:
        scoring[m.name2] = sum(values.index(m.results[pos])
                               for pos, values in value_orders)

    return tuple(labelled) + (scoring,)

# Print every element of the tuple returned by compare_results on its own line:
# first the four labelled, sorted measure lists, then the scoring dictionary.
for line in compare_results(comparisons):
    print(line)