# Функції для реалізації

In [10]:
from difflib import SequenceMatcher
import re

class FuzzyMatcher:
    """Fuzzy string matcher based on ``difflib.SequenceMatcher`` ratios.

    Configuration (applied consistently by every comparison method):

    * ``threshold`` -- minimum similarity (0.0..1.0) for two words to be
      considered a match.
    * ``accurate_digits`` -- when true, two words that both contain digits
      but whose digit sequences differ get a similarity of ``0.0``
      (e.g. ``"E505"`` vs ``"E504"``).
    * ``case_sensitive`` -- when false, words are lower-cased before being
      compared.
    """

    # Compiled once; shared by tokenize() and find_matches().
    _TOKEN_RE = re.compile(r'\b\w+\b')

    def __init__(self, threshold=0.8, accurate_digits=True, case_sensitive=False):
        self.threshold = threshold
        self.accurate_digits = accurate_digits
        self.case_sensitive = case_sensitive

    # The explicit accessors below are kept for backward compatibility with
    # existing callers; the attributes may also be read/written directly.
    def set_threshold(self, threshold):
        """Set the minimum similarity required for a match."""
        self.threshold = threshold

    def get_threshold(self):
        """Return the minimum similarity required for a match."""
        return self.threshold

    def set_accurate_digits(self, accurate_digits):
        """Enable or disable the strict digit comparison."""
        self.accurate_digits = accurate_digits

    def get_accurate_digits(self):
        """Return whether the strict digit comparison is enabled."""
        return self.accurate_digits

    def match(self, word1, word2):
        """Return True if the similarity of the two words reaches the threshold."""
        return self.get_similarity(word1, word2) >= self.threshold

    def get_similarity(self, word1, word2):
        """Return the similarity of two words as a float in [0.0, 1.0].

        Honors both ``case_sensitive`` and ``accurate_digits``.
        """
        if not self.case_sensitive:
            word1, word2 = word1.lower(), word2.lower()

        if self.accurate_digits:
            digits1 = ''.join(filter(str.isdigit, word1))
            digits2 = ''.join(filter(str.isdigit, word2))
            # Only reject when BOTH words carry digits and those digits differ;
            # a word without digits is compared purely by its ratio.
            if digits1 and digits2 and digits1 != digits2:
                return 0.0

        return SequenceMatcher(None, word1, word2).ratio()

    def find_best_match(self, word, word_list):
        """Return the entry of ``word_list`` most similar to ``word``.

        The entry is returned exactly as it appears in ``word_list`` (it is
        not lower-cased, even in case-insensitive mode). On ties the first
        entry wins. Returns ``None`` when no entry reaches the threshold.
        """
        best_match = None
        highest_similarity = 0
        for candidate in word_list:
            # Go through get_similarity so accurate_digits / case_sensitive
            # apply here exactly as they do in match().
            similarity = self.get_similarity(word, candidate)
            if similarity > highest_similarity:
                highest_similarity = similarity
                best_match = candidate

        if highest_similarity >= self.threshold:
            return best_match
        return None

    def find_matches(self, word_list, text, indexes=False):
        """Find the best ``word_list`` match for every token of ``text``.

        Tokens without a match are skipped. With ``indexes=False`` the result
        is a list of matched entries; with ``indexes=True`` it is a list of
        ``[start_offset, matched_entry]`` pairs, where ``start_offset`` is the
        position of that particular token occurrence in ``text``.
        """
        matches = []
        for token in self._TOKEN_RE.finditer(text):
            best_match = self.find_best_match(token.group(), word_list)
            if best_match is None:
                continue
            if indexes:
                # token.start() is the offset of this occurrence, unlike
                # text.index(), which always reports the first substring hit.
                matches.append([token.start(), best_match])
            else:
                matches.append(best_match)
        return matches

    def tokenize(self, text):
        """Split ``text`` into a list of word tokens (runs of ``\\w`` characters)."""
        return self._TOKEN_RE.findall(text)

def similarity_score(word1, word2):
    """Return the ``difflib.SequenceMatcher`` ratio of the two sequences.

    The value is a float between 0.0 (nothing in common) and 1.0 (identical).
    """
    comparer = SequenceMatcher(a=word1, b=word2)
    return comparer.ratio()


In [14]:
# Demo matcher: strict 0.85 threshold, digit-exact and case-sensitive.
matcher = FuzzyMatcher(
    threshold=0.85,
    accurate_digits=True,
    case_sensitive=True,
)

In [15]:
# Show match()/get_similarity() on a near-identical pair of titles and on
# codes that differ only in their digits; one value is printed per line.
print(
    matcher.match("World War I", "World War II"),
    matcher.get_similarity("World War I", "World War II"),
    "-----------------",
    matcher.match("E505", "E505"),
    matcher.get_similarity("E505", "E504"),
    sep="\n",
)

True
0.9565217391304348
-----------------
True
0.0


In [16]:

# Vocabulary the tokens of `text` are matched against.
word_list = ["E505", "E423", "E763", "word"]
# Sample input; the literal's whitespace is part of the string, so it is
# left exactly as written.
text = '''
    hello world E505 E504 
    help heo 
'''
# Bare expression: as the last statement of the notebook cell its value is
# displayed as the cell output.
matcher.find_matches(word_list, text)

['word', 'E505']