## Normalization of Russian Region Names

☑ Basic class

☑ Read etalon (reference) names from a YAML file

☐ Test threshold values

☐ Add additional data for every normalized name


In [20]:
import os
import yaml
from fuzzywuzzy import fuzz
import pandas as pd
import re
from nltk.stem.snowball import SnowballStemmer

In [21]:
# Switch the working directory so that the relative data file name used
# later in the notebook ('regions_etalon_v2.0.yaml') resolves.
# NOTE(review): hard-coded, machine-specific absolute path — adjust before
# running this notebook on another machine.
os.chdir('d:/coding/0_be_precise/reg_normalizer/data/interim')

In [24]:
def read_yaml(file_path):
    """Parse the UTF-8 encoded YAML file at *file_path* and return its content.

    The file is parsed with ``yaml.safe_load``; whatever object that call
    produces is returned unchanged.
    """
    with open(file_path, mode='r', encoding='utf-8') as stream:
        return yaml.safe_load(stream)

In [25]:
# Load the etalon (reference) data from the YAML file in the current working
# directory. NOTE(review): the structure of the file's content is not visible
# here — confirm it is an iterable of region names before passing it on.
etalon_regions = read_yaml('regions_etalon_v2.0.yaml')

In [4]:
# Homoglyph substitution dictionary (Latin to Cyrillic)
# NOTE(review): the targets for 'I'/'J' (and 'i'/'j') do not look like letters
# of the modern Russian alphabet, and Latin 'S' is mapped to the same letter
# as Latin 'C' — confirm these entries are intended.
LATIN_TO_CYRILLIC = {
    'A': 'А', 'B': 'В', 'C': 'С', 'E': 'Е', 'H': 'Н',
    'I': 'І', 'J': 'Ј', 'K': 'К', 'M': 'М', 'O': 'О',
    'P': 'Р', 'S': 'С', 'T': 'Т', 'X': 'Х', 'Y': 'У',
    'a': 'а', 'b': 'в', 'c': 'с', 'e': 'е', 'i': 'і',
    'j': 'ј', 'k': 'к', 'm': 'м', 'o': 'о', 'p': 'р',
    's': 'с', 't': 'т', 'x': 'х', 'y': 'у'
}

# Every key and value above is a single character and no value is itself a
# key, so one str.translate pass is equivalent to replacing each pair in turn.
_HOMOGLYPH_TABLE = str.maketrans(LATIN_TO_CYRILLIC)

# Compiled once at import time instead of being looked up on every call.
_DASH_RUN = re.compile(r'[-–—]+')
_WHITESPACE_RUN = re.compile(r'\s+')

def preprocess_name(name: str) -> str:
    """Normalize and clean a region name for comparison.

    Steps, in order:
      1. Latin characters listed in ``LATIN_TO_CYRILLIC`` are replaced by
         their Cyrillic counterparts.
      2. Runs of hyphens, en dashes and em dashes become a single space.
      3. Runs of whitespace collapse to one space, the result is stripped
         and lower-cased.

    Args:
        name: Raw region name. Any non-string value (``None``, numbers, ...)
            is accepted and treated as missing.

    Returns:
        The cleaned, lower-cased name, or ``''`` when *name* is not a string.
    """
    if not isinstance(name, str):
        return ''

    name = name.translate(_HOMOGLYPH_TABLE)
    name = _DASH_RUN.sub(' ', name)
    return _WHITESPACE_RUN.sub(' ', name).strip().lower()

# Shared stemmer instance, created on first use so that building it is not
# repeated for every name that gets stemmed.
_RUSSIAN_STEMMER = None

def stem_region_name(name: str) -> str:
    """Stem every whitespace-separated word of *name* with the Russian Snowball stemmer.

    Args:
        name: Region name to stem; callers in this file pass the output of
            ``preprocess_name``.

    Returns:
        The stemmed words joined by single spaces, or ``''`` when *name* is
        empty or otherwise falsy.
    """
    global _RUSSIAN_STEMMER
    if not name:
        return ''
    if _RUSSIAN_STEMMER is None:
        _RUSSIAN_STEMMER = SnowballStemmer('russian')
    return ' '.join(_RUSSIAN_STEMMER.stem(word) for word in name.split())

class RegionMatcher:
    """Fuzzy-match free-form region names against a list of etalon (reference) names.

    Each candidate is scored twice — on the preprocessed text and on its
    stemmed form — using a weighted mix of ``fuzz.ratio`` and
    ``fuzz.token_set_ratio``; the two scores are then blended with
    ``approach_weights``.
    """

    def __init__(self, etalon_regions):
        """Store the etalon names and precompute their comparison forms.

        Args:
            etalon_regions: Iterable of reference region names.
        """
        self.etalon = list(etalon_regions)
        # Precompute both preprocessed and stemmed versions; each entry is
        # (original name, preprocessed name, stemmed name).
        self.preprocessed_etalon = []
        for region in self.etalon:
            cleaned = preprocess_name(region)  # computed once, reused for stemming
            self.preprocessed_etalon.append(
                (region, cleaned, stem_region_name(cleaned))
            )

    def find_best_match(self, input_name,
                       weights=None,
                       approach_weights=None,
                       threshold=70):
        """Find the etalon name that best matches *input_name*.

        Args:
            input_name: Raw name to match. Non-string or blank values never
                match anything.
            weights: Dict with keys ``'levenshtein'`` and ``'token_set'``
                giving the weight of ``fuzz.ratio`` and
                ``fuzz.token_set_ratio``. Defaults to 0.5 each.
            approach_weights: Dict with keys ``'original'`` and ``'stemmed'``
                giving the weight of the non-stemmed and stemmed scores.
                Defaults to 0.5 each.
            threshold: Minimum combined score required to accept a match.

        Returns:
            The best-scoring etalon name (the first one on ties), or ``None``
            when the input is blank or no score reaches *threshold*.

        Raises:
            KeyError: If a supplied weights dict lacks one of its keys.
        """
        # Set default weights if not provided
        weights = weights or {'levenshtein': 0.5, 'token_set': 0.5}
        approach_weights = approach_weights or {'original': 0.5, 'stemmed': 0.5}

        # Preprocess input in both versions
        preprocessed_input = preprocess_name(input_name)
        if not preprocessed_input:
            # Nothing to compare: skip the scan, and never let a blank input
            # "match" an etalon entry that itself preprocesses to ''.
            return None
        stemmed_input = stem_region_name(preprocessed_input)

        # Loop-invariant lookups, hoisted out of the candidate loop.
        w_lev = weights['levenshtein']
        w_ts = weights['token_set']
        w_original = approach_weights['original']
        w_stemmed = approach_weights['stemmed']

        best_match = None
        best_score = 0

        for etalon_name, etalon_preprocessed, etalon_stemmed in self.preprocessed_etalon:
            # Score on the preprocessed (non-stemmed) text
            lev_original = fuzz.ratio(preprocessed_input, etalon_preprocessed)
            ts_original = fuzz.token_set_ratio(preprocessed_input, etalon_preprocessed)
            original_score = w_lev * lev_original + w_ts * ts_original

            # Score on the stemmed text
            lev_stemmed = fuzz.ratio(stemmed_input, etalon_stemmed)
            ts_stemmed = fuzz.token_set_ratio(stemmed_input, etalon_stemmed)
            stemmed_score = w_lev * lev_stemmed + w_ts * ts_stemmed

            total_score = w_original * original_score + w_stemmed * stemmed_score

            # Strict '>' keeps the earliest etalon entry when scores tie.
            if total_score > best_score:
                best_score = total_score
                best_match = etalon_name

        return best_match if best_score >= threshold else None

    def match_dataframe(self, df, column_name, **kwargs):
        """Apply matching to an entire DataFrame column.

        Args:
            df: DataFrame to process. It is modified in place: a
                ``'matched_region'`` column is added (or overwritten).
            column_name: Name of the column holding the raw region names.
            **kwargs: Forwarded to ``find_best_match`` (``weights``,
                ``approach_weights``, ``threshold``).

        Returns:
            The same *df* object, with the ``'matched_region'`` column set.
        """
        df['matched_region'] = df[column_name].apply(
            lambda x: self.find_best_match(x, **kwargs)
        )
        return df

In [9]:
# Initialize with etalon regions
# NOTE: this hard-coded demo list replaces the value of `etalon_regions`
# that was loaded from the YAML file earlier in the notebook.
etalon_regions = [
    'Московская область',
    'Свердловская область',
    'Санкт-Петербург',
    'Республика Татарстан',
    'Алтайский край',
    'Республика Алтай',
    'Республика Татарстан'  # NOTE(review): duplicate of the entry above
]

matcher = RegionMatcher(etalon_regions)

# Sample DataFrame with messy data
data = pd.DataFrame({
    'region': [
        'московск Обл',        # Shortened form
        'свердловск',          # Without 'область'
        'петербург',           # Shortened
        'Mосковская област',   # Latin 'M' + typo
        'татарстан респ.',     # Abbreviation
        'Свердлов обл',         # Different ending
        'aлтайский к',         # Truncated second word
        'Республика     Алтай' # Repeated spaces between the words
    ]
})

In [13]:
# Match with emphasis on stemmed approach
# `weights` balances fuzz.ratio ('levenshtein') against fuzz.token_set_ratio
# ('token_set'); `approach_weights` balances the non-stemmed score against
# the stemmed one. match_dataframe adds a 'matched_region' column to `data`
# and returns that same DataFrame.
result = matcher.match_dataframe(
    data,
    'region',
    weights={'levenshtein': 0.4, 'token_set': 0.6},
    approach_weights={'original': 0.3, 'stemmed': 0.7},
    threshold=70
)

print(result)

                 region        matched_region
0          московск Обл    Московская область
1            свердловск  Свердловская область
2             петербург       Санкт-Петербург
3     Mосковская област    Московская область
4       татарстан респ.  Республика Татарстан
5          Свердлов обл  Свердловская область
6           aлтайский к        Алтайский край
7  Республика     Алтай      Республика Алтай
