## –ò–º–ø–æ—Ä—Ç –±–∏–±–ª–∏–æ—Ç–µ–∫ –∏ –≤—Å–ø–æ–º–æ–≥–∞—Ç–µ–ª—å–Ω—ã–µ —Ñ—É–Ω–∫—Ü–∏–∏

In [1]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import fuzz, process
from rapidfuzz import process as rapid_process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import multiprocessing as mp
from functools import partial

def preprocess_text(text):
    """–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–∞"""
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def extract_keywords(text):
    """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –∫–ª—é—á–µ–≤—ã—Ö —Å–ª–æ–≤ –∏–∑ –æ–ø–∏—Å–∞–Ω–∏—è"""
    text = preprocess_text(text)
    words = text.split()
    stop_words = {'–¥–ª—è', '–∏–∑', '–≤', '–Ω–∞', '—Å', '–ø–æ', '–∏', '–∏–ª–∏', '–Ω–µ', '–æ—Ç', '–¥–æ', '–±–µ–∑', '–ø–æ–¥', '–Ω–∞–¥'}
    keywords = [word for word in words if len(word) > 2 and word not in stop_words]
    return set(keywords)

def jaccard_similarity(set1, set2):
    """–í—ã—á–∏—Å–ª–µ–Ω–∏–µ –∫–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç–∞ –ñ–∞–∫–∫–∞—Ä–∞"""
    if not set1 or not set2:
        return 0
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union > 0 else 0

def find_best_match_fuzzy(text, choices, threshold=80):
    """–ù–∞—Ö–æ–¥–∏—Ç –ª—É—á—à–µ–µ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–µ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º fuzzy matching"""
    matches = process.extract(text, choices, scorer=fuzz.token_set_ratio, limit=5)
    if matches and matches[0][1] >= threshold:
        return matches[0]
    else:
        return (None, 0)



## –§—É–Ω–∫—Ü–∏–∏ —Å–æ–ø–æ—Å—Ç–∞–≤–ª–µ–Ω–∏—è

In [2]:
def match_dataframes_fuzzy(tmc_df, sup_df, threshold=80):
    """–°–æ–ø–æ—Å—Ç–∞–≤–ª—è–µ—Ç –¥–≤–∞ –¥–∞—Ç–∞—Ñ—Ä–µ–π–º–∞ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º fuzzy matching"""
    sup_descriptions = sup_df['FULL_NAME/ru_RU'].tolist()
    
    results = []
    total_records = len(tmc_df)
    
    print("–ó–∞–ø—É—Å–∫ Fuzzy Matching...")
    print(f"–í—Å–µ–≥–æ –∑–∞–ø–∏—Å–µ–π –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏: {total_records:,}")
    print("-" * 50)
    
    for idx in range(total_records):
        row = tmc_df.iloc[idx]
        tmc_desc = row['FULL_NAME/ru_RU']
        
        # –ü—Ä–æ—Å—Ç–æ–π –≤—ã–≤–æ–¥ –ø—Ä–æ–≥—Ä–µ—Å—Å–∞ –∫–∞–∂–¥—ã–µ 100 –∑–∞–ø–∏—Å–µ–π
        if idx % 100 == 0:
            percent = (idx / total_records) * 100
            print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: {idx:,}/{total_records:,} ({percent:.1f}%)")
        
        match_result = find_best_match_fuzzy(tmc_desc, sup_descriptions, threshold)
        best_match, score = match_result
        
        if best_match:
            sup_match = sup_df[sup_df['FULL_NAME/ru_RU'] == best_match]
            if not sup_match.empty:
                sup_row = sup_match.iloc[0]
                results.append({
                    'TMC_CSCD_ID': row['CSCD_ID'],
                    'TMC_Description': tmc_desc,
                    'SUP_CSCD_ID': sup_row['CSCD_ID'],
                    'SUP_Description': best_match,
                    'Match_Score': score
                })
    
    print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: {total_records:,}/{total_records:,} (100.0%)")
    print("-" * 50)
    return pd.DataFrame(results)

def match_with_tfidf(tmc_df, sup_df, top_k=3, similarity_threshold=0.6):
    """–°–æ–ø–æ—Å—Ç–∞–≤–ª–µ–Ω–∏–µ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º TF-IDF –∏ –∫–æ—Å–∏–Ω—É—Å–Ω–æ–≥–æ —Å—Ö–æ–¥—Å—Ç–≤–∞"""
    
    total_records = len(tmc_df)
    print("–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤...")
    
    # –ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤
    tmc_texts = []
    for i in range(total_records):
        desc = tmc_df.iloc[i]['FULL_NAME/ru_RU']
        tmc_texts.append(preprocess_text(desc))
        if i % 100 == 0:
            print(f"TMC: {i:,}/{total_records:,}")
    
    sup_texts = []
    sup_total = len(sup_df)
    for i in range(sup_total):
        desc = sup_df.iloc[i]['FULL_NAME/ru_RU']
        sup_texts.append(preprocess_text(desc))
        if i % 1000 == 0:
            print(f"SUP: {i:,}/{sup_total:,}")
    
    print("–°–æ–∑–¥–∞–Ω–∏–µ TF-IDF –º–∞—Ç—Ä–∏—Ü—ã...")
    vectorizer = TfidfVectorizer(min_df=1, max_df=0.8, ngram_range=(1, 2), max_features=10000)
    all_texts = tmc_texts + sup_texts
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    
    tmc_tfidf = tfidf_matrix[:len(tmc_texts)]
    sup_tfidf = tfidf_matrix[len(tmc_texts):]
    
    print("–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ Nearest Neighbors...")
    nbrs = NearestNeighbors(n_neighbors=min(top_k, len(sup_texts)), metric='cosine')
    nbrs.fit(sup_tfidf)
    
    print("–ü–æ–∏—Å–∫ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–π...")
    distances, indices = nbrs.kneighbors(tmc_tfidf)
    
    results = []
    
    for i in range(len(tmc_texts)):
        dist_list = distances[i]
        idx_list = indices[i]
        
        tmc_desc = tmc_df.iloc[i]['FULL_NAME/ru_RU']
        tmc_id = tmc_df.iloc[i]['CSCD_ID']
        
        if i % 100 == 0:
            percent = (i / total_records) * 100
            print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: {i:,}/{total_records:,} ({percent:.1f}%)")
        
        for j in range(len(dist_list)):
            dist = dist_list[j]
            sup_idx = idx_list[j]
            similarity = 1 - dist
            
            if similarity >= similarity_threshold and sup_idx < len(sup_df):
                sup_row = sup_df.iloc[sup_idx]
                results.append({
                    'TMC_CSCD_ID': tmc_id,
                    'TMC_Description': tmc_desc,
                    'SUP_CSCD_ID': sup_row['CSCD_ID'],
                    'SUP_Description': sup_row['FULL_NAME/ru_RU'],
                    'Similarity_Score': similarity,
                    'Rank': j + 1
                })
    
    print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: {total_records:,}/{total_records:,} (100.0%)")
    return pd.DataFrame(results)

def combined_matching(tmc_df, sup_df, weight_fuzzy=0.6, weight_jaccard=0.4, search_limit=500):
    """–ö–æ–º–±–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω—ã–π –ø–æ–¥—Ö–æ–¥ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –º–µ—Ç—Ä–∏–∫"""
    
    results = []
    total_records = len(tmc_df)
    
    print("–ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö SUP...")
    sup_descriptions = sup_df['FULL_NAME/ru_RU'].tolist()
    
    # –ü—Ä–µ–¥–≤–∞—Ä–∏—Ç–µ–ª—å–Ω–æ –≤—ã—á–∏—Å–ª—è–µ–º –∫–ª—é—á–µ–≤—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è SUP
    sup_keywords = []
    actual_search_limit = min(search_limit, len(sup_df))
    for i in range(actual_search_limit):
        desc = sup_df.iloc[i]['FULL_NAME/ru_RU']
        sup_keywords.append(extract_keywords(desc))
        if i % 100 == 0:
            print(f"–ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ SUP: {i:,}/{actual_search_limit:,}")
    
    print("–û–±—Ä–∞–±–æ—Ç–∫–∞ TMC –∑–∞–ø–∏—Å–µ–π...")
    
    for idx in range(total_records):
        tmc_row = tmc_df.iloc[idx]
        tmc_desc = tmc_row['FULL_NAME/ru_RU']
        tmc_keywords = extract_keywords(tmc_desc)
        
        best_match = None
        best_score = 0
        best_sup_idx = -1
        
        for sup_idx in range(actual_search_limit):
            sup_row = sup_df.iloc[sup_idx]
            sup_desc = sup_row['FULL_NAME/ru_RU']
            
            # Fuzzy matching score
            fuzzy_score = fuzz.token_set_ratio(tmc_desc, sup_desc) / 100
            
            # Jaccard similarity
            jaccard_score = jaccard_similarity(tmc_keywords, sup_keywords[sup_idx])
            
            # –ö–æ–º–±–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω—ã–π score
            combined_score = (weight_fuzzy * fuzzy_score + 
                            weight_jaccard * jaccard_score)
            
            if combined_score > best_score:
                best_score = combined_score
                best_match = sup_row
                best_sup_idx = sup_idx
        
        if idx % 100 == 0:
            percent = (idx / total_records) * 100
            matches_found = len(results)
            print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: {idx:,}/{total_records:,} ({percent:.1f}%), –Ω–∞–π–¥–µ–Ω–æ: {matches_found}")
        
        if best_match is not None and best_score > 0.5:
            results.append({
                'TMC_CSCD_ID': tmc_row['CSCD_ID'],
                'TMC_Description': tmc_desc,
                'SUP_CSCD_ID': best_match['CSCD_ID'],
                'SUP_Description': best_match['FULL_NAME/ru_RU'],
                'Combined_Score': best_score,
                'Fuzzy_Score': fuzz.token_set_ratio(tmc_desc, best_match['FULL_NAME/ru_RU']) / 100,
                'Jaccard_Score': jaccard_similarity(tmc_keywords, sup_keywords[best_sup_idx])
            })
    
    print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: {total_records:,}/{total_records:,} (100.0%)")
    return pd.DataFrame(results)

def parallel_fuzzy_match(args):
    """–ü–∞—Ä–∞–ª–ª–µ–ª—å–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞ —á–∞–Ω–∫–æ–≤ –¥–∞–Ω–Ω—ã—Ö"""
    tmc_chunk, sup_descriptions, threshold, chunk_id, total_chunks = args
    results = []
    
    for tmc_desc in tmc_chunk:
        match = rapid_process.extractOne(
            tmc_desc, 
            sup_descriptions, 
            scorer=fuzz.token_set_ratio, 
            score_cutoff=threshold
        )
        if match:
            results.append((tmc_desc, match[0], match[1]))
    
    return results

def parallel_matching(tmc_df, sup_df, threshold=75, n_workers=4):
    """–ü–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ —Å–æ–ø–æ—Å—Ç–∞–≤–ª–µ–Ω–∏–µ –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è –æ–±—Ä–∞–±–æ—Ç–∫–∏"""
    
    tmc_descriptions = tmc_df['FULL_NAME/ru_RU'].tolist()
    sup_descriptions = sup_df['FULL_NAME/ru_RU'].tolist()
    total_records = len(tmc_descriptions)
    
    chunk_size = max(1, len(tmc_descriptions) // n_workers)
    chunks = [tmc_descriptions[i:i + chunk_size] for i in range(0, len(tmc_descriptions), chunk_size)]
    
    chunk_args = []
    for i, chunk in enumerate(chunks):
        chunk_args.append((chunk, sup_descriptions, threshold, i, len(chunks)))
    
    print(f"–ü–∞—Ä–∞–ª–ª–µ–ª—å–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞ —Å {n_workers} workers...")
    print(f"–í—Å–µ–≥–æ –∑–∞–ø–∏—Å–µ–π: {total_records:,}")
    print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —á–∞–Ω–∫–æ–≤: {len(chunks)}")
    
    all_results = []
    with mp.Pool(n_workers) as pool:
        for i, result in enumerate(pool.imap(parallel_fuzzy_match, chunk_args)):
            all_results.append(result)
            print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω —á–∞–Ω–∫ {i+1}/{len(chunks)}")
    
    flat_results = [item for sublist in all_results for item in sublist]
    
    print("–§–æ—Ä–º–∏—Ä–æ–≤–∞–Ω–∏–µ –∏—Ç–æ–≥–æ–≤–æ–≥–æ DataFrame...")
    results_df = []
    
    for tmc_desc, sup_desc, score in flat_results:
        tmc_match = tmc_df[tmc_df['FULL_NAME/ru_RU'] == tmc_desc]
        sup_match = sup_df[sup_df['FULL_NAME/ru_RU'] == sup_desc]
        
        if not tmc_match.empty and not sup_match.empty:
            tmc_row = tmc_match.iloc[0]
            sup_row = sup_match.iloc[0]
            
            results_df.append({
                'TMC_CSCD_ID': tmc_row['CSCD_ID'],
                'TMC_Description': tmc_desc,
                'SUP_CSCD_ID': sup_row['CSCD_ID'],
                'SUP_Description': sup_desc,
                'Match_Score': score
            })
    
    return pd.DataFrame(results_df)


## –§—É–Ω–∫—Ü–∏—è –∑–∞–ø—É—Å–∫–∞ –º–µ—Ç–æ–¥–æ–≤ –∏ Excel —Ñ—É–Ω–∫—Ü–∏–∏

In [3]:
def match_with_progress(tmc_df, sup_df, method='fuzzy', **kwargs):
    """
    –£–Ω–∏–≤–µ—Ä—Å–∞–ª—å–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è –∑–∞–ø—É—Å–∫–∞ –º–µ—Ç–æ–¥–æ–≤
    """
    from time import time
    start_time = time()
    
    total_tmc = len(tmc_df)
    total_sup = len(sup_df)
    
    print(f"–ù–∞—á–∞–ª–æ –æ–±—Ä–∞–±–æ—Ç–∫–∏: {total_tmc:,} TMC –∑–∞–ø–∏—Å–µ–π vs {total_sup:,} SUP –∑–∞–ø–∏—Å–µ–π")
    print(f"–ú–µ—Ç–æ–¥: {method}")
    print("-" * 60)
    
    if method == 'fuzzy':
        results = match_dataframes_fuzzy(tmc_df, sup_df, **kwargs)
    elif method == 'tfidf':
        results = match_with_tfidf(tmc_df, sup_df, **kwargs)
    elif method == 'combined':
        results = combined_matching(tmc_df, sup_df, **kwargs)
    elif method == 'parallel':
        results = parallel_matching(tmc_df, sup_df, **kwargs)
    else:
        raise ValueError("–ù–µ–∏–∑–≤–µ—Å—Ç–Ω—ã–π –º–µ—Ç–æ–¥. –î–æ—Å—Ç—É–ø–Ω—ã–µ: 'fuzzy', 'tfidf', 'combined', 'parallel'")
    
    end_time = time()
    processing_time = end_time - start_time
    
    print("-" * 60)
    print("–û–ë–†–ê–ë–û–¢–ö–ê –ó–ê–í–ï–†–®–ï–ù–ê")
    print(f"–í—Ä–µ–º—è –æ–±—Ä–∞–±–æ—Ç–∫–∏: {processing_time:.2f} —Å–µ–∫—É–Ω–¥")
    print(f"–ù–∞–π–¥–µ–Ω–æ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–π: {len(results):,}")
    print(f"–ü—Ä–æ—Ü–µ–Ω—Ç —Å–æ–ø–æ—Å—Ç–∞–≤–ª–µ–Ω–∏—è: {len(results)/total_tmc*100:.1f}%")
    
    if len(results) > 0:
        score_column = results.columns[-1]
        avg_score = results[score_column].mean()
        max_score = results[score_column].max()
        min_score = results[score_column].min()
        print(f"–°—Ä–µ–¥–Ω—è—è –æ—Ü–µ–Ω–∫–∞ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏—è: {avg_score:.3f}")
        print(f"–õ—É—á—à–µ–µ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–µ: {max_score:.3f}")
        print(f"–•—É–¥—à–µ–µ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–µ: {min_score:.3f}")
    
    return results

def save_results_to_excel(results, tmc_df, sup_df, filename, method_name):
    """
    –°–æ—Ö—Ä–∞–Ω—è–µ—Ç —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –æ–¥–Ω–æ–≥–æ –º–µ—Ç–æ–¥–∞ –≤ Excel —Ñ–∞–π–ª
    """
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        # –û—Å–Ω–æ–≤–Ω—ã–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã
        results.to_excel(writer, sheet_name='–°–æ–ø–æ—Å—Ç–∞–≤–ª–µ–Ω–∏—è', index=False)
        
        if len(results) > 0:
            score_column = results.columns[-1]
            
            # –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞
            stats_data = {
                '–ú–µ—Ç—Ä–∏–∫–∞': [
                    '–í—Å–µ–≥–æ –∑–∞–ø–∏—Å–µ–π TMC',
                    '–í—Å–µ–≥–æ –∑–∞–ø–∏—Å–µ–π SUP', 
                    '–ù–∞–π–¥–µ–Ω–æ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–π',
                    '–ü—Ä–æ—Ü–µ–Ω—Ç —Å–æ–ø–æ—Å—Ç–∞–≤–ª–µ–Ω–∏—è',
                    '–°—Ä–µ–¥–Ω—è—è –æ—Ü–µ–Ω–∫–∞',
                    '–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –æ—Ü–µ–Ω–∫–∞',
                    '–ú–∏–Ω–∏–º–∞–ª—å–Ω–∞—è –æ—Ü–µ–Ω–∫–∞'
                ],
                '–ó–Ω–∞—á–µ–Ω–∏–µ': [
                    len(tmc_df),
                    len(sup_df),
                    len(results),
                    f"{len(results)/len(tmc_df)*100:.2f}%",
                    f"{results[score_column].mean():.3f}",
                    f"{results[score_column].max():.3f}",
                    f"{results[score_column].min():.3f}"
                ]
            }
            stats_df = pd.DataFrame(stats_data)
            stats_df.to_excel(writer, sheet_name='–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞', index=False)
            
            # –¢–æ–ø-10 –ª—É—á—à–∏—Ö —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–π
            top_10 = results.nlargest(10, score_column)
            top_10.to_excel(writer, sheet_name='–¢–æ–ø-10 —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–π', index=False)

def create_summary_excel(all_results, tmc_df, sup_df):
    """
    –°–æ–∑–¥–∞–µ—Ç —Å–≤–æ–¥–Ω—ã–π Excel —Ñ–∞–π–ª —Å–æ –≤—Å–µ–º–∏ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞–º–∏ –º–µ—Ç–æ–¥–æ–≤
    """
    with pd.ExcelWriter('all_methods_comparison.xlsx', engine='openpyxl') as writer:
        
        # –°—Ä–∞–≤–Ω–µ–Ω–∏–µ –º–µ—Ç–æ–¥–æ–≤
        comparison_data = []
        for method_name, results in all_results.items():
            if len(results) > 0:
                score_column = results.columns[-1]
                comparison_data.append({
                    '–ú–µ—Ç–æ–¥': method_name.upper(),
                    '–í—Å–µ–≥–æ_—Å–æ–≤–ø–∞–¥–µ–Ω–∏–π': len(results),
                    '–ü—Ä–æ—Ü–µ–Ω—Ç_—Å–æ–≤–ø–∞–¥–µ–Ω–∏–π': f"{(len(results)/len(tmc_df)*100):.2f}%",
                    '–°—Ä–µ–¥–Ω—è—è_–æ—Ü–µ–Ω–∫–∞': f"{results[score_column].mean():.3f}",
                    '–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è_–æ—Ü–µ–Ω–∫–∞': f"{results[score_column].max():.3f}",
                    '–ú–∏–Ω–∏–º–∞–ª—å–Ω–∞—è_–æ—Ü–µ–Ω–∫–∞': f"{results[score_column].min():.3f}"
                })
        
        comparison_df = pd.DataFrame(comparison_data)
        comparison_df.to_excel(writer, sheet_name='–°—Ä–∞–≤–Ω–µ–Ω–∏–µ –º–µ—Ç–æ–¥–æ–≤', index=False)
        
        # –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –∫–∞–∂–¥–æ–≥–æ –º–µ—Ç–æ–¥–∞
        for method_name, results in all_results.items():
            sheet_name = method_name[:31]
            results.to_excel(writer, sheet_name=sheet_name, index=False)
        
        # –û–±—â–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞
        stats_summary = []
        for method_name, results in all_results.items():
            if len(results) > 0:
                score_column = results.columns[-1]
                stats_summary.append({
                    '–ú–µ—Ç–æ–¥': method_name.upper(),
                    '–í—Å–µ–≥–æ –∑–∞–ø–∏—Å–µ–π TMC': len(tmc_df),
                    '–í—Å–µ–≥–æ –∑–∞–ø–∏—Å–µ–π SUP': len(sup_df),
                    '–ù–∞–π–¥–µ–Ω–æ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–π': len(results),
                    '–ü—Ä–æ—Ü–µ–Ω—Ç —É—Å–ø–µ—Ö–∞': f"{(len(results)/len(tmc_df)*100):.2f}%",
                    '–°—Ä–µ–¥–Ω—è—è –æ—Ü–µ–Ω–∫–∞': results[score_column].mean()
                })
        
        stats_summary_df = pd.DataFrame(stats_summary)
        stats_summary_df.to_excel(writer, sheet_name='–û–±—â–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞', index=False)


## –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö

In [4]:
# –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö
print("üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...")
tmc_df = pd.read_excel('–û–ö–ü–î_1.xlsx')
sup_df = pd.read_excel('–û–ö–ü–î_2.xlsx')

üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...


## –û—Å–Ω–æ–≤–Ω–æ–π —Å–∫—Ä–∏–ø—Ç –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è

In [5]:
# –û—Å–Ω–æ–≤–Ω–æ–π —Å–∫—Ä–∏–ø—Ç
print("üöÄ –ó–ê–ü–£–°–ö –°–ò–°–¢–ï–ú–´ –°–†–ê–í–ù–ï–ù–ò–Ø –ù–û–ú–ï–ù–ö–õ–ê–¢–£–†")
print("=" * 70)
print(f"‚úÖ TMC –∑–∞–ø–∏—Å–µ–π: {len(tmc_df):,}")
print(f"‚úÖ SUP –∑–∞–ø–∏—Å–µ–π: {len(sup_df):,}")
print()

# –û–ø—Ä–µ–¥–µ–ª—è–µ–º –º–µ—Ç–æ–¥—ã –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è
methods = [
    ('fuzzy', {'threshold': 80}),
    ('tfidf', {'top_k': 3, 'similarity_threshold': 0.5}),
    ('combined', {'search_limit': 200, 'weight_fuzzy': 0.7, 'weight_jaccard': 0.3})
]

all_results = {}

for method_name, params in methods:
    print("\n" + "=" * 70)
    print(f"üîç –ó–ê–ü–£–°–ö –ú–ï–¢–û–î–ê: {method_name.upper()}")
    print("=" * 70)
    
    try:
        results = match_with_progress(tmc_df, sup_df, method=method_name, **params)
        all_results[method_name] = results
        
        # –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤ –≤ Excel
        filename = f'{method_name}_matching_results.xlsx'
        save_results_to_excel(results, tmc_df, sup_df, filename, method_name)
        print(f"üíæ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤: {filename}")
        
    except Exception as e:
        print(f"‚ùå –û—à–∏–±–∫–∞ –≤ –º–µ—Ç–æ–¥–µ {method_name}: {e}")
        import traceback
        traceback.print_exc()
        continue

print("\nüéâ –í–°–ï –ú–ï–¢–û–î–´ –ó–ê–í–ï–†–®–ï–ù–´!")

# –í—ã–≤–æ–¥ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏
print("\nüìä –°–í–û–î–ù–ê–Ø –°–¢–ê–¢–ò–°–¢–ò–ö–ê:")
print("=" * 50)

for method_name, results in all_results.items():
    if len(results) > 0:
        score_column = results.columns[-1]
        avg_score = results[score_column].mean()
        print(f"{method_name.upper():<12} | –°–æ–≤–ø–∞–¥–µ–Ω–∏–π: {len(results):>6,} | –°—Ä–µ–¥–Ω—è—è –æ—Ü–µ–Ω–∫–∞: {avg_score:.3f}")

# –°–æ–∑–¥–∞–Ω–∏–µ —Å–≤–æ–¥–Ω–æ–≥–æ –æ—Ç—á–µ—Ç–∞
if all_results:
    print(f"\nüìä –°–û–ó–î–ê–ù–ò–ï –°–í–û–î–ù–û–ì–û –û–¢–ß–ï–¢–ê...")
    create_summary_excel(all_results, tmc_df, sup_df)
    print("üíæ –°–≤–æ–¥–Ω—ã–π –æ—Ç—á–µ—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤: all_methods_comparison.xlsx")

print(f"\nüí° –í—Å–µ–≥–æ –æ–±—Ä–∞–±–æ—Ç–∞–Ω–æ –º–µ—Ç–æ–¥–æ–≤: {len(all_results)} –∏–∑ {len(methods)}")


üöÄ –ó–ê–ü–£–°–ö –°–ò–°–¢–ï–ú–´ –°–†–ê–í–ù–ï–ù–ò–Ø –ù–û–ú–ï–ù–ö–õ–ê–¢–£–†
‚úÖ TMC –∑–∞–ø–∏—Å–µ–π: 351
‚úÖ SUP –∑–∞–ø–∏—Å–µ–π: 15,064


üîç –ó–ê–ü–£–°–ö –ú–ï–¢–û–î–ê: FUZZY
–ù–∞—á–∞–ª–æ –æ–±—Ä–∞–±–æ—Ç–∫–∏: 351 TMC –∑–∞–ø–∏—Å–µ–π vs 15,064 SUP –∑–∞–ø–∏—Å–µ–π
–ú–µ—Ç–æ–¥: fuzzy
------------------------------------------------------------
–ó–∞–ø—É—Å–∫ Fuzzy Matching...
–í—Å–µ–≥–æ –∑–∞–ø–∏—Å–µ–π –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏: 351
--------------------------------------------------
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: 0/351 (0.0%)
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: 100/351 (28.5%)
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: 200/351 (57.0%)
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: 300/351 (85.5%)
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: 351/351 (100.0%)
--------------------------------------------------
------------------------------------------------------------
–û–ë–†–ê–ë–û–¢–ö–ê –ó–ê–í–ï–†–®–ï–ù–ê
–í—Ä–µ–º—è –æ–±—Ä–∞–±–æ—Ç–∫–∏: 67.07 —Å–µ–∫—É–Ω–¥
–ù–∞–π–¥–µ–Ω–æ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–π: 262
–ü—Ä–æ—Ü–µ–Ω—Ç —Å–æ–ø–æ—Å—Ç–∞–≤–ª–µ–Ω–∏—è: 74.6%
–°—Ä–µ–¥–Ω—è—è –æ—Ü–µ–Ω–∫–∞ —Å–æ–æ—Ç–≤–µ—

## –§—É–Ω–∫—Ü–∏—è —Å—Ä–∞–≤–Ω–µ–Ω–∏—è —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤

In [6]:
def compare_results(all_results, tmc_df):
    """–°—Ä–∞–≤–Ω–∏–≤–∞–µ—Ç —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã —Ä–∞–∑–Ω—ã—Ö –º–µ—Ç–æ–¥–æ–≤"""
    comparison_data = []
    
    for method_name, results in all_results.items():
        if len(results) > 0:
            score_column = results.columns[-1]
            avg_score = results[score_column].mean()
            match_rate = len(results) / len(tmc_df) * 100
            
            comparison_data.append({
                'Method': method_name.upper(),
                'Total_Matches': len(results),
                'Match_Rate_Percent': match_rate,
                'Average_Score': avg_score,
                'Best_Match': results[score_column].max(),
                'Worst_Match': results[score_column].min()
            })
    
    comparison_df = pd.DataFrame(comparison_data)
    print("\nüìã –°–†–ê–í–ù–ï–ù–ò–ï –ú–ï–¢–û–î–û–í:")
    print("=" * 80)
    print(comparison_df.to_string(index=False, float_format='%.3f'))
    
    # –°–æ—Ö—Ä–∞–Ω—è–µ–º —Å—Ä–∞–≤–Ω–µ–Ω–∏–µ
    comparison_df.to_excel('methods_comparison.xlsx', index=False, engine='openpyxl')
    print(f"\nüíæ –°—Ä–∞–≤–Ω–µ–Ω–∏–µ –º–µ—Ç–æ–¥–æ–≤ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤: methods_comparison.xlsx")
    
    return comparison_df

# –ó–∞–ø—É—Å–∫–∞–µ–º —Å—Ä–∞–≤–Ω–µ–Ω–∏–µ –µ—Å–ª–∏ –µ—Å—Ç—å —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã
if all_results:
    comparison = compare_results(all_results, tmc_df)

print("\nüéØ –ê–ù–ê–õ–ò–ó –ó–ê–í–ï–†–®–ï–ù!")



üìã –°–†–ê–í–ù–ï–ù–ò–ï –ú–ï–¢–û–î–û–í:
  Method  Total_Matches  Match_Rate_Percent  Average_Score  Best_Match  Worst_Match
   FUZZY            262              74.644         91.275     100.000       81.000
   TFIDF            565             160.969          1.704       3.000        1.000
COMBINED              4               1.140          0.424       0.500        0.250

üíæ –°—Ä–∞–≤–Ω–µ–Ω–∏–µ –º–µ—Ç–æ–¥–æ–≤ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤: methods_comparison.xlsx

üéØ –ê–ù–ê–õ–ò–ó –ó–ê–í–ï–†–®–ï–ù!


## –ü–æ–∏—Å–∫ –∫–æ–Ω–∫—Ä–µ—Ç–Ω–æ–π –Ω–æ–º–µ–Ω–∫–ª–∞—Ç—É—Ä—ã

In [21]:
# –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö
print("üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...")
tmc_df = pd.read_excel('TMC.xlsx')
sup_df = pd.read_excel('SUP.xlsx')
sup_df = pd.concat([tmc_df,sup_df],ignore_index=True)
print("üì• –î–∞–Ω–Ω—ã–µ –∑–∞–≥—Ä—É–∂–µ–Ω—ã")

üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...


In [22]:
def search_nomenclature_by_description(description, tmc_df, sup_df, threshold=70, top_k=5):
    """
    –ò—â–µ—Ç –Ω–æ–º–µ–Ω–∫–ª–∞—Ç—É—Ä—É –ø–æ –æ–ø–∏—Å–∞–Ω–∏—é —Ç—Ä–µ–º—è –º–µ—Ç–æ–¥–∞–º–∏
    """
    print(f"üîç –ü–û–ò–°–ö –ù–û–ú–ï–ù–ö–õ–ê–¢–£–†–´: '{description}'")
    print("=" * 80)
    
    results = {
        'fuzzy': [],
        'tfidf': [],
        'combined': []
    }
    
    # –ú–µ—Ç–æ–¥ 1: Fuzzy Matching
    print("\n1. FUZZY MATCHING:")
    print("-" * 40)
    sup_descriptions = sup_df['FULL_NAME/ru_RU'].tolist()
    fuzzy_matches = process.extract(description, sup_descriptions, scorer=fuzz.token_set_ratio, limit=top_k)
    
    for match_desc, score in fuzzy_matches:
        if score >= threshold:
            sup_match = sup_df[sup_df['FULL_NAME/ru_RU'] == match_desc]
            if not sup_match.empty:
                sup_row = sup_match.iloc[0]
                results['fuzzy'].append({
                    'SUP_CSCD_ID': sup_row['CSCD_ID'],
                    'SUP_Description': match_desc,
                    'Score': score,
                    'Method': 'Fuzzy'
                })
                print(f"   üìç –û—Ü–µ–Ω–∫–∞: {score:>3}% | ID: {sup_row['CSCD_ID']}")
                print(f"      –û–ø–∏—Å–∞–Ω–∏–µ: {match_desc}")
    
    # –ú–µ—Ç–æ–¥ 2: TF-IDF
    print("\n2. TF-IDF MATCHING:")
    print("-" * 40)
    
    # –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤
    search_text = preprocess_text(description)
    sup_texts = [preprocess_text(desc) for desc in sup_df['FULL_NAME/ru_RU']]
    
    # –°–æ–∑–¥–∞–Ω–∏–µ TF-IDF –º–∞—Ç—Ä–∏—Ü—ã
    vectorizer = TfidfVectorizer(min_df=1, max_df=0.8, ngram_range=(1, 2))
    all_texts = [search_text] + sup_texts
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    
    # –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –º–∞—Ç—Ä–∏—Ü—ã
    search_tfidf = tfidf_matrix[0]
    sup_tfidf = tfidf_matrix[1:]
    
    # –í—ã—á–∏—Å–ª–µ–Ω–∏–µ –∫–æ—Å–∏–Ω—É—Å–Ω–æ–≥–æ —Å—Ö–æ–¥—Å—Ç–≤–∞
    from sklearn.metrics.pairwise import cosine_similarity
    similarities = cosine_similarity(search_tfidf, sup_tfidf).flatten()
    
    # –ü–æ–ª—É—á–µ–Ω–∏–µ —Ç–æ–ø-K —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤
    top_indices = similarities.argsort()[-top_k:][::-1]
    
    for idx in top_indices:
        similarity = similarities[idx]
        if similarity >= 0.3:  # –ü–æ—Ä–æ–≥ –¥–ª—è TF-IDF
            sup_row = sup_df.iloc[idx]
            results['tfidf'].append({
                'SUP_CSCD_ID': sup_row['CSCD_ID'],
                'SUP_Description': sup_row['FULL_NAME/ru_RU'],
                'Score': similarity * 100,
                'Method': 'TF-IDF'
            })
            print(f"   üìç –û—Ü–µ–Ω–∫–∞: {similarity*100:>5.1f}% | ID: {sup_row['CSCD_ID']}")
            print(f"      –û–ø–∏—Å–∞–Ω–∏–µ: {sup_row['FULL_NAME/ru_RU']}")
    
    # –ú–µ—Ç–æ–¥ 3: –ö–æ–º–±–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω—ã–π
    print("\n3. COMBINED MATCHING:")
    print("-" * 40)
    
    search_keywords = extract_keywords(description)
    combined_results = []
    
    for idx, sup_row in sup_df.iterrows():
        sup_desc = sup_row['FULL_NAME/ru_RU']
        sup_keywords = extract_keywords(sup_desc)
        
        # Fuzzy score
        fuzzy_score = fuzz.token_set_ratio(description, sup_desc) / 100
        
        # Jaccard similarity
        jaccard_score = jaccard_similarity(search_keywords, sup_keywords)
        
        # Combined score (–≤–∑–≤–µ—à–µ–Ω–Ω–æ–µ —Å—Ä–µ–¥–Ω–µ–µ)
        combined_score = (0.6 * fuzzy_score + 0.4 * jaccard_score) * 100
        
        if combined_score >= threshold:
            combined_results.append({
                'sup_row': sup_row,
                'combined_score': combined_score,
                'fuzzy_score': fuzzy_score * 100,
                'jaccard_score': jaccard_score * 100
            })
    
    # –°–æ—Ä—Ç–∏—Ä—É–µ–º –∏ –±–µ—Ä–µ–º —Ç–æ–ø-K
    combined_results.sort(key=lambda x: x['combined_score'], reverse=True)
    
    for i, result in enumerate(combined_results[:top_k]):
        sup_row = result['sup_row']
        results['combined'].append({
            'SUP_CSCD_ID': sup_row['CSCD_ID'],
            'SUP_Description': sup_row['FULL_NAME/ru_RU'],
            'Score': result['combined_score'],
            'Method': 'Combined',
            'Fuzzy_Score': result['fuzzy_score'],
            'Jaccard_Score': result['jaccard_score']
        })
        print(f"   üìç –û—Ü–µ–Ω–∫–∞: {result['combined_score']:>5.1f}% | ID: {sup_row['CSCD_ID']}")
        print(f"      Fuzzy: {result['fuzzy_score']:.1f}% | Jaccard: {result['jaccard_score']:.1f}%")
        print(f"      –û–ø–∏—Å–∞–Ω–∏–µ: {sup_row['FULL_NAME/ru_RU']}")
    
    return results

def save_search_results_to_excel(search_description, results, filename):
    """
    –°–æ—Ö—Ä–∞–Ω—è–µ—Ç —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –ø–æ–∏—Å–∫–∞ –≤ Excel —Ñ–∞–π–ª
    """
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        
        # –°–≤–æ–¥–Ω–∞—è –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –ø–æ–∏—Å–∫–µ
        summary_data = {
            '–ü–∞—Ä–∞–º–µ—Ç—Ä': ['–ü–æ–∏—Å–∫–æ–≤—ã–π –∑–∞–ø—Ä–æ—Å', '–î–∞—Ç–∞ –ø–æ–∏—Å–∫–∞', '–í—Å–µ–≥–æ –Ω–∞–π–¥–µ–Ω–æ Fuzzy', '–í—Å–µ–≥–æ –Ω–∞–π–¥–µ–Ω–æ TF-IDF', '–í—Å–µ–≥–æ –Ω–∞–π–¥–µ–Ω–æ Combined'],
            '–ó–Ω–∞—á–µ–Ω–∏–µ': [
                search_description,
                pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
                len(results['fuzzy']),
                len(results['tfidf']),
                len(results['combined'])
            ]
        }
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_excel(writer, sheet_name='–°–≤–æ–¥–∫–∞', index=False)
        
        # –†–µ–∑—É–ª—å—Ç–∞—Ç—ã Fuzzy Matching
        if results['fuzzy']:
            fuzzy_df = pd.DataFrame(results['fuzzy'])
            fuzzy_df.to_excel(writer, sheet_name='Fuzzy_Results', index=False)
        
        # –†–µ–∑—É–ª—å—Ç–∞—Ç—ã TF-IDF
        if results['tfidf']:
            tfidf_df = pd.DataFrame(results['tfidf'])
            tfidf_df.to_excel(writer, sheet_name='TFIDF_Results', index=False)
        
        # –†–µ–∑—É–ª—å—Ç–∞—Ç—ã Combined
        if results['combined']:
            combined_df = pd.DataFrame(results['combined'])
            combined_df.to_excel(writer, sheet_name='Combined_Results', index=False)
        
        # –í—Å–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –≤–º–µ—Å—Ç–µ
        all_results = []
        for method in ['fuzzy', 'tfidf', 'combined']:
            for result in results[method]:
                all_results.append(result)
        
        if all_results:
            all_df = pd.DataFrame(all_results)
            all_df.to_excel(writer, sheet_name='–í—Å–µ_—Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã', index=False)
            
            # –¢–æ–ø-10 –ª—É—á—à–∏—Ö —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤ –ø–æ –≤—Å–µ–º –º–µ—Ç–æ–¥–∞–º
            top_10 = all_df.nlargest(10, 'Score')
            top_10.to_excel(writer, sheet_name='–¢–æ–ø-10', index=False)

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω–æ–≥–æ –ø–æ–∏—Å–∫–∞
def interactive_search():
    """
    –ò–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è –¥–ª—è –ø–æ–∏—Å–∫–∞ –Ω–æ–º–µ–Ω–∫–ª–∞—Ç—É—Ä
    """
    print("üéØ –ò–ù–¢–ï–†–ê–ö–¢–ò–í–ù–´–ô –ü–û–ò–°–ö –ù–û–ú–ï–ù–ö–õ–ê–¢–£–†–´")
    print("=" * 50)
    
    while True:
        print("\n–í–≤–µ–¥–∏—Ç–µ –æ–ø–∏—Å–∞–Ω–∏–µ –Ω–æ–º–µ–Ω–∫–ª–∞—Ç—É—Ä—ã –¥–ª—è –ø–æ–∏—Å–∫–∞ (–∏–ª–∏ '–≤—ã—Ö–æ–¥' –¥–ª—è –∑–∞–≤–µ—Ä—à–µ–Ω–∏—è):")
        search_query = input("> ").strip()
        
        if search_query.lower() in ['–≤—ã—Ö–æ–¥', 'exit', 'quit']:
            print("–ó–∞–≤–µ—Ä—à–µ–Ω–∏–µ —Ä–∞–±–æ—Ç—ã...")
            break
        
        if not search_query:
            print("‚ùå –ü—É—Å—Ç–æ–π –∑–∞–ø—Ä–æ—Å. –ü–æ–ø—Ä–æ–±—É–π—Ç–µ —Å–Ω–æ–≤–∞.")
            continue
        
        print(f"\nüîç –ü–æ–∏—Å–∫: '{search_query}'")
        print("‚è≥ –û–±—Ä–∞–±–æ—Ç–∫–∞...")
        
        try:
            # –í—ã–ø–æ–ª–Ω—è–µ–º –ø–æ–∏—Å–∫
            results = search_nomenclature_by_description(search_query, tmc_df, sup_df, threshold=70, top_k=5)
            
            # –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã
            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
            filename = f'search_results_{timestamp}.xlsx'
            save_search_results_to_excel(search_query, results, filename)
            
            print(f"\nüíæ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –ø–æ–∏—Å–∫–∞ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤: {filename}")
            
            # –°–≤–æ–¥–Ω–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞
            total_matches = len(results['fuzzy']) + len(results['tfidf']) + len(results['combined'])
            print(f"\nüìä –°–≤–æ–¥–Ω–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞:")
            print(f"   ‚Ä¢ Fuzzy Matching: {len(results['fuzzy'])} —Å–æ–≤–ø–∞–¥–µ–Ω–∏–π")
            print(f"   ‚Ä¢ TF-IDF Matching: {len(results['tfidf'])} —Å–æ–≤–ø–∞–¥–µ–Ω–∏–π")
            print(f"   ‚Ä¢ Combined Matching: {len(results['combined'])} —Å–æ–≤–ø–∞–¥–µ–Ω–∏–π")
            print(f"   ‚Ä¢ –í—Å–µ–≥–æ –Ω–∞–π–¥–µ–Ω–æ: {total_matches} —Å–æ–≤–ø–∞–¥–µ–Ω–∏–π")
            
        except Exception as e:
            print(f"‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –ø–æ–∏—Å–∫–µ: {e}")
            import traceback
            traceback.print_exc()

# –ó–∞–ø—É—Å–∫ –∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω–æ–≥–æ –ø–æ–∏—Å–∫–∞
print("üéØ –ì–û–¢–û–í –ö –ü–û–ò–°–ö–£ –ö–û–ù–ö–†–ï–¢–ù–´–• –ù–û–ú–ï–ù–ö–õ–ê–¢–£–†!")
print("=" * 60)




üéØ –ì–û–¢–û–í –ö –ü–û–ò–°–ö–£ –ö–û–ù–ö–†–ï–¢–ù–´–• –ù–û–ú–ï–ù–ö–õ–ê–¢–£–†!


In [24]:
test_descriptions = [
    "–¢–µ–ø–ª–æ–æ–±–º–µ–Ω–Ω–∏–∫ –∫–æ–∂—É—Ö–æ—Ç—Ä—É–±—á–∞—Ç—ã–π 500 –¢–ü–ì-4,0-–ú1/25–ì-3-–ö-2-–•–õ-–ò",
    "–¢–µ–ø–ª–æ–æ–±–º–µ–Ω–Ω–∏–∫ –∫–æ–∂—É—Ö–æ—Ç—Ä—É–±—á–∞—Ç—ã–π 1-EW-3001/1,2"
]

In [28]:
# –ü—Ä–∏–º–µ—Ä –ø–æ–∏—Å–∫–∞ –∫–æ–Ω–∫—Ä–µ—Ç–Ω–æ–π –Ω–æ–º–µ–Ω–∫–ª–∞—Ç—É—Ä—ã

print("\nüìã –ü—Ä–∏–º–µ—Ä—ã –¥–ª—è –±—ã—Å—Ç—Ä–æ–≥–æ —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è:")
for i, desc in enumerate(test_descriptions, 1):
    print(f"   {i}. {desc}")

print("\n–í—ã–±–µ—Ä–∏—Ç–µ –≤–∞—Ä–∏–∞–Ω—Ç:")
print("1. –ó–∞–ø—É—Å—Ç–∏—Ç—å –∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω—ã–π –ø–æ–∏—Å–∫")
print("2. –ü—Ä–æ—Ç–µ—Å—Ç–∏—Ä–æ–≤–∞—Ç—å –Ω–∞ –ø—Ä–∏–º–µ—Ä–µ '–¢—Ä—É–±–∞ –æ–±—Å–∞–¥–Ω–∞—è'")
print("3. –í–≤–µ—Å—Ç–∏ —Å–≤–æ–π –∑–∞–ø—Ä–æ—Å")

choice = input("–í–∞—à –≤—ã–±–æ—Ä (1-3): ").strip()

if choice == "1":
    interactive_search()
elif choice == "2":
    search_query = "–¢—Ä—É–±–∞ –æ–±—Å–∞–¥–Ω–∞—è"
    print(f"\nüîç –¢–ï–°–¢–û–í–´–ô –ü–û–ò–°–ö: '{search_query}'")
    results = search_nomenclature_by_description(search_query, tmc_df, sup_df, threshold=70, top_k=5)
    
    # –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã
    filename = f'search_example_{pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")}.xlsx'
    save_search_results_to_excel(search_query, results, filename)
    print(f"\nüíæ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤: {filename}")
    
elif choice == "3":
    custom_query = input("–í–≤–µ–¥–∏—Ç–µ –æ–ø–∏—Å–∞–Ω–∏–µ –¥–ª—è –ø–æ–∏—Å–∫–∞: ").strip()
    if custom_query:
        results = search_nomenclature_by_description(custom_query, tmc_df, sup_df, threshold=70, top_k=5)
        filename = f'search_custom_{pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")}.xlsx'
        save_search_results_to_excel(custom_query, results, filename)
        print(f"\nüíæ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤: {filename}")
    else:
        print("‚ùå –ü—É—Å—Ç–æ–π –∑–∞–ø—Ä–æ—Å.")
else:
    print("‚ùå –ù–µ–≤–µ—Ä–Ω—ã–π –≤—ã–±–æ—Ä.")


üìã –ü—Ä–∏–º–µ—Ä—ã –¥–ª—è –±—ã—Å—Ç—Ä–æ–≥–æ —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è:
   1. –¢–µ–ø–ª–æ–æ–±–º–µ–Ω–Ω–∏–∫ –∫–æ–∂—É—Ö–æ—Ç—Ä—É–±—á–∞—Ç—ã–π 500 –¢–ü–ì-4,0-–ú1/25–ì-3-–ö-2-–•–õ-–ò
   2. –¢–µ–ø–ª–æ–æ–±–º–µ–Ω–Ω–∏–∫ –∫–æ–∂—É—Ö–æ—Ç—Ä—É–±—á–∞—Ç—ã–π 1-EW-3001/1,2

–í—ã–±–µ—Ä–∏—Ç–µ –≤–∞—Ä–∏–∞–Ω—Ç:
1. –ó–∞–ø—É—Å—Ç–∏—Ç—å –∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω—ã–π –ø–æ–∏—Å–∫
2. –ü—Ä–æ—Ç–µ—Å—Ç–∏—Ä–æ–≤–∞—Ç—å –Ω–∞ –ø—Ä–∏–º–µ—Ä–µ '–¢—Ä—É–±–∞ –æ–±—Å–∞–¥–Ω–∞—è'
3. –í–≤–µ—Å—Ç–∏ —Å–≤–æ–π –∑–∞–ø—Ä–æ—Å


üîç –ü–û–ò–°–ö –ù–û–ú–ï–ù–ö–õ–ê–¢–£–†–´: '–ö–æ—Å—Ç—é–º –º—É–∂—Å–∫–æ–π —Ä–∞–∑–º–µ—Ä 50 –∑–∏–º–Ω–∏–π —É—Ç–µ–ø–ª–µ–Ω–Ω—ã–π'

1. FUZZY MATCHING:
----------------------------------------
   üìç –û—Ü–µ–Ω–∫–∞: 100% | ID: 2140291
      –û–ø–∏—Å–∞–Ω–∏–µ: –ö–æ—Å—Ç—é–º –∑–∏–º–Ω–∏–π
   üìç –û—Ü–µ–Ω–∫–∞: 100% | ID: 3316202
      –û–ø–∏—Å–∞–Ω–∏–µ: –ö–û–°–¢–Æ–ú –£–¢–ï–ü–õ–ï–ù–ù–´–ô
   üìç –û—Ü–µ–Ω–∫–∞:  91% | ID: 3267123
      –û–ø–∏—Å–∞–Ω–∏–µ: –ö–æ—Å—Ç—é–º –∑–∏–º–Ω–∏–π –º—É–∂—Å–∫–æ–π –±/—É
   üìç –û—Ü–µ–Ω–∫–∞:  91% | ID: 1527594
      –û–ø–∏—Å–∞–Ω–∏–µ: –ö–æ—Å—Ç—é–º –∑–∏–º–Ω–∏–π –º—É–∂—Å–∫–æ–π –ú–ß–°
   üìç –û—Ü–µ–Ω–∫–∞:  91% | ID: 3427670
      –û–ø–∏—Å–∞–Ω–∏–µ: –ö–æ—Å—Ç—é–º –º—É–∂—Å–∫–æ–π –∑–∏–º–Ω–∏–π –ú–ß–°

2. TF-IDF MATCHING:
----------------------------------------
   üìç –û—Ü–µ–Ω–∫–∞:  33.4% | ID: 3073156
      –û–ø–∏—Å–∞–Ω–∏–µ: –ö–æ—Å—Ç—é–º —Ö/–± –º—É–∂—Å–∫–æ–π

3. COMBINED MATCHING:
----------------------------------------
   üìç –û—Ü–µ–Ω–∫–∞:  78.6% | ID: 3267123
      Fuzzy: 91.0% 

## –î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω—ã–µ —É—Ç–∏–ª–∏—Ç—ã –¥–ª—è –ø–æ–∏—Å–∫–∞

In [20]:
def quick_search(description, top_k=3):
    """
    –ë—ã—Å—Ç—Ä—ã–π –ø–æ–∏—Å–∫ –ø–æ –æ–¥–Ω–æ–º—É –æ–ø–∏—Å–∞–Ω–∏—é —Å –≤—ã–≤–æ–¥–æ–º –≤ –∫–æ–Ω—Å–æ–ª—å
    """
    print(f"\nüîç –ë–´–°–¢–†–´–ô –ü–û–ò–°–ö: '{description}'")
    print("=" * 60)
    
    sup_descriptions = sup_df['FULL_NAME/ru_RU'].tolist()
    
    # –ü—Ä–æ—Å—Ç–æ–π Fuzzy search
    matches = process.extract(description, sup_descriptions, scorer=fuzz.token_set_ratio, limit=top_k)
    
    print("üéØ –õ–£–ß–®–ò–ï –°–û–û–¢–í–ï–¢–°–¢–í–ò–Ø:")
    for i, (match_desc, score) in enumerate(matches, 1):
        sup_match = sup_df[sup_df['FULL_NAME/ru_RU'] == match_desc]
        if not sup_match.empty:
            sup_row = sup_match.iloc[0]
            print(f"\n{i}. üìç –û—Ü–µ–Ω–∫–∞: {score}%")
            print(f"   ID: {sup_row['CSCD_ID']}")
            print(f"   –û–ø–∏—Å–∞–Ω–∏–µ: {match_desc}")
            
            # –ü—Ä–æ–≤–µ—Ä—è–µ–º, –µ—Å—Ç—å –ª–∏ –≤ TMC
            tmc_match = tmc_df[tmc_df['FULL_NAME/ru_RU'].str.contains(description, case=False, na=False)]
            if not tmc_match.empty:
                print(f"   ‚úÖ –ï—Å—Ç—å –≤ TMC: {len(tmc_match)} –∑–∞–ø–∏—Å–µ–π")
            else:
                print(f"   ‚ùå –ù–µ—Ç –≤ TMC")

# –ü—Ä–∏–º–µ—Ä—ã –±—ã—Å—Ç—Ä–æ–≥–æ –ø–æ–∏—Å–∫–∞
print("üöÄ –ë–´–°–¢–†–´–ô –ü–û–ò–°–ö - –¢–ï–°–¢–û–í–´–ï –ü–†–ò–ú–ï–†–´")
print("=" * 50)

test_queries = [
    "–¢–µ–ø–ª–æ–æ–±–º–µ–Ω–Ω–∏–∫ –∫–æ–∂—É—Ö–æ—Ç—Ä—É–±—á–∞—Ç—ã–π –¢-119",
    "–¢—Ä—É–±–∞ –æ–±—Å–∞–¥–Ω–∞—è 114"
]

for query in test_queries:
    quick_search(query)
    print("\n" + "-" * 50)

print("\nüéØ –î–ª—è –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–æ–≥–æ –ø–æ–∏—Å–∫–∞ –∏—Å–ø–æ–ª—å–∑—É–π—Ç–µ —Ñ—É–Ω–∫—Ü–∏—é interactive_search()")


üöÄ –ë–´–°–¢–†–´–ô –ü–û–ò–°–ö - –¢–ï–°–¢–û–í–´–ï –ü–†–ò–ú–ï–†–´

üîç –ë–´–°–¢–†–´–ô –ü–û–ò–°–ö: '–¢–µ–ø–ª–æ–æ–±–º–µ–Ω–Ω–∏–∫ –∫–æ–∂—É—Ö–æ—Ç—Ä—É–±—á–∞—Ç—ã–π –¢-119'
üéØ –õ–£–ß–®–ò–ï –°–û–û–¢–í–ï–¢–°–¢–í–ò–Ø:

1. üìç –û—Ü–µ–Ω–∫–∞: 100%
   ID: 2211674
   –û–ø–∏—Å–∞–Ω–∏–µ: –¢–µ–ø–ª–æ–æ–±–º–µ–Ω–Ω–∏–∫
   ‚úÖ –ï—Å—Ç—å –≤ TMC: 1 –∑–∞–ø–∏—Å–µ–π

2. üìç –û—Ü–µ–Ω–∫–∞: 100%
   ID: 2218819
   –û–ø–∏—Å–∞–Ω–∏–µ: –¢–µ–ø–ª–æ–æ–±–º–µ–Ω–Ω–∏–∫ –∫–æ–∂—É—Ö–æ—Ç—Ä—É–±—á–∞—Ç—ã–π
   ‚úÖ –ï—Å—Ç—å –≤ TMC: 1 –∑–∞–ø–∏—Å–µ–π

3. üìç –û—Ü–µ–Ω–∫–∞: 97%
   ID: 1471209
   –û–ø–∏—Å–∞–Ω–∏–µ: –¢–µ–ø–ª–æ–æ–±–º–µ–Ω–Ω–∏–∫ –∫–æ–∂—É—Ö–æ—Ç—Ä—É–±—á–∞—Ç—ã–π –¢-131
   ‚úÖ –ï—Å—Ç—å –≤ TMC: 1 –∑–∞–ø–∏—Å–µ–π

--------------------------------------------------

üîç –ë–´–°–¢–†–´–ô –ü–û–ò–°–ö: '–¢—Ä—É–±–∞ –æ–±—Å–∞–¥–Ω–∞—è 114'
üéØ –õ–£–ß–®–ò–ï –°–û–û–¢–í–ï–¢–°–¢–í–ò–Ø:

1. üìç –û—Ü–µ–Ω–∫–∞: 100%
   ID: 3189715
   –û–ø–∏—Å–∞–Ω–∏–µ: –¢—Ä—É–±–∞ –æ–±—Å–∞–¥–Ω–∞—è –û–¢–¢–ú 114,3—Ö6,35-–î—Å –¢–£ 14-3–†-30-2015 —Å –º—É—Ñ—Ç–∞–º–∏
   ‚ùå –