# Text Similarity Classification - 4 уровня релевантности

Классификация пар текстов на 4 класса:
- `relevant_plus` - очень релевантные
- `relevant` - релевантные
- `relevant_minus` - слабо релевантные
- `no_relevant` - нерелевантные

Методы:
- Sentence Transformers embeddings
- Cosine similarity features
- Text statistics
- CatBoost / XGBoost classification

In [None]:
# Install the third-party packages this notebook imports; -q keeps pip output quiet.
# NOTE(review): matplotlib and seaborn are imported further down for plotting but
# are not listed here - confirm they are already available in the environment.
!pip install sentence-transformers pandas numpy scikit-learn catboost xgboost transformers torch -q

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import LabelEncoder
import catboost as cb
import xgboost as xgb
import torch
import re
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Confirm the imports succeeded and report whether PyTorch sees a CUDA device.
print("✓ Библиотеки загружены!")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")

## 1. Загрузка данных

In [None]:
# === YOUR DATA ===
# Expected columns: 'text1', 'text2', 'label'
# label values: relevant_plus, relevant, relevant_minus, no_relevant
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Size of each split.
n_train, n_test = len(train_df), len(test_df)
print(f"Train samples: {n_train}")
print(f"Test samples: {n_test}")

# Class balance of the training labels.
label_counts = train_df['label'].value_counts()
print(f"\nРаспределение классов:")
print(label_counts)

# First rows of the training data as a sanity check.
preview = train_df.head()
print(f"\nПример:")
print(preview)

## 2. Кодирование меток

In [None]:
# Ordinal label encoding: codes increase with the degree of relevance.
label_mapping = {
    'no_relevant': 0,
    'relevant_minus': 1,
    'relevant': 2,
    'relevant_plus': 3
}

# Series.map() turns every value that is not a key of the mapping into NaN.
# Left unchecked, that NaN only surfaces later as an obscure failure in the
# stratified split or in training, so reject bad labels here and name them.
unknown_labels = sorted(
    set(train_df['label'].dropna().unique()) - set(label_mapping),
    key=str,
)
missing_label_count = int(train_df['label'].isna().sum())
if unknown_labels or missing_label_count:
    raise ValueError(
        f"Cannot encode train_df['label']: unknown labels {unknown_labels}, "
        f"missing values: {missing_label_count}. "
        f"Expected only {list(label_mapping)}."
    )

train_df['label_encoded'] = train_df['label'].map(label_mapping)

# Show the code -> label table.
print("Маппинг меток:")
for label, code in label_mapping.items():
    print(f"{code}: {label}")

# Class balance expressed in encoded labels, ordered by code.
print(f"\nРаспределение закодированных меток:")
print(train_df['label_encoded'].value_counts().sort_index())

## 3. Загрузка Sentence Transformer

In [None]:
# Sentence-embedding checkpoint to use. English-only alternatives:
# MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'  # English, higher quality
# MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'  # English, fast
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'  # multilingual

# Load the encoder and report its output dimensionality.
model = SentenceTransformer(MODEL_NAME)
print(f"✓ Модель загружена: {MODEL_NAME}")
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Embedding dimension: {embedding_dim}")

## 4. Feature Engineering - Text Statistics

In [None]:
def extract_text_features(text):
    """
    Compute simple surface statistics for a single text.

    The input is coerced with ``str()``, so non-string values are measured
    by their string representation.

    Returns a dict with the keys ``length``, ``word_count``, ``char_count``,
    ``avg_word_length``, ``digit_count``, ``upper_count``, ``space_count``,
    ``punctuation_count`` and ``unique_words`` (in that order). ``length``
    and ``char_count`` hold the same value; ``avg_word_length`` is 0 when
    the text contains no words.
    """
    text = str(text)
    tokens = text.split()  # whitespace-delimited words
    total_chars = len(text)

    if len(tokens) > 0:
        mean_token_length = np.mean([len(token) for token in tokens])
    else:
        mean_token_length = 0

    # Only these six marks are counted as punctuation.
    punctuation_total = sum(text.count(mark) for mark in '.,!?;:')

    return {
        'length': total_chars,
        'word_count': len(tokens),
        'char_count': total_chars,
        'avg_word_length': mean_token_length,
        'digit_count': sum(1 for ch in text if ch.isdigit()),
        'upper_count': sum(1 for ch in text if ch.isupper()),
        'space_count': text.count(' '),
        'punctuation_count': punctuation_total,
        'unique_words': len(set(tokens)),
    }

def extract_pair_features(text1, text2):
    """
    Compute lexical comparison features for a pair of texts.

    Both inputs are coerced with ``str()``. Word-overlap features use
    lower-cased, whitespace-delimited word sets.

    Returns a dict with the keys (in this order):
        length_diff: absolute difference of the character lengths.
        length_ratio: ``len(text1) / (len(text2) + 1)``; the +1 avoids
            division by zero for an empty ``text2``.
        word_count_diff: absolute difference of the word counts.
        common_words_count: number of distinct shared words.
        common_words_ratio: shared words divided by (union size + 1),
            i.e. a smoothed overlap ratio.
        jaccard_similarity: true Jaccard index, shared words divided by
            the union size; 0.0 when both texts contain no words.
    """
    text1, text2 = str(text1), str(text2)

    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())

    shared_count = len(words1 & words2)
    union_count = len(words1 | words2)

    # The unsmoothed Jaccard index reaches 1.0 for identical word sets,
    # which keeps it distinct from the smoothed common_words_ratio.
    jaccard = shared_count / union_count if union_count else 0.0

    return {
        'length_diff': abs(len(text1) - len(text2)),
        'length_ratio': len(text1) / (len(text2) + 1),
        'word_count_diff': abs(len(text1.split()) - len(text2.split())),
        'common_words_count': shared_count,
        'common_words_ratio': shared_count / (union_count + 1),
        'jaccard_similarity': jaccard,
    }

# Apply the text-feature extractors to the training pairs.
print("Извлечение текстовых признаков...")

# Each frame is built directly from the list of per-row feature dicts.
# Chaining `.apply(pd.Series)` instead would construct one Series per row,
# which is much slower on large inputs. Integer-valued features keep an
# integer dtype this way.

# Per-text statistics for text1
text1_features = pd.DataFrame(
    [extract_text_features(text) for text in train_df['text1']],
    index=train_df.index,
).add_prefix('text1_')

# Per-text statistics for text2
text2_features = pd.DataFrame(
    [extract_text_features(text) for text in train_df['text2']],
    index=train_df.index,
).add_prefix('text2_')

# Features comparing the two texts of each pair
pair_features = pd.DataFrame(
    [
        extract_pair_features(first, second)
        for first, second in zip(train_df['text1'], train_df['text2'])
    ],
    index=train_df.index,
)

# Column-wise concatenation; all three frames share train_df's index.
train_features = pd.concat([text1_features, text2_features, pair_features], axis=1)

print(f"✓ Создано {len(train_features.columns)} текстовых признаков")
print(train_features.head())

## 5. Генерация эмбеддингов

In [None]:
print("Генерация эмбеддингов для train...")

# The encoder receives plain Python lists of strings.
train_text1 = train_df['text1'].astype(str).tolist()
train_text2 = train_df['text2'].astype(str).tolist()

# Same encoder settings for both sides of the pair: batched encoding,
# NumPy output, visible progress bar.
encode_options = dict(convert_to_numpy=True, show_progress_bar=True, batch_size=32)
embeddings1_train = model.encode(train_text1, **encode_options)
embeddings2_train = model.encode(train_text2, **encode_options)

print(f"\n✓ Эмбеддинги сгенерированы!")
print(f"Shape: {embeddings1_train.shape}")

## 6. Similarity Features

In [None]:
def compute_similarity_features(emb1, emb2):
    """
    Compute similarity and distance features for one pair of embeddings.

    Everything is computed with plain NumPy arithmetic on the two vectors;
    the element-wise difference and product are each computed once and reused.

    Args:
        emb1: 1-D embedding of the first text (NumPy array or array-like).
        emb2: 1-D embedding of the second text, same length as ``emb1``.

    Returns:
        dict with the keys (in this order) ``cosine_sim``,
        ``euclidean_dist``, ``manhattan_dist``, ``dot_product``,
        ``diff_mean``, ``diff_std``, ``diff_max``, ``prod_mean`` and
        ``prod_std``. ``cosine_sim`` is 0.0 when either vector has zero
        norm, so no division by zero occurs.
    """
    emb1 = np.asarray(emb1)
    emb2 = np.asarray(emb2)

    diff = emb1 - emb2   # element-wise difference
    prod = emb1 * emb2   # element-wise product
    dot = np.dot(emb1, emb2)

    # Product of the two Euclidean norms: sqrt(<a, a> * <b, b>).
    norm_product = (np.dot(emb1, emb1) * np.dot(emb2, emb2)) ** 0.5

    features = {}

    # Cosine similarity
    features['cosine_sim'] = dot / norm_product if norm_product > 0 else 0.0

    # Euclidean distance: sqrt(<diff, diff>)
    features['euclidean_dist'] = np.dot(diff, diff) ** 0.5

    # Manhattan distance
    features['manhattan_dist'] = np.sum(np.abs(diff))

    # Dot product
    features['dot_product'] = dot

    # Element-wise difference statistics
    features['diff_mean'] = np.mean(diff)
    features['diff_std'] = np.std(diff)
    features['diff_max'] = np.max(np.abs(diff))

    # Element-wise product statistics
    features['prod_mean'] = np.mean(prod)
    features['prod_std'] = np.std(prod)

    return features

# Compute the similarity features for every training pair.
print("Вычисление similarity признаков...")

# One feature dict per (text1 embedding, text2 embedding) pair, in row order.
similarity_features = [
    compute_similarity_features(first_emb, second_emb)
    for first_emb, second_emb in zip(embeddings1_train, embeddings2_train)
]

similarity_features_df = pd.DataFrame(similarity_features)

n_similarity_columns = len(similarity_features_df.columns)
print(f"✓ Создано {n_similarity_columns} similarity признаков")
print(similarity_features_df.head())

## 7. Объединение всех признаков

In [None]:
# Raw embedding information as features: the element-wise difference of the
# two embeddings, one column per embedding dimension.
embedding_diff = embeddings1_train - embeddings2_train

# Convert to a DataFrame
embedding_diff_df = pd.DataFrame(
    embedding_diff,
    columns=[f'emb_diff_{i}' for i in range(embedding_diff.shape[1])]
)

# Optional: the concatenated embeddings can be added as well (this contributes
# twice as many columns as the difference). The concatenation is built only
# inside this opt-in snippet so the default path does not allocate a large
# array it never uses.
# embedding_concat = np.concatenate([embeddings1_train, embeddings2_train], axis=1)
# embedding_concat_df = pd.DataFrame(
#     embedding_concat,
#     columns=[f'emb_concat_{i}' for i in range(embedding_concat.shape[1])]
# )

# Combine all feature groups column-wise. Indices are reset so the frames
# align by row position.
X = pd.concat([
    train_features.reset_index(drop=True),
    similarity_features_df.reset_index(drop=True),
    embedding_diff_df.reset_index(drop=True)
], axis=1)

# Target vector: the encoded class labels.
y = train_df['label_encoded'].values

print(f"\n✓ Финальный набор признаков: {X.shape}")
print(f"Количество классов: {len(np.unique(y))}")
print(f"Признаки: {X.columns.tolist()[:20]}...")

## 8. Train/Val split

In [None]:
# Hold out 20% of the rows for validation, stratified by class label,
# with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print(f"Train: {X_train.shape}, Val: {X_val.shape}")

# Class balance of the training part, ordered by class code.
print(f"\nРаспределение классов в train:")
train_class_counts = pd.Series(y_train).value_counts().sort_index()
print(train_class_counts)

## 9. Обучение CatBoost

In [None]:
# CatBoost for multiclass classification.
# GPU availability is probed with catboost.utils.get_gpu_device_count();
# the top-level `catboost` module exposes no `cuda` attribute, so
# `cb.cuda.is_cuda_available()` raises AttributeError.
from catboost.utils import get_gpu_device_count

catboost_params = {
    'iterations': 1500,
    'learning_rate': 0.05,
    'depth': 7,
    'loss_function': 'MultiClass',
    'eval_metric': 'TotalF1',  # or 'Accuracy', 'AUC:type=Mu'
    'random_seed': 42,
    'verbose': 200,
    'early_stopping_rounds': 100,
    'task_type': 'GPU' if get_gpu_device_count() > 0 else 'CPU'
}

cat_model = cb.CatBoostClassifier(**catboost_params)

# Fit with the validation split as eval set; use_best_model keeps the
# iteration that scored best on it.
print("Обучение CatBoost...\n")
cat_model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    use_best_model=True,
    verbose=200
)

print("\n✓ CatBoost обучен!")

## 10. Оценка модели

In [None]:
# Predictions on the validation split; flatten() collapses the predicted
# labels to a 1-D array.
y_pred = cat_model.predict(X_val).flatten()
y_pred_proba = cat_model.predict_proba(X_val)

# Metrics
accuracy = accuracy_score(y_val, y_pred)
f1_macro = f1_score(y_val, y_pred, average='macro')
f1_weighted = f1_score(y_val, y_pred, average='weighted')

print("\n" + "="*60)
print("РЕЗУЛЬТАТЫ ВАЛИДАЦИИ")
print("="*60)
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score (macro): {f1_macro:.4f}")
print(f"F1-score (weighted): {f1_weighted:.4f}")
print("="*60)

# Class names listed by encoded label (position i is the name of code i).
target_names = ['no_relevant', 'relevant_minus', 'relevant', 'relevant_plus']
# Pass the class codes explicitly: if a class is missing from both y_val and
# y_pred, the report would otherwise fail on the target_names length and the
# confusion matrix would shrink and no longer line up with the tick labels.
class_codes = list(range(len(target_names)))

# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred, labels=class_codes, target_names=target_names))

# Confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns

cm = confusion_matrix(y_val, y_pred, labels=class_codes)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

## 11. Feature Importance

In [None]:
# Pair every feature column with its CatBoost importance, highest first.
importance_values = cat_model.get_feature_importance()
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': importance_values
}).sort_values('importance', ascending=False)

# The 20 highest-ranked features drive both the printout and the chart.
top_features = feature_importance.head(20)

print("\nТоп-20 важных признаков:")
print(top_features)

# Horizontal bar chart with the most important feature at the top.
plt.figure(figsize=(10, 8))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('Топ-20 важных признаков')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 12. Подготовка test данных

In [None]:
print("Подготовка test данных...")

# Text features. Each frame is built directly from the list of per-row
# feature dicts; chaining `.apply(pd.Series)` instead would construct one
# Series per row, which is much slower on large inputs. Integer-valued
# features keep an integer dtype this way.
test_text1_features = pd.DataFrame(
    [extract_text_features(text) for text in test_df['text1']],
    index=test_df.index,
).add_prefix('text1_')

test_text2_features = pd.DataFrame(
    [extract_text_features(text) for text in test_df['text2']],
    index=test_df.index,
).add_prefix('text2_')

test_pair_features = pd.DataFrame(
    [
        extract_pair_features(first, second)
        for first, second in zip(test_df['text1'], test_df['text2'])
    ],
    index=test_df.index,
)

test_features = pd.concat([test_text1_features, test_text2_features, test_pair_features], axis=1)

# Embeddings
print("\nГенерация эмбеддингов для test...")
test_text1 = test_df['text1'].astype(str).tolist()
test_text2 = test_df['text2'].astype(str).tolist()

embeddings1_test = model.encode(test_text1, convert_to_numpy=True, show_progress_bar=True, batch_size=32)
embeddings2_test = model.encode(test_text2, convert_to_numpy=True, show_progress_bar=True, batch_size=32)

# Similarity features: one dict per embedding pair, in row order.
test_similarity_features = [
    compute_similarity_features(first_emb, second_emb)
    for first_emb, second_emb in zip(embeddings1_test, embeddings2_test)
]

test_similarity_features_df = pd.DataFrame(test_similarity_features)

# Embedding difference, one column per embedding dimension.
test_embedding_diff = embeddings1_test - embeddings2_test
test_embedding_diff_df = pd.DataFrame(
    test_embedding_diff,
    columns=[f'emb_diff_{i}' for i in range(test_embedding_diff.shape[1])]
)

# Combine the feature groups in the order text, similarity, embedding
# difference. Indices are reset so the frames align by row position.
X_test = pd.concat([
    test_features.reset_index(drop=True),
    test_similarity_features_df.reset_index(drop=True),
    test_embedding_diff_df.reset_index(drop=True)
], axis=1)

print(f"\n✓ Test данные готовы: {X_test.shape}")

## 13. Предсказания на test

In [None]:
# Predict class codes (flattened to 1-D) and class probabilities for the test features.
test_predictions = cat_model.predict(X_test).flatten()
test_predictions_proba = cat_model.predict_proba(X_test)

# Invert the label mapping (code -> label name) and decode every prediction.
reverse_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))
test_predictions_labels = [
    reverse_label_mapping[predicted_code] for predicted_code in test_predictions
]

print("\n✓ Предсказания готовы!")

# How many test rows fall into each predicted class.
print(f"\nРаспределение предсказанных классов:")
predicted_counts = pd.Series(test_predictions_labels).value_counts()
print(predicted_counts)

## 14. Submission

In [None]:
# Assemble the submission table: one row per test example, identified by the
# test DataFrame's index.
submission = pd.DataFrame({
    'id': test_df.index,  # or test_df['id']
    'prediction': test_predictions_labels
})

# Write the CSV without the DataFrame index column.
output_path = 'text_similarity_4class_submission.csv'
submission.to_csv(output_path, index=False)
print("\n✓ Submission сохранен!")
print(submission.head(10))

# Final distribution of the predicted labels.
print(f"\nИтоговое распределение:")
final_counts = submission['prediction'].value_counts()
print(final_counts)