# NLP Pipeline - Комплексный пайплайн для обработки текста

Пайплайн включает:
- Text Preprocessing
- Feature Extraction (TF-IDF, Word2Vec, BERT)
- Text Classification
- NER (Named Entity Recognition)
- Sentiment Analysis
- Text Generation

In [None]:
!pip install pandas numpy scikit-learn transformers torch nltk spacy gensim textblob -q
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import numpy as np
import re
import string
from typing import List, Dict

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import spacy

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec, FastText
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# ML
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Transformers
import torch
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForSequenceClassification,
    pipeline, Trainer, TrainingArguments
)

# Sentiment
from textblob import TextBlob

import warnings
warnings.filterwarnings('ignore')

# Download the NLTK resources used by this notebook.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# recent NLTK releases look up for tokenisation / POS tagging, and 'omw-1.4' is
# required by the WordNet lemmatizer on some versions; the legacy names are
# kept so older installations keep working. A name that a given NLTK version
# does not know is simply reported as not downloaded.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource, quiet=True)

# Load the spaCy English model (installed by the setup cell above)
nlp = spacy.load('en_core_web_sm')

print("✓ Библиотеки загружены!")
print(f"CUDA available: {torch.cuda.is_available()}")

## 1. Загрузка данных

In [None]:
# === YOUR DATA ===
# Column names used throughout the notebook
TEXT_COL, TARGET_COL, ID_COL = 'text', 'label', 'id'

# Both splits are read from CSV files in the working directory
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print("\nПервые строки:")
print(train_df.head())

# Class distribution (only when the target column is present)
target_available = TARGET_COL in train_df.columns
if target_available:
    print("\nРаспределение классов:")
    print(train_df[TARGET_COL].value_counts())

## 2. Text Preprocessing

In [None]:
class TextPreprocessor:
    """Configurable text cleaning and token normalisation.

    ``clean_text`` strips markup and noise with regular expressions,
    ``tokenize_and_process`` tokenises with NLTK and optionally drops stop
    words, lemmatises and stems, and ``preprocess`` chains the two.

    Args:
        lowercase: Lower-case the text before any other cleaning step.
        remove_punctuation: Delete every character in ``string.punctuation``.
        remove_numbers: Delete runs of digits.
        remove_stopwords: Drop tokens found in the NLTK stop-word list.
        lemmatize: Apply ``WordNetLemmatizer`` to every token.
        stem: Apply ``PorterStemmer`` to every token (after lemmatisation).
        language: Name passed to ``stopwords.words`` to pick the stop-word list.
    """

    # Patterns and the punctuation table are built once for the class instead
    # of being rebuilt on every clean_text() call.
    _HTML_TAG_RE = re.compile(r'<.*?>')
    # Require "://" or "www." so ordinary words that merely start with
    # "http"/"www" (e.g. "httpd") are not deleted.
    _URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _MENTION_RE = re.compile(r'@\w+')
    _HASHTAG_RE = re.compile(r'#\w+')
    _DIGITS_RE = re.compile(r'\d+')
    _WHITESPACE_RE = re.compile(r'\s+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self,
                 lowercase=True,
                 remove_punctuation=True,
                 remove_numbers=False,
                 remove_stopwords=True,
                 lemmatize=True,
                 stem=False,
                 language='english'):

        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.remove_numbers = remove_numbers
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        self.stem = stem

        # A set gives O(1) membership tests in the stop-word filter
        self.stop_words = set(stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    def clean_text(self, text: str) -> str:
        """Return ``text`` with markup, links, handles and noise removed.

        Steps, in order: optional lower-casing, HTML tags, URLs, e-mail
        addresses, ``@mentions``, ``#hashtags``, optional digits, optional
        punctuation, whitespace collapsing.

        ``None`` and float NaN (how pandas represents a missing cell) yield
        an empty string; any other non-string value is converted with
        ``str()`` so a numeric cell does not crash the pipeline.
        """
        if text is None or (isinstance(text, float) and text != text):
            return ''
        if not isinstance(text, str):
            text = str(text)

        if self.lowercase:
            text = text.lower()

        # Tags go first: running the URL pattern first would swallow
        # everything after an href up to the next whitespace, including the
        # closing '>' and the link text. A space is substituted so words
        # separated only by a tag ("a<br>b") do not get glued together.
        text = self._HTML_TAG_RE.sub(' ', text)
        text = self._URL_RE.sub('', text)
        # E-mails before mentions, otherwise "@domain" would be stripped and
        # the local part of the address left behind.
        text = self._EMAIL_RE.sub('', text)
        text = self._MENTION_RE.sub('', text)
        text = self._HASHTAG_RE.sub('', text)

        if self.remove_numbers:
            text = self._DIGITS_RE.sub('', text)

        if self.remove_punctuation:
            text = text.translate(self._PUNCT_TABLE)

        # Collapse the gaps left by the removals above
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def tokenize_and_process(self, text: str) -> List[str]:
        """Tokenise ``text`` and apply the enabled token-level steps.

        Returns:
            The tokens after optional stop-word removal, lemmatisation and
            stemming (applied in that order).
        """
        tokens = word_tokenize(text)

        if self.remove_stopwords:
            # Compare case-insensitively so "The" is still filtered when the
            # instance was created with lowercase=False.
            tokens = [t for t in tokens if t.lower() not in self.stop_words]

        if self.lemmatize:
            tokens = [self.lemmatizer.lemmatize(t) for t in tokens]

        if self.stem:
            tokens = [self.stemmer.stem(t) for t in tokens]

        return tokens

    def preprocess(self, text: str, return_tokens=False) -> "str | List[str]":
        """Run ``clean_text`` followed by ``tokenize_and_process``.

        Args:
            text: Raw input text.
            return_tokens: When true, return the token list instead of a
                single space-joined string.

        Returns:
            The processed tokens joined by single spaces, or the token list
            itself when ``return_tokens`` is true.
        """
        tokens = self.tokenize_and_process(self.clean_text(text))

        if return_tokens:
            return tokens
        return ' '.join(tokens)

# Build the preprocessor
preprocessor = TextPreprocessor(
    lowercase=True,
    remove_punctuation=True,
    remove_stopwords=True,
    lemmatize=True
)

# Normalise the raw text column first: a missing cell is read by pandas as a
# float NaN, which would crash every string operation applied to this column
# (here and in the later cells that read TEXT_COL directly).
train_df[TEXT_COL] = train_df[TEXT_COL].fillna('').astype(str)
test_df[TEXT_COL] = test_df[TEXT_COL].fillna('').astype(str)

# Apply the preprocessing to both splits
print("\nПрименение предобработки...")
train_df['text_clean'] = train_df[TEXT_COL].apply(preprocessor.preprocess)
test_df['text_clean'] = test_df[TEXT_COL].apply(preprocessor.preprocess)

print("\n✓ Предобработка завершена!")
print(f"\nПример:\nОригинал: {train_df[TEXT_COL].iloc[0]}")
print(f"Обработанный: {train_df['text_clean'].iloc[0]}")

## 3. Feature Extraction - TF-IDF

In [None]:
# TF-IDF vectorisation
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    min_df=2,
    max_df=0.9
)

# Fit on the training split only; the test split reuses the learned vocabulary
X_train_tfidf = tfidf.fit_transform(train_df['text_clean'])
X_test_tfidf = tfidf.transform(test_df['text_clean'])
y_train = train_df[TARGET_COL]

print(f"TF-IDF shape: Train {X_train_tfidf.shape}, Test {X_test_tfidf.shape}")
print(f"\nТоп-20 TF-IDF признаков:")
feature_names = tfidf.get_feature_names_out()
# Slicing feature_names[:20] only shows the first 20 columns of the matrix,
# not the most important terms. Rank the columns by their mean TF-IDF weight
# over the training documents so the printout matches its "top-20" heading.
mean_tfidf_weights = np.asarray(X_train_tfidf.mean(axis=0)).ravel()
top_feature_idx = np.argsort(mean_tfidf_weights)[::-1][:20]
print(feature_names[top_feature_idx])

## 4. Feature Extraction - Word2Vec

In [None]:
# Tokenised corpora for Word2Vec: one whitespace-split token list per document
train_tokens = [document.split() for document in train_df['text_clean']]
test_tokens = [document.split() for document in test_df['text_clean']]

# Train the embedding model on the training split
w2v_params = dict(vector_size=100, window=5, min_count=2, workers=4, epochs=10)
w2v_model = Word2Vec(sentences=train_tokens, **w2v_params)

def get_w2v_features(tokens, model, vector_size=100):
    """Return the mean Word2Vec vector of a tokenised document.

    Args:
        tokens: Iterable of string tokens.
        model: Object exposing its keyed vectors as ``model.wv`` (supports
            ``token in model.wv`` and ``model.wv[token]``).
        vector_size: Fallback dimensionality of the zero vector, used only
            when ``model.wv`` does not expose ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or
        a zero vector when none of the tokens is in the vocabulary.
    """
    keyed_vectors = model.wv
    vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

    if vectors:
        return np.mean(vectors, axis=0)

    # Size the fallback from the model itself: a hard-coded 100 yields rows of
    # mismatched length as soon as the model is trained with another size.
    dim = getattr(keyed_vectors, 'vector_size', vector_size)
    return np.zeros(dim)

# One averaged document vector per row, for each split
train_doc_vectors = [get_w2v_features(doc_tokens, w2v_model) for doc_tokens in train_tokens]
test_doc_vectors = [get_w2v_features(doc_tokens, w2v_model) for doc_tokens in test_tokens]
X_train_w2v = np.array(train_doc_vectors)
X_test_w2v = np.array(test_doc_vectors)

print(f"✓ Word2Vec готов! Shape: Train {X_train_w2v.shape}, Test {X_test_w2v.shape}")

## 5. Feature Extraction - BERT Embeddings

In [None]:
# Pretrained encoder used for feature extraction
bert_model_name = 'bert-base-uncased'  # or 'distilbert-base-uncased' for speed
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the weights, move them to the chosen device and switch to inference mode
bert_model = AutoModel.from_pretrained(bert_model_name).to(device)
bert_model.eval()

def get_bert_embedding(text, tokenizer, model, max_length=128):
    """Return the embedding of the first token of ``text`` as a 1-D array.

    Args:
        text: Input text handed to ``tokenizer``.
        tokenizer: Callable returning a mapping of PyTorch tensors when
            called with ``return_tensors='pt'``.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.
        max_length: Length the input is padded / truncated to.

    Returns:
        NumPy array holding ``last_hidden_state[0, 0, :]`` (the position of
        the [CLS] token for BERT-style tokenizers).
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    # Send the inputs to wherever the model's weights live. Relying on a
    # module-level `device` breaks as soon as a model on another device is
    # passed in, or the function is used outside this notebook.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**encoded)

    # First-token embedding of the single sequence in the batch
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]

# Embed only a small sample (at most the first 100 rows) as a demonstration
print("Извлечение BERT embeddings...")
sample_size = min(100, len(train_df))
sample_texts = train_df[TEXT_COL][:sample_size]
sample_embeddings = [
    get_bert_embedding(text, tokenizer, bert_model) for text in sample_texts
]
X_train_bert_sample = np.array(sample_embeddings)

print(f"✓ BERT embeddings готовы! Shape: {X_train_bert_sample.shape}")
print("Для полного датасета используйте батч-обработку")

## 6. Text Classification - Traditional ML

In [None]:
# Hold out 20% of the training data for validation, stratified by class
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_tfidf,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Candidate models, keyed by display name
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Naive Bayes'] = MultinomialNB()
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Fit every candidate and record its validation accuracy
results = {}
for name in models:
    model = models[name]
    print(f"\nОбучение {name}...")
    model.fit(X_tr, y_tr)

    val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, val_pred)
    results[name] = acc

    print(f"{name} Validation Accuracy: {acc:.4f}")
    print(f"\nClassification Report:\n{classification_report(y_val, val_pred)}")

# Summary, best accuracy first
separator = "=" * 60
print("\n" + separator)
print("СРАВНЕНИЕ МОДЕЛЕЙ")
print(separator)
for name in sorted(results, key=results.get, reverse=True):
    acc = results[name]
    print(f"{name}: {acc:.4f}")

## 7. Text Classification - BERT Fine-tuning

In [None]:
from torch.utils.data import Dataset as TorchDataset
from sklearn.preprocessing import LabelEncoder

# Map the raw target values to consecutive integer class ids
label_encoder = LabelEncoder().fit(train_df[TARGET_COL])
train_df['label_encoded'] = label_encoder.transform(train_df[TARGET_COL])
num_labels = len(label_encoder.classes_)

class TextClassificationDataset(TorchDataset):
    """Map-style dataset that tokenises one text per item.

    Args:
        texts: Iterable of texts (each is converted with ``str()``).
        labels: Iterable of integer class ids, same length as ``texts``.
        tokenizer: Callable returning ``input_ids`` and ``attention_mask``
            tensors when called with ``return_tensors='pt'``.
        max_length: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise as lists so __getitem__ always indexes by position.
        # A pandas Series is indexed by *label* with [], so a Series with a
        # non-default index (e.g. after a train/validation split) would raise
        # KeyError or silently return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenised text and its label as a dict of tensors."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch dimension of 1;
        # flatten them so the DataLoader can stack items itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Build the training dataset from the first 1000 rows only (demo subset)
train_texts, train_labels = (
    train_df[column].tolist()[:1000] for column in (TEXT_COL, 'label_encoded')
)

train_dataset = TextClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)

print(f"✓ Dataset готов! Размер: {len(train_dataset)}")

In [None]:
# Load the pretrained encoder with a classification head sized for our labels
bert_classifier = AutoModelForSequenceClassification.from_pretrained(
    bert_model_name,
    num_labels=num_labels
).to(device)

# Training arguments.
# No evaluation strategy is requested: the Trainer below is given no
# eval_dataset, and asking for per-epoch evaluation without one makes the
# Trainer fail. The `evaluation_strategy` keyword has also been renamed in
# newer transformers releases, so leaving it at its default keeps this cell
# working across versions. Add an eval_dataset and re-enable evaluation
# together if validation metrics are needed.
training_args = TrainingArguments(
    output_dir='./bert_classifier',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    logging_steps=50,
    save_strategy='epoch'
)

# Trainer
trainer = Trainer(
    model=bert_classifier,
    args=training_args,
    train_dataset=train_dataset
)

print("\nЗапуск обучения BERT...")
# trainer.train()  # Uncomment to run the fine-tuning
print("Обучение закомментировано. Раскомментируйте для запуска.")

## 8. Named Entity Recognition (NER)

In [None]:
# NER with spaCy
def extract_entities(text):
    """Return ``(entity text, entity label)`` pairs found in ``text``.

    Runs the module-level spaCy pipeline ``nlp`` over the text and collects
    one tuple per entity in ``doc.ents``.
    """
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

# Demo on the first training text
sample_text = train_df[TEXT_COL].iloc[0]
entities = extract_entities(sample_text)

print(f"Текст: {sample_text}\n")
print("Найденные сущности:")
for entity, label in entities:
    print(f"  {entity} -> {label}")

# NER with a transformers pipeline; sub-word pieces are merged into entity groups
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
ner_results = ner_pipeline(sample_text)

print("\nNER с BERT:")
for entity in ner_results:
    print(f"  {entity['word']} -> {entity['entity_group']} (score: {entity['score']:.3f})")

## 9. Sentiment Analysis

In [None]:
# Sentiment with TextBlob
def get_sentiment_textblob(text):
    """Classify ``text`` by the sign of its TextBlob polarity.

    Returns:
        Dict with ``sentiment`` ('positive' for polarity above zero,
        'negative' for polarity below zero, 'neutral' otherwise) and the
        raw ``polarity`` and ``subjectivity`` scores.
    """
    scores = TextBlob(text).sentiment
    polarity = scores.polarity          # -1 (negative) to 1 (positive)
    subjectivity = scores.subjectivity  # 0 (objective) to 1 (subjective)

    sentiment = (
        'positive' if polarity > 0
        else 'negative' if polarity < 0
        else 'neutral'
    )

    return dict(sentiment=sentiment, polarity=polarity, subjectivity=subjectivity)

# Sentiment with a transformers pipeline
sentiment_pipeline = pipeline(
    "sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english"
)

# Example inputs: one positive, one negative, one lukewarm
sample_texts = [
    "This is an amazing product! I love it!",
    "Terrible experience. Would not recommend.",
    "It's okay, nothing special.",
]

# Show both analysers side by side for every example
print("Sentiment Analysis:\n")
for text in sample_texts:
    textblob_result = get_sentiment_textblob(text)
    transformer_result = sentiment_pipeline(text)[0]

    print(f"Text: {text}")
    print(f"  TextBlob: {textblob_result['sentiment']} (polarity: {textblob_result['polarity']:.2f})")
    print(f"  Transformer: {transformer_result['label']} (score: {transformer_result['score']:.3f})")
    print()

## 10. Additional Text Features

In [None]:
def extract_text_features(text):
    """Compute simple statistical features of a raw text.

    ``None`` and float NaN (how pandas represents a missing cell) are treated
    as an empty text; any other non-string value is converted with ``str()``.

    Returns:
        Dict with, in this order: ``char_count``, ``word_count``,
        ``sentence_count``, ``avg_word_length``, ``punctuation_count``,
        ``upper_count``, ``polarity``, ``subjectivity`` and
        ``unique_word_ratio``. The ratio and the average word length are 0
        for a text with no words.
    """
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    # Split once and reuse for every word-based feature
    words = text.split()
    punctuation = set(string.punctuation)
    sentiment = TextBlob(text).sentiment

    return {
        # Lengths
        'char_count': len(text),
        'word_count': len(words),
        'sentence_count': len(sent_tokenize(text)),
        # Mean word length
        'avg_word_length': np.mean([len(w) for w in words]) if words else 0,
        # Character-class counts
        'punctuation_count': sum(1 for c in text if c in punctuation),
        'upper_count': sum(1 for c in text if c.isupper()),
        # TextBlob sentiment scores
        'polarity': sentiment.polarity,
        'subjectivity': sentiment.subjectivity,
        # Share of distinct words
        'unique_word_ratio': len(set(words)) / len(words) if words else 0,
    }

# Compute the handcrafted features for every raw text in both splits
print("Извлечение дополнительных признаков...")
text_features_train, text_features_test = (
    frame[TEXT_COL].apply(extract_text_features) for frame in (train_df, test_df)
)

# One row per document, one column per feature
features_train_df = pd.DataFrame(list(text_features_train))
features_test_df = pd.DataFrame(list(text_features_test))

print(f"\n✓ Признаки извлечены! Shape: {features_train_df.shape}")
print(f"\nПризнаки: {features_train_df.columns.tolist()}")

## 11. Ensemble с разными фичами

In [None]:
from scipy.sparse import hstack

# Local import: only this cell needs the scaler
from sklearn.preprocessing import StandardScaler

# The handcrafted features are raw counts (characters, words, ...) whose
# magnitude dwarfs the TF-IDF weights and embedding averages. Standardise
# them, fitting on the training split only, so the regularised logistic
# regression is not dominated by them and can converge.
text_feature_scaler = StandardScaler()
scaled_features_train = text_feature_scaler.fit_transform(features_train_df.values)
scaled_features_test = text_feature_scaler.transform(features_test_df.values)

# Combine the feature groups column-wise:
# TF-IDF + Word2Vec + text features.
# hstack returns COO by default; convert to CSR, which supports the row
# slicing and arithmetic the estimators rely on.
X_train_combined = hstack([
    X_train_tfidf,
    X_train_w2v,
    scaled_features_train
]).tocsr()

X_test_combined = hstack([
    X_test_tfidf,
    X_test_w2v,
    scaled_features_test
]).tocsr()

print(f"Combined features shape: Train {X_train_combined.shape}, Test {X_test_combined.shape}")

# Train on the combined features
lr_combined = LogisticRegression(max_iter=1000, random_state=42)
lr_combined.fit(X_train_combined, y_train)

# Predictions for the test split
test_predictions = lr_combined.predict(X_test_combined)

print("\n✓ Модель на комбинированных признаках обучена!")

## 12. Submission

In [None]:
# Use the model with the highest validation accuracy recorded in `results`
# instead of a hard-coded choice; on a tie the model trained first wins.
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"Лучшая модель: {best_model_name} (validation accuracy: {results[best_model_name]:.4f})")
final_predictions = best_model.predict(X_test_tfidf)

# Decode the labels if the model was trained on encoded targets
# final_predictions = label_encoder.inverse_transform(final_predictions)

submission = pd.DataFrame({
    ID_COL: test_df[ID_COL],
    'prediction': final_predictions
})

submission.to_csv('nlp_submission.csv', index=False)
print("\n✓ Submission сохранен!")
print(submission.head())
print(f"\nРаспределение предсказаний:")
print(submission['prediction'].value_counts())