In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, accuracy_score
import matplotlib.pyplot as plt
import re
from collections import Counter
import json

In [3]:
# Load the train / test / validation splits from CSV files in the working directory.
train, test, val = (
    pd.read_csv(csv_path)
    for csv_path in ('train_dataset.csv', 'test_dataset.csv', 'val_dataset.csv')
)

In [2]:
# data preparation
def preprocess_text(text):
    """Basic text normalisation.

    Lowercases the input and removes every character that is neither a word
    character (``\\w``) nor whitespace. Any non-string value yields ``""``.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

In [4]:
# apply the preprocessing to the 'text' column of every split (overwrites the column)
for split_df in (train, val, test):
    split_df['text'] = split_df['text'].map(preprocess_text)

In [5]:
# encode the labels
label_encoder = LabelEncoder()

# The encoder is fitted on the label column of all three splits together.
all_labels = pd.concat([split['cefr_level'] for split in (train, val, test)])
label_encoder.fit(all_labels)

# Add an integer 'label' column to each split.
for split in (train, val, test):
    split['label'] = label_encoder.transform(split['cefr_level'])

num_classes = len(label_encoder.classes_)
print(f"Number of classes: {num_classes}")
print(f"Classes: {label_encoder.classes_}")

Number of classes: 6
Classes: ['A1' 'A2' 'B1' 'B2' 'C1' 'C2']


In [6]:
# build the vocabulary
def build_vocab(texts, max_vocab_size=20000):
    """Build a token -> index mapping from whitespace-tokenised texts.

    Args:
        texts: iterable of strings; each is split on whitespace.
        max_vocab_size: upper bound on the vocabulary size, *including* the
            two reserved entries ``'<PAD>'`` (index 0) and ``'<UNK>'`` (index 1).

    Returns:
        dict mapping each kept token to its integer index. The most frequent
        tokens receive the lowest indices, starting at 2.
    """
    specials = ('<PAD>', '<UNK>')

    counter = Counter()
    for text in texts:
        counter.update(text.split())

    # A corpus token that happens to equal a reserved token must not be ranked:
    # otherwise it would overwrite index 0 / 1 and break padding and the
    # unknown-token fallback.
    for token in specials:
        counter.pop(token, None)

    vocab = {token: idx for idx, token in enumerate(specials)}
    n_regular = max(max_vocab_size - len(specials), 0)
    for idx, (word, _count) in enumerate(counter.most_common(n_regular), start=len(specials)):
        vocab[word] = idx

    return vocab

In [7]:
# The vocabulary is built from the training split only.
vocab = build_vocab(texts=train['text'], max_vocab_size=40000)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 40000


In [8]:
# dataset class
class TextDataset(Dataset):
    """Dataset that turns DataFrame rows into fixed-length token-id tensors.

    Each item is a dict with:
      * ``'text'``  - ``torch.long`` tensor of vocabulary ids, obtained by
        whitespace-splitting the row's ``'text'`` value, truncating to
        ``max_length`` tokens and right-padding with the ``'<PAD>'`` id;
        tokens missing from ``vocab`` map to the ``'<UNK>'`` id.
      * ``'label'`` - ``torch.long`` scalar tensor built from the row's ``'label'`` value.
    """

    def __init__(self, df, vocab, max_length=256):
        self.df, self.vocab, self.max_length = df, vocab, max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Fetch the row once and read both fields from it.
        row = self.df.iloc[idx]

        # tokenise and convert to indices
        words = row['text'].split()[:self.max_length]
        token_ids = [self.vocab.get(word, self.vocab['<UNK>']) for word in words]

        # pad up to max_length (or clip if nothing needs to be added)
        shortfall = self.max_length - len(token_ids)
        if shortfall > 0:
            token_ids = token_ids + [self.vocab['<PAD>']] * shortfall
        else:
            token_ids = token_ids[:self.max_length]

        return {
            'text': torch.tensor(token_ids, dtype=torch.long),
            'label': torch.tensor(row['label'], dtype=torch.long),
        }

In [9]:
# create the datasets and data loaders
max_length = 256
batch_size = 32

train_dataset, val_dataset, test_dataset = (
    TextDataset(split, vocab, max_length) for split in (train, val, test)
)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for dataset in (val_dataset, test_dataset)
)

In [10]:
# model training
def train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=0.001):
    """Train ``model`` with AdamW and cross-entropy, validating after every epoch.

    Args:
        model: module called as ``model(batch['text'])`` and expected to return class logits.
        train_loader: sized iterable of batches (dicts with ``'text'`` and ``'label'`` tensors).
        val_loader: sized iterable of batches in the same format.
        num_epochs: number of passes over ``train_loader``.
        learning_rate: learning rate passed to ``optim.AdamW``.

    Returns:
        ``(train_losses, val_losses, val_accuracies)`` - three lists with one entry
        per epoch: mean per-batch training loss, mean per-batch validation loss and
        validation accuracy (correct / total samples).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

    train_losses, val_losses, val_accuracies = [], [], []

    for epoch in range(num_epochs):
        # ---- optimisation pass ----
        model.train()
        running_train = 0
        for batch in train_loader:
            inputs, targets = batch['text'].to(device), batch['label'].to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()

            running_train += batch_loss.item()

        # ---- validation pass (no gradients) ----
        model.eval()
        running_val, hits, seen = 0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                inputs, targets = batch['text'].to(device), batch['label'].to(device)

                logits = model(inputs)
                running_val += criterion(logits, targets).item()

                predicted = torch.max(logits, 1)[1]
                seen += targets.size(0)
                hits += (predicted == targets).sum().item()

        train_losses.append(running_train / len(train_loader))
        val_losses.append(running_val / len(val_loader))
        val_acc = hits / seen
        val_accuracies.append(val_acc)

        # log the metrics of every epoch
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}, Val Acc: {val_acc:.2f}')

    return train_losses, val_losses, val_accuracies

## CNN

In [11]:
class TextCNN(nn.Module):
    """Convolutional text classifier with three parallel 1-D convolutions.

    Token ids are embedded, passed through three ``Conv1d`` branches
    (kernel sizes 3, 4 and 5, 100 filters each), ReLU-activated, max-pooled
    over the sequence axis, concatenated into a 300-dim vector, regularised
    with dropout (p=0.5) and projected to ``num_classes`` logits.
    """

    def __init__(self, vocab_size, embedding_dim=128, num_classes=6):
        super().__init__()
        n_filters = 100

        # embedding layer; index 0 is the padding id
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # parallel convolutions with different kernel widths
        self.conv1 = nn.Conv1d(embedding_dim, n_filters, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embedding_dim, n_filters, kernel_size=4, padding=2)
        self.conv3 = nn.Conv1d(embedding_dim, n_filters, kernel_size=5, padding=2)

        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(3 * n_filters, num_classes)

        self.pool = nn.AdaptiveMaxPool1d(1)
        self.relu = nn.ReLU()

    def forward(self, x):
        # (batch, seq, emb) -> (batch, emb, seq): Conv1d convolves over the last axis
        features = self.embedding(x).transpose(1, 2)

        # conv -> ReLU -> global max pool, once per branch
        pooled = [
            self.pool(self.relu(conv(features))).squeeze(-1)
            for conv in (self.conv1, self.conv2, self.conv3)
        ]

        # merge the branch features and classify
        merged = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(merged)

In [12]:
# create and train the model
# NOTE(review): in the recorded run below, validation loss bottoms out around
# epoch 4 and rises afterwards while training loss keeps falling - consider
# early stopping or keeping the best checkpoint.
cnn_model = TextCNN(vocab_size=vocab_size, embedding_dim=256, num_classes=num_classes)
train_losses, val_losses, val_accuracies = train_model(
    model=cnn_model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
)

Epoch 1/10:
Train Loss: 1.7262, Val Loss: 1.4937, Val Acc: 0.40
Epoch 2/10:
Train Loss: 1.3870, Val Loss: 1.3914, Val Acc: 0.44
Epoch 3/10:
Train Loss: 1.0959, Val Loss: 1.3266, Val Acc: 0.48
Epoch 4/10:
Train Loss: 0.8070, Val Loss: 1.2999, Val Acc: 0.51
Epoch 5/10:
Train Loss: 0.5874, Val Loss: 1.3608, Val Acc: 0.52
Epoch 6/10:
Train Loss: 0.4171, Val Loss: 1.4917, Val Acc: 0.53
Epoch 7/10:
Train Loss: 0.3078, Val Loss: 1.6042, Val Acc: 0.55
Epoch 8/10:
Train Loss: 0.2459, Val Loss: 1.7174, Val Acc: 0.55
Epoch 9/10:
Train Loss: 0.1962, Val Loss: 1.8655, Val Acc: 0.54
Epoch 10/10:
Train Loss: 0.1637, Val Loss: 2.0154, Val Acc: 0.55


In [13]:
def predict(model, dataloader):
    """Run ``model`` in eval mode over ``dataloader`` and collect its predictions.

    Args:
        model: module called as ``model(batch['text'])`` that returns class logits.
        dataloader: iterable of batches; only the ``'text'`` entry of each batch is read.

    Returns:
        ``(all_predictions, all_probabilities)`` - two lists with one entry per
        sample, in iteration order: the predicted class index (numpy integer)
        and the softmax probability vector (numpy array).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    all_predictions = []
    all_probabilities = []

    with torch.no_grad():
        # Iterate over the loader that was passed in; the previous version read
        # the global `test_loader` and ignored this argument.
        for batch in dataloader:
            texts = batch['text'].to(device)

            outputs = model(texts)

            probabilities = torch.nn.functional.softmax(outputs, dim=1)
            predicted = outputs.argmax(dim=1)

            all_predictions.extend(predicted.cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())

    return all_predictions, all_probabilities

In [14]:
# Predicted class ids and class-probability vectors for the test loader.
test_predictions, test_probabilities = predict(model=cnn_model, dataloader=test_loader)

In [24]:
# Report accuracy and macro-averaged F1 of the predictions against the encoded test labels.
print(f'''Accuracy: {accuracy_score(test["label"], test_predictions)}
F1: {f1_score(test["label"], test_predictions, average="macro")}''')

Accuracy: 0.5376181474480152
F1: 0.535907922018339


In [25]:
# Per-class metrics as a dict (output_dict=True), keyed by the encoder's class names.
report_data = classification_report(
    y_true=test['label'],
    y_pred=test_predictions,
    output_dict=True,
    digits=4,
    target_names=label_encoder.classes_,
)

In [28]:
# Persist the report dict as pretty-printed UTF-8 JSON in the working directory.
with open('classification_report.json', mode='w', encoding='utf-8') as report_file:
    json.dump(report_data, report_file, ensure_ascii=False, indent=2)