In [1]:
import pandas as pd




# Load the formula/image index; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../../data/output/data.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../../data/output/data.csv'

In [13]:
# Prefix the CSV-relative image paths with the data directory.
# The startswith guard keeps this cell idempotent: re-running it must not
# stack the prefix a second time onto paths that already carry it.
DATA_PREFIX = '..\\..\\data\\'
df['path'] = df['path'].apply(
    lambda x: x if x.startswith(DATA_PREFIX) else DATA_PREFIX + x
)

In [14]:
# Inspect the frame; the 'path' column was rewritten by the previous cell.
df

Unnamed: 0,formula,path,selection_type
0,1,..\..\data\output\train\formula_4.png,train
1,{u}^{1},..\..\data\output\train\formula_10.png,train
2,\left( (b) \right),..\..\data\output\train\formula_8.png,train
3,(\log(a)) \cdot {\pi}^{6} - \pi,..\..\data\output\train\formula_7.png,train
4,y + c + 0 - ( z \cdot b ),..\..\data\output\train\formula_6.png,train
...,...,...,...
1465,\frac{d}{dv} (2),..\..\data\output\test\formula_66.png,test
1466,\frac{d}{da} ({e}^{u}),..\..\data\output\test\formula_69.png,test
1467,\frac{d}{da} 3,..\..\data\output\test\formula_68.png,test
1468,\frac{d^2}{dz^2} \left( u \right),..\..\data\output\test\formula_67.png,test


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import numpy as np

# Split the rows by their 'selection_type' marker
train_df = df.loc[df['selection_type'] == 'train']
test_df = df.loc[df['selection_type'] == 'test']

# 1. Collect the unique LaTeX characters that occur in the training formulas
all_chars = set()
for formula in train_df['formula']:
    all_chars |= set(formula)
chars = sorted(all_chars)

# Index 0 is reserved for the CTC blank, so real characters are numbered from 1
char_to_idx = dict(zip(chars, range(1, len(chars) + 1)))
char_to_idx['<BLANK>'] = 0  # CTC blank label
idx_to_char = {index: symbol for symbol, index in char_to_idx.items()}
vocab_size = len(char_to_idx)

# 2. Image preprocessing
IMG_HEIGHT = 32
IMG_WIDTH = 128

# Every image is converted to one channel, resized to IMG_HEIGHT x IMG_WIDTH,
# turned into a tensor and normalized with mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize(size=(IMG_HEIGHT, IMG_WIDTH)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
])

# 3. Custom Dataset
class MathDataset(Dataset):
    """Formula images paired with their LaTeX strings.

    Each row of ``df`` is read through its ``'path'`` column (image file) and
    its ``'formula'`` column (target string). Characters are encoded with the
    module-level ``char_to_idx`` mapping.

    Args:
        df: DataFrame holding the ``'path'`` and ``'formula'`` columns.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx``.

        Returns:
            dict with ``'image'`` (the transformed image, or the PIL image when
            no transform is set), ``'label'`` (1-D long tensor of character
            indices) and ``'label_length'`` (number of characters).

        Raises:
            KeyError: if the formula contains a character missing from
                ``char_to_idx``.
        """
        sample = self.df.iloc[idx]
        source_path = sample['path']
        latex = sample['formula']

        # Load the image and run it through the optional transform pipeline
        picture = Image.open(source_path).convert('RGB')
        if self.transform:
            picture = self.transform(picture)

        # Encode the formula character by character
        encoded = [char_to_idx[symbol] for symbol in latex]

        return {
            'image': picture,
            'label': torch.tensor(encoded, dtype=torch.long),
            'label_length': len(encoded),
        }

# 4. DataLoader
def collate_fn(batch):
    """Merge a list of dataset samples into a single batch dictionary.

    Args:
        batch: list of dicts with ``'image'``, ``'label'`` and
            ``'label_length'`` entries.

    Returns:
        dict with ``'images'`` (stacked image tensors), ``'labels'`` (labels
        right-padded with 0 to the longest label in the batch),
        ``'label_lengths'`` and ``'image_lengths'`` (``IMG_WIDTH // 4`` repeated
        once per sample).
    """
    images = [sample['image'] for sample in batch]
    labels = [sample['label'] for sample in batch]
    label_lengths = [sample['label_length'] for sample in batch]

    # Batch the images along a new leading dimension
    images = torch.stack(images)

    # Pad the labels for CTC; the fill value 0 is the blank index
    padded_labels = torch.zeros(len(labels), max(label_lengths), dtype=torch.long)
    for row, label in zip(padded_labels, labels):
        row[:len(label)] = label

    return {
        'images': images,
        'labels': padded_labels,
        'label_lengths': torch.tensor(label_lengths),
        # Presumably the sequence length the model emits per image -- must
        # match the width of the CRNN output; verify against the model.
        'image_lengths': torch.full((len(images),), IMG_WIDTH // 4, dtype=torch.long),
    }

train_dataset = MathDataset(train_df, transform=transform)

# MathDataset and collate_fn are defined inside the notebook (__main__).
# Worker processes that are spawned instead of forked (Windows, macOS) have to
# re-import them and cannot, so background workers are only enabled when the
# start method is 'fork'; otherwise batches are loaded in the main process.
NUM_WORKERS = 2 if torch.multiprocessing.get_start_method() == 'fork' else 0

train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=NUM_WORKERS,
)

# 5. CRNN model
class CRNN(nn.Module):
    """Convolutional-recurrent network producing per-column class scores for CTC.

    The CNN halves the image height three times and the width twice, so an
    input of shape (batch, img_channels, H, W) becomes a sequence of W // 4
    steps with 128 * (H // 8) features each. A 2-layer bidirectional LSTM and
    a linear layer turn every step into ``vocab_size`` raw (unnormalized)
    scores.

    Args:
        img_channels: Number of channels of the input images.
        vocab_size: Number of output classes (including the CTC blank).
        img_height: Height of the input images; when ``None`` the module-level
            ``IMG_HEIGHT`` constant is used.
    """

    def __init__(self, img_channels, vocab_size, img_height=None):
        super().__init__()
        if img_height is None:
            img_height = IMG_HEIGHT

        # CNN feature extractor
        self.cnn = nn.Sequential(
            nn.Conv2d(img_channels, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            # Kernel (2, 1) with the default stride (= kernel size) halves the
            # height and keeps the width. The former MaxPool2d(2, 1) was a 2x2
            # kernel with stride 1, which produced height H/4 - 1 and width
            # W/4 - 1 and therefore did not match the LSTM input size below.
            nn.MaxPool2d(kernel_size=(2, 1)),
        )

        # Recurrent sequence model over the width axis
        self.rnn = nn.LSTM(
            input_size=128 * (img_height // 8),
            hidden_size=256,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )

        self.fc = nn.Linear(512, vocab_size)  # 256 * 2 because the LSTM is bidirectional

    def forward(self, x):
        """Map images (batch, channels, H, W) to scores (W // 4, batch, vocab_size).

        The returned scores are raw linear outputs; no softmax is applied here.
        """
        # CNN feature extraction
        x = self.cnn(x)  # (batch, channels, h, w)

        # Reshape for the RNN: (batch, seq_len, features)
        batch, channels, height, width = x.size()
        x = x.view(batch, channels * height, width)
        x = x.permute(0, 2, 1)  # (batch, width, features)

        # RNN processing
        x, _ = self.rnn(x)

        # Fully connected layer
        x = self.fc(x)  # (batch, width, vocab_size)
        x = x.permute(1, 0, 2)  # CTC expects (seq_len, batch, num_classes)
        return x

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = CRNN(img_channels=1, vocab_size=vocab_size)
model = model.to(device)
# Index 0 is the CTC blank; infinite losses are zeroed instead of propagated
criterion = nn.CTCLoss(zero_infinity=True, blank=0)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# 6. Training
def train_epoch():
    """Run one pass over ``train_loader`` and return the mean batch loss.

    Uses the module-level ``model``, ``criterion``, ``optimizer``, ``device``
    and ``train_loader``.

    Returns:
        float: sum of the per-batch CTC losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    for batch in train_loader:
        images = batch['images'].to(device)
        labels = batch['labels'].to(device)
        label_lengths = batch['label_lengths'].to(device)
        image_lengths = batch['image_lengths'].to(device)

        optimizer.zero_grad()
        outputs = model(images)  # (seq_len, batch, num_classes) raw scores

        # nn.CTCLoss expects log-probabilities, so normalize the raw scores
        # over the class dimension before computing the loss.
        log_probs = outputs.log_softmax(2)

        loss = criterion(
            log_probs,
            labels,
            image_lengths,
            label_lengths
        )

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(train_loader)

# Example training run: 5 epochs, printing the mean batch loss after each one
for epoch in range(5):
    loss = train_epoch()
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')

# 7. Inference
def predict(image_path):
    """Recognize the formula in the image at ``image_path``.

    The image goes through the module-level ``transform`` and ``model``; the
    per-step best classes are then decoded greedily: consecutive repeats are
    collapsed and ``'<BLANK>'`` entries are dropped.

    Returns:
        str: the decoded character sequence.
    """
    model.eval()
    picture = Image.open(image_path).convert('RGB')
    batch = transform(picture).unsqueeze(0).to(device)

    with torch.no_grad():
        scores = model(batch).permute(1, 0, 2)  # (batch, seq_len, num_classes)
        best = scores.max(2).indices

    # Simplified greedy decoding of the single image in the batch
    pieces = []
    previous = ''
    for index in best[0].tolist():
        symbol = idx_to_char.get(index, '')
        if symbol != previous and symbol != '<BLANK>':
            pieces.append(symbol)
        previous = symbol

    return ''.join(pieces)

# Example usage
# predicted_formula = predict('test_image.png')