In [154]:
# pip install torch torchtext transformers sentencepiece pandas tqdm datasets notebook 
#!pip install tqdm
# https://habr.com/ru/articles/859250/

In [155]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time

In [215]:
data_sample = load_dataset("QuyenAnhDE/Diseases_Symptoms")

Repo card metadata block was not found. Setting CardData to empty.


In [216]:
print(data_sample)

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments'],
        num_rows: 400
    })
})


In [217]:
updated_data = [{'Name': item['Name'], 
                 'Symptoms': item['Symptoms'], 
                 'Treatments': item['Treatments'],
                } for item in data_sample['train']]
df = pd.DataFrame(updated_data)

In [218]:
df.head()

Unnamed: 0,Name,Symptoms,Treatments
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o...","Antidepressant medications, Cognitive Behavior..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue","Voice Rest, Speech Therapy, Surgical Removal"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck...","Growth hormone therapy, Estrogen replacement t..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro...",Observation and monitoring (in cases of mild o...
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala...","Supportive Measures, Gastric Decontamination, ..."


In [219]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [220]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)

cuda:0


In [221]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

model = GPT2LMHeadModel.from_pretrained('distilgpt2').to(device)

In [222]:
BATCH_SIZE = 8

In [315]:
class LanguageDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.labels = df.columns #устанавливаем метки столбцов
        self.data = df.to_dict(orient='records')
        print(self.data[1:3])
        self.tokenizer = tokenizer
        x = self.average_len(df)
        self.max_length = x 

    def average_len(self,df):
        sum_ = 0
        for example in df[self.labels[1]]:
          sum_ += len(example)
        x  = 2
        while x < sum_/len(df):
          x = x * 2
        return x
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
          x = self.data[idx][self.labels[0]]
          y = self.data[idx][self.labels[1]]
          z = self.data[idx][self.labels[2]]
          # text = f"{x} | {y}"
          text = f"{x} | {y} | {z}"
    
          tokens = self.tokenizer.encode_plus(text, 
                                              return_tensors='pt', 
                                              max_length=self.max_length, 
                                              padding='max_length', 
                                              truncation=True) 
            
          return tokens

In [316]:
data_sample = LanguageDataset(df, tokenizer)

[{'Name': 'Vocal cord polyp', 'Symptoms': 'Hoarseness, Vocal Changes, Vocal Fatigue', 'Treatments': 'Voice Rest, Speech Therapy, Surgical Removal'}, {'Name': 'Turner syndrome', 'Symptoms': 'Short stature, Gonadal dysgenesis, Webbed neck, Lymphedema', 'Treatments': 'Growth hormone therapy, Estrogen replacement therapy, Cardiac and renal evaluations'}]


In [317]:
train_size = int(0.8 * len(data_sample))
valid_size = len(data_sample) - train_size

train_data, valid_data = random_split(data_sample, [train_size, valid_size])

In [318]:
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True) #дополнительно перемешаем данные
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE)

In [319]:
num_epochs = 10

In [320]:
batch_size = BATCH_SIZE
model_name = 'distilgpt2'
gpu = 0

In [321]:
optimizer = optim.Adam(model.parameters(), lr=5e-5)
tokenizer.pad_token = tokenizer.eos_token

In [322]:
results = pd.DataFrame(columns=['epoch', 
                                'transformer', 
                                'batch_size', 
                                'gpu',
                                'training_loss', 
                                'validation_loss', 
                                'epoch_duration_sec'])

In [323]:
class EarlyStopping:
    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = None
        self.early_stop = False
        self.counter = 0
        self.best_model_state = None

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.best_model_state = model.state_dict()
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.best_model_state = model.state_dict()
            self.counter = 0

    def load_best_model(self, model):
        model.load_state_dict(self.best_model_state)

In [324]:
def train_model(model, num_epochs, train_loader, batch_size, model_name, sheduler, tokenizer):
  for epoch in range(num_epochs):
      start_time = time.time()  # Start the timer for the epoch
      #переводим модель в режим обучения
      model.train()
      epoch_training_loss = 0

      train_iterator = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs} Batch Size: {batch_size}, Transformer: {model_name}")

      for batch in train_iterator:
          optimizer.zero_grad()
          inputs = batch['input_ids'].squeeze(1).to(device)
          targets = inputs.clone()

          outputs = model(input_ids=inputs, labels=targets)

          loss = outputs.loss
          
          #выполняем обратный переход
          loss.backward()
          #обновляем веса
          optimizer.step()

          train_iterator.set_postfix({'Training Loss': loss.item()})
          epoch_training_loss += loss.item()

      avg_epoch_training_loss = epoch_training_loss / len(train_iterator)

      #переводим модель в режим ответов
      model.eval()
      
      epoch_validation_loss = 0
      total_loss = 0
      valid_iterator = tqdm(valid_loader, desc=f"Validation Epoch {epoch+1}/{num_epochs}")
      with torch.no_grad():
          for batch in valid_iterator:
              inputs = batch['input_ids'].squeeze(1).to(device)
              targets = inputs.clone()
              outputs = model(input_ids=inputs, labels=targets)
              loss = outputs.loss
              total_loss += loss
              valid_iterator.set_postfix({'Validation Loss': loss.item()})
              epoch_validation_loss += loss.item()

      avg_epoch_validation_loss = epoch_validation_loss / len(valid_loader)

      end_time = time.time()  # закончилась одна эпоха
      epoch_duration_sec = end_time - start_time

      new_row = {'transformer': model_name,
                'batch_size': batch_size,
                'gpu': gpu,
                'epoch': epoch+1,
                'training_loss': avg_epoch_training_loss,
                'validation_loss': avg_epoch_validation_loss,
                'epoch_duration_sec': epoch_duration_sec}  

      results.loc[len(results)] = new_row
      print(f"Epoch: {epoch+1}, Validation Loss: {total_loss/len(valid_loader)}")

      early_stopping(epoch_validation_loss, model)
      if early_stopping.early_stop:
          print("Early stopping")
          break

    # Load the best model
      early_stopping.load_best_model(model)

In [325]:
from torch.optim.lr_scheduler import ExponentialLR

sheduler  =  ExponentialLR(optimizer, gamma=0.8)
early_stopping = EarlyStopping(patience=5, delta=0.01)
train_model(model, num_epochs, train_loader, batch_size, model_name, tokenizer, sheduler)

Training Epoch 1/10 Batch Size: 8, Transformer: distilgpt2: 100%|████████████████████████████████████| 40/40 [00:02<00:00, 13.65it/s, Training Loss=0.68]
Validation Epoch 1/10: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 39.95it/s, Validation Loss=0.207]


Epoch: 1, Validation Loss: 0.2901248037815094


Training Epoch 2/10 Batch Size: 8, Transformer: distilgpt2: 100%|████████████████████████████████████| 40/40 [00:02<00:00, 13.60it/s, Training Loss=0.19]
Validation Epoch 2/10: 100%|███████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 38.27it/s, Validation Loss=0.21]


Epoch: 2, Validation Loss: 0.2988549768924713


Training Epoch 3/10 Batch Size: 8, Transformer: distilgpt2: 100%|███████████████████████████████████| 40/40 [00:02<00:00, 13.68it/s, Training Loss=0.233]
Validation Epoch 3/10: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 38.76it/s, Validation Loss=0.206]


Epoch: 3, Validation Loss: 0.29911547899246216


Training Epoch 4/10 Batch Size: 8, Transformer: distilgpt2: 100%|███████████████████████████████████| 40/40 [00:03<00:00, 13.28it/s, Training Loss=0.357]
Validation Epoch 4/10: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 39.89it/s, Validation Loss=0.217]


Epoch: 4, Validation Loss: 0.30714571475982666


Training Epoch 5/10 Batch Size: 8, Transformer: distilgpt2: 100%|███████████████████████████████████| 40/40 [00:02<00:00, 13.42it/s, Training Loss=0.231]
Validation Epoch 5/10: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 33.17it/s, Validation Loss=0.222]


Epoch: 5, Validation Loss: 0.31564903259277344


Training Epoch 6/10 Batch Size: 8, Transformer: distilgpt2: 100%|████████████████████████████████████| 40/40 [00:03<00:00, 13.17it/s, Training Loss=0.18]
Validation Epoch 6/10: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 37.94it/s, Validation Loss=0.231]

Epoch: 6, Validation Loss: 0.32396718859672546
Early stopping





In [326]:
# input_str = "Cellulitis"
input_str = "Panic disorder "
# input_str = "Eye alignment disorder"

input_ids = tokenizer.encode(input_str, return_tensors='pt').to(device)

output = model.generate(
    input_ids,
    max_length=100, # максимальная длина выходной последовательности. 
    # Генерация последовательности будет происходить, пока не будет выбран 
    # токен остановки или пока не будет достигнута максимальная длина
    num_return_sequences=1, # количество возвращаемых ответов
    do_sample=True,
    top_k=10, # количество токенов с наибольшей вероятностью, среди которых будет происходить выбор следующего токена
    top_p=0.8, # вероятность, которую не должна превышать сумма вероятностей наиболее вероятных токенов на каждом шаге.
    temperature=0.91, #отвечает за "креативность" модели
    repetition_penalty=1.2,
    pad_token_id=tokenizer.pad_token_id,
)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

Panic disorder  | Severe abdominal pain, bloating, nausea, vomiting, fever, rapid heart rate, dizziness or lightheadedness (in severe cases) | Emergency medical attention, medications to manage symptoms, lifestyle modifications (e.g., avoiding alcohol, maintaining a healthy weight), medication (e-cigarettes, diuretics , antihistamines). Anticoagulants
