In [9]:
import flask_migrate

In [1]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

article_en = "The Secretary-General of the United Nations says there is no military solution in Syria."

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# translate English to German
tokenizer.src_lang = "en_XX"
encoded_en = tokenizer(article_en, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_en,
    forced_bos_token_id=tokenizer.lang_code_to_id["de_DE"]
)
translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
print(translated_text)

  from .autonotebook import tqdm as notebook_tqdm


['Der Generalsekretär der Vereinten Nationen sagt, es gibt keine militärische Lösung in Syrien.']


In [2]:
import pandas as pd
import pymysql

# 데이터 베이스 연결하기
conn = pymysql.connect(host='1.251.203.204',
                       user='root',
                       password='kdt5',
                       db='Team4',
                       charset='utf8',
                       port=33065)

curs = conn.cursor()

# 검색 명령어 사용 
sql = "SELECT en.text as en, de.text as de FROM language_en en join language_de de on en.id = de.id;"
curs.execute(sql)
result = curs.fetchall()
print("현재 테이블의 데이터수는 총 {}개 입니다.".format(len(result)))
endeDF = pd.read_sql(sql, conn)

# 데이터베이스 연결 종료
conn.close()
endeDF.head()

현재 테이블의 데이터수는 총 31102개 입니다.


  endeDF = pd.read_sql(sql, conn)


Unnamed: 0,en,de
0,In the beginning God created the heaven and th...,Am Anfang schuf Gott Himmel und Erde.
1,"And the earth was without form, and void; and ...","Und die Erde war wüst und leer, und es war fin..."
2,"And God said, Let there be light: and there wa...",Und Gott sprach: Es werde Licht! und es ward L...
3,"And God saw the light, that it was good: and G...","Und Gott sah, daß das Licht gut war. Da schied..."
4,"And God called the light Day, and the darkness...",und nannte das Licht Tag und die Finsternis Na...


In [5]:
source_texts = endeDF["en"].tolist()
target_texts = endeDF["de"].tolist()

In [8]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm

# 데이터셋 클래스 정의
class TranslationDataset(Dataset):
    def __init__(self, source_texts, target_texts, tokenizer, max_length=512):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.source_texts)
    def __getitem__(self, idx):
        source_text = self.source_texts[idx]
        target_text = self.target_texts[idx]
        
        encoding = self.tokenizer(
            source_text,
            target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        encoding["labels"] = target_encoding["input_ids"]
        
        return encoding



# 파인튜닝에 사용할 데이터 준비
source_texts = endeDF["en"].tolist()
target_texts = endeDF["de"].tolist()

# 모델과 토크나이저 불러오기
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# 데이터셋 생성
dataset = TranslationDataset(source_texts, target_texts, tokenizer)

# 파라미터 설정
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 0.01

# DataLoader 생성
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# 손실 함수 설정
loss_fn = torch.nn.CrossEntropyLoss()

# 옵티마이저 설정
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# 모델 파인튜닝
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].squeeze(1)
        attention_mask = batch["attention_mask"].squeeze(1)
        labels = batch["labels"].squeeze(1)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")

# 파인튜닝된 모델 저장
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

Epoch 1:   0%|          | 0/1944 [01:57<?, ?it/s]


KeyboardInterrupt: 