## Teacher Model

In [37]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

checkpoint = 'C:/Users/CPB06GameN/Downloads/voiceprint/m2m100_418M'
teacher_model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)

In [2]:
model

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): Embedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): Embedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0): M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=Tr

## Student Model

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_heads, n_layers, pf_dim, dropout, device, max_length=100):
        super().__init__()
        self.device = device
        self.tok_emb = nn.Embedding(input_dim, hid_dim)
        self.pos_emb = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim], device=device))
        self.dropout = nn.Dropout(dropout)
    def forward(self, src, src_mask):
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len, device=self.device).unsqueeze(0).repeat(batch_size, 1)
        pos = self.pos_emb(pos)
        emb = self.tok_emb(src) * self.scale
        src = self.dropout(pos + emb)
        
        for layers in self.layers:
            src = layers(src, src_mask)
        return src

In [27]:
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        self.self_attention = MultiHeadAttention(hid_dim, n_heads, dropout, device)
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.feedforward = PositionWiseFeedForward(hid_dim, pf_dim, dropout)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, src, src_mask):
        _src, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(self.dropout(_src) + src)
        _src = self.feedforward(src)
        src = self.ff_layer_norm(self.dropout(_src) + src)
        return src

In [35]:
class MultiHeadAttention(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = n_heads // hid_dim 
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim], device=device))
    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, slef.n_heads, self.head_dim).permute(0, 2, 1, 3)
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim=-1)
        x = torch.matmul(self.dropout(attention), V)
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hid_dim)
        x = self.fc_o(x)
        return x, attention

In [29]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        x = self.dropout(torch.relu(self.fc_1(x)))
        x = self.fc_2(x)
        return x

In [30]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
        super().__init__()
        self.device = device
        self.tok_emb = nn.Embedding(output_dim, hid_dim)
        self.pos_emb = nn.Embedding(hid_dim, pf_dim)
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim], device=device))
    def forward(self, tgt, enc_src, tgt_mask, src_mask):
        batch_size = tgt.shape[0]
        tgt_len = tgt.shape[1]
        pos = torch.arange(0, tgt_len, device=self.device).unsqueeze(0).repeat(batch_size, 1)
        tgt = self.dropout((self.tok_emb(tgt) * self.scale) + self.pos_emb(pos))
        
        for layer in self.layers:
            tgt, attention = layer(tgt, enc_src, tgt_mask, src_mask)
        output = self.fc_out(tgt)
        return output, attention

In [37]:
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttention(hid_dim, n_heads, dropout, device)
        self.enc_attention = MultiHeadAttention(hid_dim, n_heads, dropout, device)
        self.feedforward = PositionWiseFeedForward(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)
    def forward(self, tgt, enc_src, tgt_mask, src_mask):
        _tgt, _ = self.self_attention(tgt, tgt, tgt, tgt_mask)
        tgt = self.self_attn_layer_norm(tgt + self.dropout(_tgt))
        
        _tgt, attention = self.enc_attention(tgt, enc_src, enc_src, src_mask)
        tgt = self.enc_attn_layer_norm(tgt + self.dropout(_tgt))
        
        _tgt = self.feedforward(tgt)
        tgt = self.ff_layer_norm(tgt + self.dropout(_tgt))
        return tgt, attention

In [32]:
class Seq2seq(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, tgt_pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = tgt_pad_idx
        self.device = device
    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask
    def make_tgt_mask(self, tgt):
        tgt_mask = (tgt != self.tgt_pad_idx).unsqueeze(1).unsqueeze(2)
        tgt_len = tgt.shape[1]
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=self.device)).bool()
        tgt_mask = tgt_mask & tgt_sub_mask
        return tgt_mask
    def forward(self, src, tgt):
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        enc_src = self.encoder(src, src_mask)
        output, attention = self.decoder(tgt, enc_src, tgt_mask, src_mask)
        return output, attention

In [33]:
# Student Model Config
INPUT_DIM, OUTPUT_DIM = tokenizer.vocab_size, tokenizer.vocab_size
SRC_PAD_IDX, TGT_PAD_IDX = tokenizer.pad_token_id, tokenizer.pad_token_id
HID_DIM = 256
ENC_LAYERS, DEC_LAYERS = 3, 3
ENC_HEADS, DEC_HEADS = 8, 8
ENC_PF_DIM, DEC_PF_DIM = 512, 512
ENC_DROPOUT, DEC_DROPOUT = 0.1, 0.1

In [40]:
device = torch.device('cpu')
enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, device)
dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, DEC_DROPOUT, device)
student_model = Seq2seq(enc, dec, SRC_PAD_IDX, TGT_PAD_IDX, device) #.to(device)

## Data Processing

In [None]:
torch.set_printoptions(precision=8, sci_mode=False)

In [46]:
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import os
import re

class CustomDataset(Dataset):
    def __init__(self, path, tokenizer):
        super().__init__()
        self.path = path + '/'
        self.file_list = os.listdir(path)
        self.tokenizer = tokenizer
    def file_sorter(self):
        names = []
        for file in self.file_list:
            num = re.sub('[^0-9]', '', file)
            names.append((int(num), file))
        names.sort()
        num, file_list = zip(*names)
        return list(file_list)
    def __len__(self):
        files = self.file_sorter()
        last = len(pd.read_json(self.path + files[-1]))
        return ((len(self.file_list) - 1) * 10000) + last
    def __getitem__(self, index):
        for file in self.file_list:
            chunk = pd.read_json(self.path + file)
            self.tokenizer.src_lang = chunk['src_lang'][index]
            self.tokenizer.tgt_lang = chunk['tgt_lang'][index]
            inputs = self.tokenizer(chunk['src_text'][index], return_tensors='pt') #.to(device)
            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(chunk['tgt_text'][index], return_tensors='pt').input_ids #.to(device)
            inputs['labels'] = labels
            length = (v.shape[1] for k, v in inputs.items())
            return inputs, length

In [53]:
def collate(batch):
    print(batch)
    batch, length = zip(*batch)
    ids, mask, label = zip(*length)
    max_ids, max_mask, max_label = max(ids), max(mask), max(label)
    ids, mask, label = list(ids), list(mask), list(label)
    
    ids_res, mask_res, label_res = [], [], []
    for i, sample in enumerate(batch):
        len_ids = max_ids - ids[i]
        len_mask = max_mask - mask[i]
        len_label = max_label - label[i]
        ids_tensor = torch.cat([sample['input_ids'], torch.tensor([[tokenizer.pad_token_id] * len_ids], device=device)], dim=1)
        mask_tensor = torch.cat([sample['attention_mask'], torch.tensor([[0] * len_mask], device=device)], dim=1)
        label_tensor = torch.cat([sample['labels'], torch.tensor([[tokenizer.pad_token_id] * len_label], device=device)], dim=1)
        ids_res.append(ids_tensor)
        mask_res.append(mask_tensor)
        label_res.append(label_tensor)
    ids_batch = torch.cat(ids_res, dim=0)
    mask_batch = torch.cat(mask_res, dim=0)
    label_batch = torch.cat(label_res, dim=0)
    return {'input_ids':ids_batch, 'attention_mask':mask_batch, 'labels':label_batch}

In [54]:
path = 'C:/Users/CPB06GameN/글을쓰자/PyTorch-master/연습폴더/cleansing_json'
dataset = CustomDataset(path, tokenizer)
val_len = len(dataset) * 0.3
# train, valid = random_split(dataset, [len(dataset)-val_len, val_len])
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=4, collate_fn=collate)

## Knowledge Distillation

In [55]:

for i in train_dataloader:
    print(i)

KeyError: 158955