In [1]:
# import torch
import pandas as pd
import regex as re

In [2]:
report = pd.read_excel('CXIRG_Data\\train_data\\reports.xlsx', engine='openpyxl')


def preprocess_text(text):
    text = '[CLS] ' + text
    text = re.sub('_x000D_', ' ', text)
    text = re.sub('[0-9]\)|>|-|[0-9]\.|', '', text)
    # print(text)
    gptpat = re.compile(r"""\[[C][L][S]]|\n|[:.,]| [L]4+| *3+[rd]+| *4+[th]+| [LR]\'t|[LR]\'t| T[0-9]+|T[0-9]+| [a-zA-Z]/[a-zA-Z]|[a-zA-Z]/[a-zA-Z]| ?\p{L}+| ?\p{N}+""")
    text = re.findall(gptpat, text)
    tokens = []
    for token in text:
        if len(token) > 0:
            tokens.append(token)
    
    def check_token_head(tokens):
        while tokens[0] == ' ':
            tokens = tokens[1:]
            
        return tokens
    
    tokens = check_token_head(tokens)
    
    # print(len(tokens))
    
    return tokens



report_texts = report['text'].apply(preprocess_text)
# report_texts[17]
len(report_texts)


89

In [None]:

def create_dict(df):
    vocab_dict = {}
    for row in df:
        for token in row:
            vocab_dict[token] = vocab_dict.get(token, 0) + 1
    return vocab_dict

vocab_dict = create_dict(report_texts)
sorted_dict = {k: v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1], reverse=True)}
len(sorted_dict) ,sorted_dict

In [4]:
# word piece tokenize
def build_vocab(word_dict):
    itos = {}
    stoi = {}
    for i, token in enumerate(word_dict.items()):
        stoi[token[0]] = i
        itos[i] = token[0]
    return itos, stoi

itos, stoi = build_vocab(sorted_dict)

In [5]:
encode = lambda s : [stoi[c] for c in s]
decode = lambda l : ''.join([itos[i] for i in l])

In [6]:
text = '''
Chest plain film shows:
Impression:
-Compatible with right-sided aortic arch with aberrant left subclavian artery and Kommerell diverticulum.
-Suspect bilateral lower lung patches. 
-Increased infiltrations in both lungs.
-Blunting bilateral CP angles.
-Tortuous atherosclerotic dilated aorta.
-Cardiomegaly.
-Scoliosis, DJD and osteoporosis of spine.
 Fracture of right ribs. 
S/P Rt subclavian CVC insertion. 
'''

# encode(preprocess_text(text))
decode(encode(preprocess_text(text)))

'[CLS]\nChest plain film shows:\nImpression:\nCompatible with rightsided aortic arch with aberrant left subclavian artery and Kommerell diverticulum.\nSuspect bilateral lower lung patches.\nIncreased infiltrations in both lungs.\nBlunting bilateral CP angles.\nTortuous atherosclerotic dilated aorta.\nCardiomegaly.\nScoliosis, DJD and osteoporosis of spine.\n Fracture of right ribs.\nS/P Rt subclavian CVC insertion.\n'

In [7]:
class CustomTokenizer():
    
    def __init__(self, word_dict):
        self.word_dict = word_dict
        
        def build_vocab(word_dict):
            itos = {}
            stoi = {}
            for i, token in enumerate(word_dict.items()):
                stoi[token[0]] = i
                itos[i] = token[0]
            return itos, stoi
        
        self.itos, self.stoi = build_vocab(self.word_dict)
        self.encode_ = lambda s : [stoi[c] for c in s]
        self.decode_ = lambda l : ''.join([itos[i] for i in l])

    def encode(self, text):
        return self.encode_(text)
    
    def decode(self, tokens):
        return self.decode_(tokens)
        

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import random
import math

class CustomReportDataset(Dataset):
    
    def __init__(self, text_df, split='train', word_dict=None):
        assert split in ['train', 'valid', 'test']
        n_samples = {
            'train' : len(text_df) * 0.9,
            'valid' : len(text_df) * 0.1,
            'test' : len(text_df),
        }[split]
        if type(text_df) is not list:
            text_df = list(text_df)
        self.df = random.sample(text_df, int(n_samples))
        self.tokenizer = CustomTokenizer(word_dict)
        
    def __getitem__(self, index) :
        target = self.df[index]
        target = torch.tensor(self.tokenizer.encode(target))
        return target
        
    # def decode(self, tokens):
    def __len__(self):
        return len(self.df)
        
        

In [9]:
# tokenizer = tokenizer(sorted_dict)
train_df = CustomReportDataset(report_texts, split='train', word_dict=sorted_dict)

In [10]:
batch_size = 16
block_size = 8

In [96]:
def CustomBlockSeq2Batch(df, block_size, batch_size, threshold=50, device=None, target_idx=None):
    
    # get rid of the sequence that len < threshold
    # n_df = []
    # for idx, data in enumerate(df):
    #     if len(data) >= threshold: n_df.append(data)
    
    # get random batch
    if target_idx == None: target_idx = random.randint(0, len(df) - 1)
    ix = torch.randint(len(df[target_idx]) - block_size, (batch_size, ))
    ix[0] = 0                                 # test for make sure CLS
    x = torch.stack([df[target_idx][i:i+block_size] for i in ix])
    y = torch.stack([df[target_idx][i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = CustomBlockSeq2Batch(train_df, block_size, batch_size, device='cpu')
# x, y, x.shape, y.shape
    

In [None]:
# testing batch

for b in range(batch_size):
    for t in range(8):
        context = x[b, :t+1]
        target = y[b, t]
        print(f'when input is {decode(context.tolist())} the target : {(target)}')

In [74]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self, n_head, n_embd):
        super().__init__()
        
        self.n_embd = n_embd
        self.n_head = n_head
        
        self.c_attn = nn.Linear(n_embd, n_embd * 3)
        self.c_proj = nn.Linear(n_embd, n_embd)
        
    def forward(self, x):
        
        # batch_size, Seq_len, embedding dim
        B, T, C = x.shape
        # print(x.shape)
        # after c_attn(x), the shape is B, T, n_embd * 3
        a = self.c_attn(x)
        q, k, v = a.split(self.n_embd, dim=2)
        # start view() & transpose()
        # shape after transpose (Batch_size, n_head, Seq_len, n_embd // n_head) 
        # or (B, n_head, T, C // n_head)
        q = q.view(B, T, self.n_head, self.n_embd // self.n_head).transpose(2, 1)
        k = k.view(B, T, self.n_head, self.n_embd // self.n_head).transpose(2, 1)
        v = v.view(B, T, self.n_head, self.n_embd // self.n_head).transpose(2, 1)
        # the formula : softmax(QK^T / sqrt(embd_dim(k)))V
        # shape after q @ k : (B, n_head, T, T) 
        attn = q @ k.transpose(-2, -1) * (1 / math.sqrt(self.n_embd * 3 // self.n_head))
        attn = F.softmax(attn, dim=-1)
        # shape after attn @ v : (B, n_head, T, C // n_head)
        y = attn @ v
        y = y.transpose(2, 1).contiguous().view(B, T, C)
        self.out = self.c_proj(y)
        return self.out   
    
class FeedForward(nn.Module):
    
    def __init__(self, n_embd, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )
    
    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
    
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, n_embd)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        
        
    def forward(self, x):
        # x shape (B, T, C)
        x = x + self.sa(self.ln1(x))        # (B, T, C)
        x = x + self.ffwd(self.ln2(x))      # (B, T, C)
        return x
    
class Decoder(nn.Module):
    
    def __init__(self, vocab_size, block_size, n_embd, n_head, device, n_layer=8):
        super().__init__()
        self.token_embedding_table = torch.nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = torch.nn.Embedding(block_size, n_embd)
        
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.device = device
        
    def forward(self, idx, targets=None):
        
        B, T = idx.shape
        
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=self.device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        
        x = self.blocks(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        
        if targets == None:
            loss = None
        else:
            
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T)
        for _ in range(max_new_tokens):
            # get predictions
            idx_cond = idx[:, -block_size:] # prevent longer block_size, because we just have pos. embd
            logits, loss = self(idx_cond) # now (B, T, C)
            logits = logits[:, -1, :] # now get the last step and shape (B, C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
    
        

In [125]:
vocab_size = len(stoi)
block_size = 8
batch_size = 32
n_embd = 512
n_head = 32
lr = 1e-5
max_iters = 10000
n_layer = 32

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [126]:
model = Decoder(vocab_size=vocab_size, block_size=block_size, n_embd=n_embd, n_head=n_head, n_layer=n_layer, device='cuda')

# test model
x, y = CustomBlockSeq2Batch(train_df, block_size, batch_size, device='cuda')
m = model.to(device)

logits, loss = m(x, y)
loss, logits

(tensor(7.0380, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor([[-0.5126, -1.4193,  1.0005,  ..., -1.5657, -2.2598,  0.0378],
         [-0.3944, -0.5534,  2.1017,  ...,  0.5943, -1.9581, -0.7326],
         [-1.1588, -0.9734,  0.7828,  ...,  0.1855, -0.3597, -0.7771],
         ...,
         [-1.6233, -1.5340,  2.1736,  ...,  2.1717, -1.7226, -0.5432],
         [ 0.7713, -0.7180,  1.4409,  ...,  0.9081, -0.4820,  0.9586],
         [ 0.5734, -0.1359,  0.9323,  ...,  0.9893,  0.3333, -1.1376]],
        device='cuda:0', grad_fn=<ViewBackward0>))

In [127]:
optimizer = torch.optim.AdamW(m.parameters(), lr=lr)

for iter in range(max_iters):
    
    xb, yb = CustomBlockSeq2Batch(train_df, block_size, batch_size, device='cuda', target_idx=iter%len(train_df))
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    if iter % 1000 == 0:
        print(f'{iter:5d} / {max_iters} loss : {loss.item():.5f}')
        context = torch.tensor([[stoi['[CLS]']]], dtype=torch.long, device=device)
        print('------gen------')
        print(decode(m.generate(context, max_new_tokens=60)[0].tolist()))
        print('---------------')



    0 / 10000 loss : 7.21426
------gen------
[CLS] Blunted conduction CompatibleCompatible, airspace SuspIncreased ray occupying lateral consolidation internal Cardiomegalys/p Susp lobectomy Pulmonary lateral lateral nodules nodules partialP fields Osteophytes evaluation bony ViewCalcified
 notIMP out Scoliosis nodule outStent CVP of may infiltrations vein conductionRadiopaque hilaeBlunting nodules pneumoniaRight consider heart R't heartCompatible aortic artery
PresenceDilated
---------------
 1000 / 10000 loss : 0.50224
------gen------
[CLS] Chest Spondylosis CT may, favor ruled of right middle lung fields.
Elevation of right hemidiaphragm.
 Scoliosis of right hemidiaphragm right hemidiaphragm.
DJD of right hemidiaphragm.
DJD of right hemidiaphragm.
DJD of spine.
DJD of spine.
S/P right upper abdomen.
S/P right upper chest
---------------
 2000 / 10000 loss : 0.07688
------gen------
[CLS] Chest AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP AP A

In [140]:
context = torch.tensor([[stoi['[CLS]']]], dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=50)[0].tolist()))


[CLS] Chest Plain Film:

 Patchy opacified lesion at bilateral lung fields.
 Obscured bilateral costophrenic angles.
 Tortuous atherosclerotic dilated aorta.
 Scoliosis and DJD of spine.
 Old fracture of right ribs.
Atherosclerotic aorta.
S/P tracheostomy and endotracheal tube
