# Google drive mount

In [1]:
# google drive에 있는 데이터를 사용하기 위해 접근

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# GPU Test

In [2]:
# tensorflow를 import 하고, 현재 colab에서 gpu 구동이 되고 있는지 테스트 하는 코드
# Found GPU at: /device:GPU ~ 가 뜨면 성공

import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


# library import

In [3]:
import nltk, random, numpy as np, pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, mean_squared_error, r2_score

import matplotlib.pyplot as plt

!pip install lime
from lime.lime_text import LimeTextExplainer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[K     |████████████████████████████████| 275 kB 35.2 MB/s 
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283857 sha256=da16eb5753502289d3ea7a622d58dfbec30ab95b45dc1d2ac65f3ece1738bc9a
  Stored in directory: /root/.cache/pip/wheels/ca/cb/e5/ac701e12d365a08917bf4c6171c0961bc880a8181359c66aa7
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


# CLS/REG Data load

In [4]:
# 경로 저장 
cls_data_path = '/content/drive/MyDrive/숨고/류재용님(transformer 구현)/Training_data_cls.tsv' 
reg_data_path = '/content/drive/MyDrive/숨고/류재용님(transformer 구현)/Training_data_reg.tsv'

# 데이터 불러오기
train_cls = pd.read_csv(cls_data_path, sep="\t")
train_reg = pd.read_csv(reg_data_path, sep="\t")

train_cls.head()

Unnamed: 0,Seq,Label
0,WSHPSFYPFR,1
1,WLMACFFVFR,0
2,WTVDGLYEYD,1
3,WRATSFYLNT,0
4,WRSIAFFMFA,0


In [9]:
# 일부 데이터 추출
train_cls_sample = train_cls.iloc[:5000,:]
x_cls_train_sample = train_cls[['Seq']]
y_cls_train_sample = train_cls[['Label']]
train_reg_sample = train_reg.iloc[:5000,:]
x_reg_train_sample = train_reg[['Seq']]
y_reg_train_sample = train_reg[['Label']]

# Text Preprocessing

In [10]:
vocab = ['A', 'C', 'D', 'E', 
         'F', 'G', 'H', 'I', 
         'K', 'L', 'M', 'N', 
         'P', 'Q', 'R', 'S',
         'T', 'V', 'W', 'Y']

def preprocessing(text):
    text = list(tuple(text))
    tokens = [token for token in text if token in vocab]
    return tokens

x_cls_train_sample['Seq_processed'] = x_cls_train_sample['Seq'].apply(lambda x : preprocessing(x))
x_reg_train_sample['Seq_processed'] = x_reg_train_sample['Seq'].apply(lambda x : preprocessing(x))

In [11]:
def text_to_sequence(text, max_len):
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    text_sequence = tokenizer.texts_to_sequences(text)
    vocab_size = len(tokenizer.word_index)
    print('vocab_size : ', vocab_size)
    return np.array(text_sequence), vocab_size, tokenizer

x_train_cls, vocab_size, cls_tokenizer = text_to_sequence(x_cls_train_sample['Seq_processed'], max_len = 10)
x_train_reg, vocab_size, reg_tokenizer = text_to_sequence(x_reg_train_sample['Seq_processed'], max_len = 10)

print(x_train_cls.shape)
print(x_train_reg.shape)

vocab_size :  20
vocab_size :  20
(36391, 10)
(36391, 10)


# Transformer Architecture 


#### Transformer block as a layer
- Self Attention, Normalization, and feed-forward networks
- (https://keras.io/examples/nlp/text_classification_with_transformer/)

In [16]:
class TransformerBlock(layers.Layer):
    def __init__(self, embedding_dim, num_heads, hidden_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(hidden_dim, activation="relu"), layers.Dense(embedding_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)



#### Embedding & Position
- In Transformer-based networks, we need to include positional information of the tokens in the embeddings.

In [17]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embedding_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embedding_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

#### Classification fine tuning

In [18]:
embedding_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
hidden_dim = 32  # Hidden layer size in feed forward network inside transformer
vocab_size = 20 # total vocab size
maxlen = 10  # Only consider the last 10 words of each row
learning_rate = 0.0001 # learning rate
batch_size = 128
epochs = 100

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, hidden_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10)]              0         
                                                                 
 token_and_position_embeddin  (None, 10, 32)           960       
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 10, 32)           10656     
 merBlock)                                                       
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 32)                0     

In [28]:
# pretrained model load
model.load_weights("/content/drive/MyDrive/숨고/류재용님(transformer 구현)/cls_weights.best.hdf5")

In [29]:
# 전체 layer에 대해 fine tuning
for layer in model.layers:
    layer.trainable = True

In [30]:
optimizer = optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=15)

history = model.fit(x_train_cls,
                    y_cls_train_sample,
                    callbacks=[es],
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.1,
                    shuffle=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 32: early stopping


In [31]:
model.save_weights("/content/drive/MyDrive/숨고/류재용님(transformer 구현)/cls_weights.fine_tuning.hdf5" )

#### Regression fine tuning

In [32]:
embedding_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
hidden_dim = 32  # Hidden layer size in feed forward network inside transformer
vocab_size = 20 # total vocab size
maxlen = 10  # Only consider the last 10 words of each row
learning_rate = 0.0001 # learning rate
batch_size = 128
epochs = 100

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, hidden_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1)(x)

model = keras.Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 10)]              0         
                                                                 
 token_and_position_embeddin  (None, 10, 32)           960       
 g_2 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_2 (Transf  (None, 10, 32)           10656     
 ormerBlock)                                                     
                                                                 
 global_average_pooling1d_2   (None, 32)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_10 (Dropout)        (None, 32)                0   

In [33]:
# pretrained model load
model.load_weights("/content/drive/MyDrive/숨고/류재용님(transformer 구현)/reg_weights.best.hdf5")

In [34]:
# 전체 layer에 대해 fine tuning
for layer in model.layers:
    layer.trainable = True

In [35]:
optimizer = optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer="adam",
              loss="mse")

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=15)

history = model.fit(x_train_reg,
                    y_reg_train_sample,
                    callbacks=[es],
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.1,
                    shuffle=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 16: early stopping


In [36]:
model.save_weights("/content/drive/MyDrive/숨고/류재용님(transformer 구현)/reg_weights.fine_tuning.hdf5" )