<a href="https://colab.research.google.com/github/tjtmddnjswkd/seungwon-seo/blob/master/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import collections
import logging
import os
import pathlib
import re
import string
import sys
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

In [None]:
def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)

  # apply sin to even indices in the array; 2i
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

  # apply cos to odd indices in the array; 2i+1
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

  pos_encoding = angle_rads[np.newaxis, ...]

  return tf.cast(pos_encoding, dtype=tf.float32)

In [None]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

  # add extra dimensions to add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [None]:
def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead)
  but it must be broadcastable for addition.

  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable
          to (..., seq_len_q, seq_len_k). Defaults to None.

  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # scale matmul_qk
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # add the mask to the scaled tensor.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]

    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)

    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)

    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention,
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

    return output, attention_weights

In [None]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

    return out2

In [None]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding,
                                            self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]

    # adding embedding and position encoding.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    
    return x  # (batch_size, input_seq_len, d_model)

In [None]:
sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8,
                         dff=2048, input_vocab_size=8500,
                         maximum_position_encoding=10000)

temp_input = tf.random.uniform((64, 62), dtype=tf.int64, minval=0, maxval=200)

In [None]:
sample_encoder_output = sample_encoder(temp_input, training=False, mask=None)

In [1]:
############# 실제 사용할 코드 #######################

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
from sklearn import (datasets, feature_extraction, linear_model, metrics)
import pandas as pd
import os
import zipfile
import shutil
from sklearn.model_selection import train_test_split
import re

In [3]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [4]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
order = '/1-35/'

data_path = '/content/drive/Shareddrives/딥러닝팀플/data/paper'
# data_file = os.path.join(data_path, '1-35.zip')

# with zipfile.ZipFile(data_file, 'r') as file: # zip 파일 압축 해제
#   file.extractall('%s/1-35' % data_path)

file_list = os.listdir('%s/1-35' % data_path)

data_path += order

count = 0 
for i in range(len(file_list)):
    count += 1
    if i == 0:
        df = pd.read_excel('%s%s' % (data_path, file_list[i]))
    df = df.append(pd.read_excel('%s%s' % (data_path, file_list[i])))

df = df[['Abstract', 'WoS Categories']]

In [9]:
print(df.isnull().sum())
print('데이터 개수 : %d' % df.shape[0])

Abstract          11177
WoS Categories        0
dtype: int64
데이터 개수 : 35097


In [10]:
df.dropna(inplace=True)

In [11]:
print(df.isnull().sum())
print('데이터 개수 : %d' % df.shape[0])

Abstract          0
WoS Categories    0
dtype: int64
데이터 개수 : 23920


In [12]:
df.drop_duplicates(['Abstract'], inplace=True)

In [13]:
print(df.isnull().sum())
print('데이터 개수 : %d' % df.shape[0])

Abstract          0
WoS Categories    0
dtype: int64
데이터 개수 : 21347


In [14]:
category = []
for i in df['WoS Categories']:
  for j in i.split('; '):
    category.append(j)

In [15]:
unique_category = set(category)
print(unique_category)
print(len(unique_category))

{'Biochemistry & Molecular Biology', 'Energy & Fuels', 'Materials Science, Multidisciplinary', 'Psychiatry', 'Immunology', 'Physics, Applied', 'Nanoscience & Nanotechnology', 'Mathematics, Interdisciplinary Applications', 'Business, Finance', 'Computer Science, Hardware & Architecture', 'Geography', 'Computer Science, Information Systems', 'Psychology, Experimental', 'Automation & Control Systems', 'Oncology', 'Mathematics', 'Mechanics', 'Zoology', 'International Relations', 'Computer Science, Artificial Intelligence', 'Chemistry, Multidisciplinary', 'Chemistry, Physical', 'Engineering, Chemical', 'Biotechnology & Applied Microbiology', 'Management', 'Medicine, General & Internal', 'Mathematics, Applied', 'Language & Linguistics', 'Telecommunications', 'Neurosciences', 'Education & Educational Research', 'Computer Science, Interdisciplinary Applications', 'Genetics & Heredity', 'Sociology', 'Economics', 'Engineering, Electrical & Electronic', 'Geosciences, Multidisciplinary', 'Communic

In [16]:
def preprocessing(text):
    pattern = '(\[a-zA-Z0-9\_.+-\]+@\[a-zA-Z0-9]+.\[a-zA-Z0-9-.\]+)' # email제거
    text = re.sub(pattern=pattern,repl=' ',string=text)    

    pattern = re.compile(r'([^\w]?\d+\.?\,?\)?\d*)+') # 숫자 제거
    text = re.sub(pattern=pattern,repl=' ',string=text)
    
    pattern = '<[^>]*>' # html 태그 제거
    text = re.sub(pattern=pattern,repl=' ',string=text)
    
    pattern = '[\r|\n]' # \r,\n 제거
    text = re.sub(pattern=pattern,repl=' ',string=text)
    
    pattern= '[^\w\s]' # 특수기호 제거
    text = re.sub(pattern=pattern,repl=' ',string=text)
    
    pattern=re.compile(r'\s+')  #  이중 space 제거
    text = re.sub(pattern=pattern,repl=' ',string=text)

    return text

In [17]:
df['Abstract'] = df['Abstract'].astype('string')
df['Abstract'] = df['Abstract'].apply(preprocessing)

In [18]:
# Using Keras for word-level one-hot encoding

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['Abstract'])

sequences = tokenizer.texts_to_sequences(df['Abstract'])

one_hot_results = tokenizer.texts_to_matrix(df['Abstract'], mode='binary')

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 61904 unique tokens.


In [19]:
maxlen = 0

for i in sequences:
  if maxlen < len(i):
    maxlen = len(i)

In [20]:
x = sequences
y = list(df['WoS Categories'])

for i in range(len(y)):
    label = y[i].split('; ')
    y[i] = label
label_dict = {}
count = 0
for i in y: # label 정수 인코딩을 위한 dictionary 생성
    for j in i:
        if j not in label_dict.keys():
            label_dict[j] = count
            count += 1 

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=.2, random_state=2)
def vectorize_sequences(sequences, dimension=(len(label_dict.keys()))):
  results = np.zeros((len(sequences), dimension))
  count = 0
  for i in sequences:
    for j in i:
      results[count, label_dict[j]] = 1.
    count += 1  
  return results

y_train = vectorize_sequences(y_train)
y_val = vectorize_sequences(y_val)
y_test = vectorize_sequences(y_test)

In [22]:
from tensorflow.keras import preprocessing

x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [23]:
maxlen

1208

In [24]:
embed_dim = 100  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 200  # Hidden layer size in feed forward network inside transformer


inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, 10000, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(100, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(len(label_dict.keys()), activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import ModelCheckpoint

filepath = '/content/drive/My Drive/project/model.{epoch:02d}.h5'
modelckpt = ModelCheckpoint(filepath=filepath)
model.compile("adam", "binary_crossentropy", metrics='acc')
history = model.fit(
    x_train, y_train, batch_size=128, epochs=30, validation_data=(x_val, y_val)
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30

In [None]:
!nvidia-smi