Google Colab에서 로컬 GPU 사용하기

https://research.google.com/colaboratory/local-runtimes.html

\

데이터셋 다운로드 주소입니다. 내려받은 파일(mbsa.csv)을 Google Drive의 "Colab Notebooks" 폴더에 넣어 사용하세요.

https://www.kaggle.com/datasets/gauravduttakiit/bitcoin-tweets-16m-tweets-with-sentiment-tagged

In [None]:
!pip install tensorflow-text
!pip install tf-models-official

In [2]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub # download preprocess and BERT-model from hub
import tensorflow_text as text
from official.nlp import optimization # adamW

import matplotlib.pyplot as plt

# Only emit TensorFlow log records at ERROR level.
tf.get_logger().setLevel('ERROR')

In [None]:
import pandas as pd
import numpy as np
import re

# Mount Google Drive under /content/gdrive/ so the dataset file can be read from it.
from google.colab import drive
drive.mount('/content/gdrive/') # use mbsa.csv in gdrive

In [None]:
# Load mbsa.csv from Google Drive, keeping only the columns and rows used for training
# (the commented lines below can cache the result as mbsa_processed.csv).
path = '/content/gdrive/My Drive/Colab Notebooks' # dataset path
columns_to_keep = ["Sentiment"] + ["text"] # only use "sentiment" and "text"
max_rows = 10000000 # number of rows to train on (the full file has 19,344,048)

# Let the CSV parser drop the unused columns and stop after `max_rows` instead of
# materialising the whole file and slicing it afterwards: same rows, much lower
# peak memory, and later cells no longer work on a slice of a bigger frame.
data = pd.read_csv(path+"/mbsa.csv", usecols=columns_to_keep, nrows=max_rows)
data = data[columns_to_keep] # usecols keeps the file's column order; restore ours
#data.to_csv("mbsa_processed.csv", index=False)

#data = pd.read_csv("mbsa_processed.csv")

print(data['text'])

In [None]:
def sentiment(x):
  """Encode a sentiment tag as an integer class id.

  'Positive' maps to 1, 'Negative' maps to 2 and every other value maps to 0.
  """
  return 1 if x == 'Positive' else (2 if x == 'Negative' else 0)

# strip URLs from tweets
def process_text(sentence):
  """Return `sentence` converted to str with every token starting at 'http' removed."""
  # Only URLs are stripped. A broader variant that also removed @mentions and
  # #hashtags was: r"@\S*|#\S+|http\S*"
  url_pattern = r'http\S*'
  return re.sub(url_pattern, "", str(sentence))

# Clean the tweets and encode the labels into a NEW frame. The raw `data` frame is
# left untouched so this cell can be re-run safely: pushing already-encoded integer
# labels through sentiment() a second time would silently turn every label into 0.
data_processed = data.assign(
    text=data['text'].map(process_text),
    Sentiment=data['Sentiment'].map(sentiment).astype(np.int32),
)

from sklearn.model_selection import train_test_split
# 60% train / 20% validation / 20% test: hold out 20% for testing, then take a
# quarter of the remaining 80% (= 20% of the total) for validation.
X_train, X_test, y_train, y_test = train_test_split(data_processed['text'], data_processed['Sentiment'], test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)


def make_dataset(Text, Label, batch_size, seed, num_classes=None):
    """Build a shuffled, batched `tf.data.Dataset` of (text, one-hot label) pairs.

    Args:
        Text: sequence of raw strings, one per example.
        Label: sequence of integer class ids aligned with `Text`.
        batch_size: number of examples per batch.
        seed: seed passed to the shuffle step.
        num_classes: width of the one-hot encoding. The default (None) keeps the
            previous behaviour of letting `to_categorical` infer it from `Label`;
            pass it explicitly so a split that happens to miss the highest class
            id still gets labels of the right width.

    Returns:
        A dataset yielding `(text batch, one-hot label batch)` tuples.
    """
    # One-hot encode the labels (required by a categorical cross-entropy loss).
    # With three classes:
    # [0] -> [1, 0, 0]
    # [1] -> [0, 1, 0]
    # [2] -> [0, 0, 1]
    one_hot_labels = tf.keras.utils.to_categorical(Label, num_classes=num_classes)
    label_ds = tf.data.Dataset.from_tensor_slices(one_hot_labels)

    string_ds = tf.data.Dataset.from_tensor_slices(Text)

    ds = tf.data.Dataset.zip((string_ds, label_ds))

    # Shuffle individual examples, then batch. Prefetch goes last so that it
    # overlaps the preparation of whole batches with their consumption; placed
    # before shuffle/batch it would only buffer single raw examples.
    ds = ds.shuffle(buffer_size=batch_size * 8, seed=seed)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds


In [None]:
batch_size = 32 # lower to 2, 4, 8 or 16 if memory is tight
seed = 42

# Wrap each split in its own tf.data pipeline (same batch size and shuffle seed).
train_ds, val_ds, test_ds = [
    make_dataset(texts, labels, batch_size, seed)
    for texts, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
]

In [None]:
# Working handles for the three splits; later cells rebind these names.
my_train_ds, my_val_ds, my_test_ds = train_ds, val_ds, test_ds

In [None]:
# Number of output classes. Label ids used in this notebook:
# 0 = neutral, 1 = positive, 2 = negative.
num_classes = 3 # neutral, positive, negative

In [None]:
# Training split: no cache(). cache() replays exactly the element order recorded
# during the first pass, so placing it after the shuffle would freeze the order and
# every epoch would see identical batches. The data already lives in memory, so a
# cache would only add a second copy of it.
my_train_ds = my_train_ds.prefetch(buffer_size=tf.data.AUTOTUNE)
# Validation / test splits: a fixed order is desirable here, because it keeps
# repeated passes over the same split consistent with each other.
my_val_ds = my_val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
my_test_ds = my_test_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
bert_model_name = 'bert_multi_cased_L-12_H-768_A-12'

# One row per supported model: name -> (encoder handle, matching preprocessing handle).
_bert_handles = {
    'bert_en_uncased_L-12_H-768_A-12': (
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    ),
    'bert_en_cased_L-12_H-768_A-12': (
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    ),
    'bert_multi_cased_L-12_H-768_A-12': (
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    ),
    'small_bert/bert_en_uncased_L-2_H-128_A-2': (
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    ),
}

# Split the table into the two lookup maps used below.
map_name_to_handle = {name: encoder for name, (encoder, _) in _bert_handles.items()}
map_model_to_preprocess = {name: preprocess for name, (_, preprocess) in _bert_handles.items()}

# Resolve the handles for the selected model; the preprocessing model acts as its tokenizer.
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

In [None]:
# example: push one sentence through the preprocessing model and the BERT encoder

# Load both TF Hub layers up front.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
bert_model = hub.KerasLayer(tfhub_handle_encoder)

# Preprocess a sample sentence, then encode it.
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)
bert_results = bert_model(text_preprocessed)

# To inspect the intermediate values, print for example:
#   list(text_preprocessed.keys())
#   text_preprocessed["input_word_ids"], text_preprocessed["input_mask"], text_preprocessed["input_type_ids"]
#   bert_results["pooled_output"], bert_results["sequence_output"]

In [None]:
def build_classifier_model():
    """Assemble the end-to-end classifier: raw string -> class probabilities.

    The model chains the TF Hub preprocessing layer, the trainable BERT encoder,
    a dropout layer and a softmax dense layer with `num_classes` outputs.

    Returns:
        A `tf.keras.Model` taking a batch of strings and returning one softmax
        vector per string.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    to_encoder_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')

    # Use the encoder's pooled, per-sentence output as the feature vector
    # (the per-token 'sequence_output' is not used here).
    sentence_vector = bert_encoder(to_encoder_inputs(raw_text))['pooled_output']

    # Classification head: dropout against overfitting, then softmax over the classes.
    regularized = tf.keras.layers.Dropout(0.1)(sentence_vector)
    class_probabilities = tf.keras.layers.Dense(
        num_classes, activation='softmax', name='classifier'
    )(regularized)
    return tf.keras.Model(raw_text, class_probabilities)

In [None]:
# Instantiate the classifier defined by build_classifier_model().
classifier_model = build_classifier_model()

# example: uncomment to run the untrained model on the sample sentence
#bert_raw_result = classifier_model(tf.constant(text_test))
#print(tf.constant(text_test))

In [None]:
from keras import backend as K

def balanced_recall(y_true, y_pred):
    """Average of the per-class recalls over the columns of `y_pred`.

    For each class column, recall = TP / (TP + FN). A score only counts as a
    positive where its value, clipped to [0, 1], rounds to 1. The per-class
    values are summed and divided by the number of columns.
    """
    n_classes = y_pred.shape[1]

    def class_recall(idx):
        # Recall restricted to column `idx`; epsilon guards an empty class.
        truth = y_true[:, idx]
        hits = K.sum(K.round(K.clip(truth * y_pred[:, idx], 0, 1)))
        actual_positives = K.sum(K.round(K.clip(truth, 0, 1)))
        return hits / (actual_positives + K.epsilon())

    return sum([class_recall(idx) for idx in range(n_classes)]) / n_classes

def balanced_precision(y_true, y_pred):
    """Average of the per-class precisions over the columns of `y_pred`.

    For each class column, precision = TP / (TP + FP). A score only counts as
    a positive where its value, clipped to [0, 1], rounds to 1. The per-class
    values are summed and divided by the number of columns.
    """
    n_classes = y_pred.shape[1]

    def class_precision(idx):
        # Precision restricted to column `idx`; epsilon guards "nothing predicted".
        scores = y_pred[:, idx]
        hits = K.sum(K.round(K.clip(y_true[:, idx] * scores, 0, 1)))
        flagged_positives = K.sum(K.round(K.clip(scores, 0, 1)))
        return hits / (flagged_positives + K.epsilon())

    return sum([class_precision(idx) for idx in range(n_classes)]) / n_classes

def balanced_f1_score(y_true, y_pred):
    """F1 score built from balanced_precision and balanced_recall.

    Computed as 2 * (P * R) / (P + R + epsilon); epsilon avoids a division by
    zero when both precision and recall are 0.
    """
    p = balanced_precision(y_true, y_pred)
    r = balanced_recall(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

In [None]:
epochs = 5
# Must stay below `epochs`. With patience equal to the epoch count (previously 5 of 5)
# the callback can never stop training: even if only the first epoch improves, the
# "no improvement" counter reaches at most epochs - 1.
early_stopping_patience = 2


# Metrics reported during training / validation.
METRICS = [
    tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
    balanced_recall,
    balanced_precision,
    balanced_f1_score
]

# Guard against overfitting using the validation split (X_val, y_val): stop once
# val_loss has not improved for `early_stopping_patience` epochs and roll back to
# the best weights seen.
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", patience = early_stopping_patience, restore_best_weights = True)

# AdamW optimizer; the first 10% of all training steps are used for warm-up.
steps_per_epoch = tf.data.experimental.cardinality(my_train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)
init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
# Configure training: optimizer, loss and metrics.
# The labels are one-hot encoded and the model ends in a softmax layer, which is the
# input format 'categorical_crossentropy' works with.
classifier_model.compile(optimizer=optimizer, # classifier_model + optimizer + loss function + metrics
                         loss='categorical_crossentropy',
                         metrics=METRICS)

In [None]:
# Fine-tune the model on the training split, validating after every epoch.
# `history` holds the value returned by fit().
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(my_train_ds,
                               validation_data=my_val_ds,
                               epochs=epochs,
                               callbacks=[earlystop_callback])

In [None]:
from sklearn.metrics import classification_report

# Read texts and labels in ONE pass over the test set so the two lists are aligned
# by construction, whatever shuffling or caching the input pipeline applies.
# (Two separate passes only line up if both happen to yield the same order.)
test_pairs = [(x.numpy(), y.numpy()) for x, y in my_test_ds.unbatch()]
# dataset -> list of raw texts
lx_test = [x for x, _ in test_pairs]
# one-hot labels -> class ids
ly_test = np.argmax([y for _, y in test_pairs], axis=1)
# predicted class probabilities -> predicted class ids
y_pred = np.argmax(classifier_model.predict(lx_test), axis=1)
print(classification_report(ly_test, y_pred))

In [None]:
# Persist the trained model to Google Drive, then load it back from disk.
saved_model_path = "/content/gdrive/My Drive/Colab Notebooks/natural/models/natural_v1"
classifier_model.save(saved_model_path)

# Objects handed to load_model under the names the saved model refers to.
custom_objects = {
    'balanced_recall': balanced_recall,
    'balanced_precision': balanced_precision,
    'balanced_f1_score': balanced_f1_score,
    'AdamWeightDecay': optimizer,
}
classifier_model = tf.keras.models.load_model(saved_model_path, custom_objects=custom_objects)

In [None]:
# Print the layer-by-layer summary of the (reloaded) model.
classifier_model.summary()

In [None]:
# Quick sanity check: classify a couple of hand-written sentences.
in_sentences = ["never again",
                "very very good"]
result = classifier_model(tf.constant(in_sentences))
# Index = class id produced by the label encoding (0 neutral, 1 positive, 2 negative).
my_labels = ["neutral", "positive", "negative"]

# The loop variable must not be named `re`: that would overwrite the imported `re`
# module in the notebook namespace, and process_text() would fail when re-run.
for probs in result:
  print(f'{my_labels[np.argmax(probs)]} {np.max(probs)}')