In [19]:
# Ignore the warnings
import warnings
# warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# System related and data input controls
import os

# Auto reload of library
%reload_ext autoreload
%autoreload 2

# Python path: make the project's shared 'Module' directory importable.
import sys
# Name of the project root folder searched for in the current working directory.
base_folder = 'DataScience'
# Keep everything before the first occurrence of base_folder in the cwd and re-append
# base_folder. If the cwd does not contain base_folder, split() returns the whole cwd,
# so this resolves to <cwd>/DataScience instead.
location_base = os.path.join(os.getcwd().split(base_folder)[0], base_folder)
# Directories to add to sys.path (currently just <location_base>/Module).
location_module = [os.path.join(location_base, 'Module')] 
for each in location_module:
    # Skip entries already present so re-running the cell does not add duplicates.
    if each not in sys.path:
        sys.path.append(each)
        
from import_KK import *
DeviceStrategy_GPU()
from preprocessing_KK import *
from preprocessing_text_KK import * ##
from visualization_KK import * ##
from algorithm_textmining_KK import *
from algorithm_machinelearning_KK import *
from algorithm_deeplearning_KK import *
from evaluation_KK import *


Tensorflow Version:  2.10.0
Keras Version:  2.10.0
Num of Physical GPUs Available:  1
Cuda is ready?  True
Cuda Version:  64_112
Cudnn Version:  64_8 

Torch Version:  2.1.0
Torch Cuda Version: 11.8
Torch Cudnn Version:8700
There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 4070 Laptop GPU


In [21]:
# Set a fixed random seed for Python's `random`, NumPy, and PyTorch (including all CUDA devices)
seed_val = 123
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Token length each sentence is padded/truncated to (passed as `seq_len` to the preprocessing function).
SEQ_LEN = 64
# Number of sentences per DataLoader batch.
BATCH_SIZE = 32
# Number of passes over the training data.
EPOCHS = 2
# Learning rate handed to the optimizer.
LEARNING_RATE = 2.0e-5
# Pretrained checkpoint name; the trailing comment lists the candidates that were considered.
MODEL_NAME = 'bert-base-multilingual-cased'    # 'bert-base-multilingual-cased', 'klue/roberta-base', 'monologg/kobert'
# Checkpoint path: <cwd>/Model/modeling_BERTsentiment_<YYYYMMDD>.pt, stamped with today's date.
# NOTE: this cell only builds the path string; it does not create the 'Model' directory.
MODELSAVE_LOCATION = os.path.join(os.getcwd(), 'Model', 
                                  'modeling_BERTsentiment_'+datetime.datetime.today().strftime("%Y%m%d")+'.pt')

# Data Preprocessing

In [None]:
def preprocessing_sentence_to_BERTinput(X_series, Y_series, tokenizer,
                                        seq_len=128, batch_size=32, sampler='random'):
    """Tokenize sentences and wrap them, with their labels, in a PyTorch DataLoader.

    Parameters
    ----------
    X_series : iterable
        Raw sentences. Every element is passed through ``str()`` before
        tokenization, so non-string entries (e.g. NaN) do not raise.
    Y_series : array-like
        One label per sentence, in the same order as ``X_series``.
    tokenizer : object
        Must expose ``encode_plus(text, ...)`` returning a mapping that contains
        ``'input_ids'`` and ``'attention_mask'``.
    seq_len : int, default 128
        Length every sequence is padded or truncated to.
    batch_size : int, default 32
        Number of samples per batch.
    sampler : {'random', 'sequential'}, default 'random'
        Order in which the DataLoader draws samples.

    Returns
    -------
    DataLoader
        Yields ``(input_ids, attention_mask, labels)`` batches.

    Raises
    ------
    ValueError
        If ``sampler`` is not one of the supported names.
    """
    # Validate the sampler first so a typo fails before the (slow) tokenization pass.
    if sampler == 'random':
        sampler_cls = RandomSampler
    elif sampler == 'sequential':
        sampler_cls = SequentialSampler
    else:
        raise ValueError(
            "sampler must be 'random' or 'sequential', got {!r}".format(sampler))

    # The tokenizer inserts the special tokens itself (add_special_tokens=True);
    # also writing "[CLS]"/"[SEP]" into the text would duplicate them and waste
    # sequence length, so the raw sentence is passed through unchanged.
    encoded = [tokenizer.encode_plus(str(sentence), max_length=seq_len,
                                     padding='max_length', truncation=True,
                                     return_attention_mask=True,
                                     add_special_tokens=True) for sentence in X_series]

    # Explicit int64 so the index tensors do not depend on the platform's default
    # NumPy integer width.
    tokens = torch.tensor([item['input_ids'] for item in encoded], dtype=torch.long)
    masks = torch.tensor([item['attention_mask'] for item in encoded], dtype=torch.long)
    # Labels keep whatever dtype the caller supplied.
    targets = torch.tensor(np.asarray(Y_series))

    # Connect to a PyTorch DataLoader
    data = TensorDataset(tokens, masks, targets)
    return DataLoader(data, sampler=sampler_cls(data), batch_size=batch_size)

In [22]:
# Read the tab-separated train/test rating files from <cwd>/Data/MovieReview_NSMC
_nsmc_dir = os.path.join(os.getcwd(), 'Data', 'MovieReview_NSMC')
train = pd.read_csv(os.path.join(_nsmc_dir, 'ratings_train.txt'), sep='\t')
test = pd.read_csv(os.path.join(_nsmc_dir, 'ratings_test.txt'), sep='\t')

# Split each frame into its text column ('document') and target column ('label')
X_train, X_test = train['document'], test['document']
Y_train, Y_test = train['label'], test['label']

# Tokenize both splits with the pretrained tokenizer and wrap them in DataLoaders
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
_loader_options = dict(seq_len=SEQ_LEN, batch_size=BATCH_SIZE)
train_dataloader = preprocessing_sentence_to_BERTinput(X_train, Y_train, tokenizer,
                                                       **_loader_options)
test_dataloader = preprocessing_sentence_to_BERTinput(X_test, Y_test, tokenizer,
                                                      **_loader_options)

# Modeling

In [23]:
def modeling_BERTsentiment(device, model_name, train_dataloader, validation_dataloader,
                           num_labels=2, epochs=10, learning_rate=1.0e-5, early_stopping_patience=10,
                           save_location=None):
    """Fine-tune a BERT sequence classifier and validate it after every epoch.

    Parameters
    ----------
    device : torch.device
        Device the model and every batch are moved to.
    model_name : str
        Checkpoint name/path given to ``BertForSequenceClassification.from_pretrained``.
    train_dataloader, validation_dataloader : iterable
        Yield ``(input_ids, attention_mask, labels)`` batches of tensors.
    num_labels : int, default 2
        Number of target classes.
    epochs : int, default 10
        Maximum number of training epochs.
    learning_rate : float, default 1e-5
        Learning rate of the AdamW optimizer.
    early_stopping_patience : int, default 10
        Stop once the validation loss has failed to improve this many epochs in a row.
    save_location : str or None, default None
        If given, the ``state_dict`` of the epoch with the best validation accuracy
        so far is written to this path (parent directories are created when missing).

    Returns
    -------
    The model in the state reached after the last executed epoch. The
    best-accuracy weights are only available from ``save_location``.
    """
    # Helper: time formatting
    def format_time(elapsed):
        """Render a duration in seconds as h:mm:ss."""
        return str(datetime.timedelta(seconds=int(round(elapsed))))

    # Create the checkpoint directory up front so a missing folder cannot make
    # torch.save fail after a full epoch of training has already been spent.
    if save_location is not None:
        save_dir = os.path.dirname(save_location)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

    # Load the model onto the requested device (works on CPU as well as CUDA)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate,  # learning rate
                      eps=1e-8)  # epsilon guarding against division by zero

    # Training
    best_validation_loss = float('inf')
    best_accuracy = 0.0
    patience = early_stopping_patience
    model.zero_grad()
    for epoch_i in range(epochs):
        print('\n======== Epoch {:} / {:} ======='.format(epoch_i + 1, epochs))
        print('Training...')

        # Set start time
        t0 = time.time()
        total_loss = 0.0

        # Switch to training mode
        model.train()
        for batch in tqdm(train_dataloader):
            ## Put the batch on the target device and unpack it
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            ## Forward pass; because labels are supplied, outputs[0] is the loss
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs[0]
            total_loss += loss.item()

            # Backward pass, gradient clipping, parameter update, gradient reset
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            model.zero_grad()

        # Average training loss over the batches of this epoch
        avg_train_loss = total_loss / max(len(train_dataloader), 1)
        print("Average Loss: {0:.2f}".format(avg_train_loss), " Epoch Took: {:}".format(format_time(time.time() - t0)))

        # Validation
        print("Validation...")
        model.eval()    # Switch to evaluation mode
        # Accumulators are reset every epoch so the metrics describe this epoch only.
        eval_loss, n_correct, n_samples, nb_eval_steps = 0.0, 0, 0, 0
        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

            # Do not calculate gradients during validation
            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            # With labels supplied the output is (loss, logits, ...): take the
            # validation loss from index 0 and the logits from index 1.
            eval_loss += outputs[0].item()
            predictions = outputs[1].argmax(dim=1)  # arg-max over the class axis
            n_correct += (predictions.view(-1) == b_labels.view(-1)).sum().item()
            n_samples += b_labels.numel()
            nb_eval_steps += 1

        # Mean batch loss, and per-sample accuracy (so a smaller final batch is
        # not over-weighted).
        avg_validation_loss = eval_loss / max(nb_eval_steps, 1)
        avg_validation_accuracy = n_correct / max(n_samples, 1)
        print("Validation Loss: {0:.2f}".format(avg_validation_loss), " Validation Accuracy: {0:.2f}".format(avg_validation_accuracy))

        # Save before the early-stopping check so an improved final epoch is not lost.
        if avg_validation_accuracy > best_accuracy:
            best_accuracy = avg_validation_accuracy
            if save_location is not None:
                torch.save(model.state_dict(), save_location)

        # Early stopping based on validation loss
        if avg_validation_loss < best_validation_loss:
            best_validation_loss = avg_validation_loss
            patience = early_stopping_patience  # Reset patience
        else:
            patience -= 1  # Decrease patience

        if patience <= 0:
            print("Early stop: validation loss does not improve for {} epochs".format(early_stopping_patience))
            break

    return model
        
# Fine-tune the classifier. The test split's DataLoader is passed as the validation set,
# so the reported validation metrics (and the saved checkpoint) are selected on test data.
model = modeling_BERTsentiment(DEVICE, MODEL_NAME, train_dataloader, test_dataloader, epochs=EPOCHS, learning_rate=LEARNING_RATE,
                               save_location=MODELSAVE_LOCATION)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...


  5%|████▏                                                                          | 247/4688 [00:45<13:29,  5.49it/s]


KeyboardInterrupt: 