<a href="https://colab.research.google.com/github/ysys143/ml2024/blob/main/fine_tuning_KOTE_250K_post_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 파인튜닝
구글 마운트, 허깅페이스 로그인

In [23]:
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login
drive.mount('/content/drive')
HF_TOKEN = userdata.get('HF_TOKEN')
login(HF_TOKEN, add_to_git_credential=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [24]:
!pip install -q pytorch-lightning

### 파인튜닝 시작

In [25]:
!export CUDA_LAUNCH_BLOCKING=1

In [26]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import ElectraModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm

class SentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        comment = str(self.data.iloc[index]['comment'])
        label = self.data.iloc[index]['sentiment']

        encoding = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

class KOTESentimentTagger(pl.LightningModule):
    def __init__(self, pretrained_path):
        super().__init__()
        self.electra = ElectraModel.from_pretrained("beomi/KcELECTRA-base", revision='v2021')
        self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base", revision='v2021')
        self.intermediate_classifier = nn.Linear(self.electra.config.hidden_size, 44)
        self.final_classifier = nn.Linear(44, 3)  # New layer for 3-class classification

        # Load pretrained weights
        pretrained_state_dict = torch.load(pretrained_path, map_location=torch.device('cpu'))
        # Only load weights from the electra model, not the old classifier
        self.load_state_dict(pretrained_state_dict, strict=False)

        # Unfreeze all layers for fine-tuning
        for param in self.parameters():
            param.requires_grad = True

        self.electra.train()

    def forward(self, input_ids, attention_mask):
        self.electra.train()
        output = self.electra(input_ids, attention_mask=attention_mask)
        output = output.last_hidden_state[:,0,:]  # Shape: (batch_size, 768)
        intermediate_output = self.intermediate_classifier(output)  # Shape: (batch_size, 44)
        output = self.final_classifier(intermediate_output)  # Shape: (batch_size, 3)
        output = torch.sigmoid(output)
        return output

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = self(input_ids, attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=2e-7)

def train_model(train_csv, pretrained_path, batch_size=64, max_epochs=10): ## batch_size는 A100 기준 64가 적당(gpu 주의단계)
    # Load the data
    df = pd.read_csv(train_csv)

    # Initialize the model
    model = KOTESentimentTagger(pretrained_path)

    # Create dataset and dataloader
    train_dataset = SentimentDataset(df, model.tokenizer)
    train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=11, # L4: 11, T4: 7
    pin_memory=True  # Add this for faster data transfer to GPU
    )


    # Define callback
    checkpoint_callback = ModelCheckpoint(
        dirpath='checkpoints',
        filename='best-checkpoint',
        save_top_k=1,
        verbose=True,
        monitor='train_loss',
        mode='min'
    )

    # Initialize trainer
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        callbacks=[checkpoint_callback],
        accelerator="gpu",
        devices=1,
        precision="16-mixed"
    )

    # Train the model
    trainer.fit(model, train_loader)

    # Load the best model
    best_model_path = checkpoint_callback.best_model_path
    best_model = KOTESentimentTagger.load_from_checkpoint(best_model_path, pretrained_path=pretrained_path)

    return best_model

def process_csv(input_file, output_file, model):
    df = pd.read_csv(input_file)
    df['sentiment_label'] = -1  # Initialize with -1

    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    with torch.no_grad():
        for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing comments", position=0, leave=True):
            comment = row['comment']
            if pd.isna(comment) or comment.strip() == '':
                continue  # Skip empty or NaN comments

            encoding = model.tokenizer.encode_plus(
                comment,
                add_special_tokens=True,
                max_length=512,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            output = model(input_ids, attention_mask)
            sentiment_label = torch.argmax(output, dim=1).item()

            df.at[index, 'sentiment_label'] = sentiment_label

    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Usage example
if __name__ == "__main__":
    pretrained_path = "/content/drive/MyDrive/2024-NIPA-google-trendpop/model/JS/fine_tuned_model_with_kote_250K.bin"
    train_csv = "/content/drive/MyDrive/2024-NIPA-google-trendpop/final_output/training_data/learning_set_train_17000_oversampled.csv"

    try:
        trained_model = train_model(train_csv, pretrained_path)
    except KeyboardInterrupt:
        print("Training interrupted. Exiting...")
        sys.exit(1)  # Use sys.exit to exit gracefully


  pretrained_state_dict = torch.load(pretrained_path, map_location=torch.device('cpu'))
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name                    | Type         | Params | Mode 
-----------------------------------------------------------------
0 | electra                 | ElectraModel | 123 M  | train
1 | intermediate_classifier | Linear       | 33.8 K | train
2 | final_classifier        | Linear       

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 266: 'train_loss' reached 1.02314 (best 1.02314), saving model to '/content/checkpoints/best-checkpoint-v2.ckpt' as top 1
INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 532: 'train_loss' reached 0.92156 (best 0.92156), saving model to '/content/checkpoints/best-checkpoint-v2.ckpt' as top 1
INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 798: 'train_loss' reached 0.85194 (best 0.85194), saving model to '/content/checkpoints/best-checkpoint-v2.ckpt' as top 1
INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 1064: 'train_loss' was not in top 1
INFO:pytorch_lightning.utilities.rank_zero:Epoch 4, global step 1330: 'train_loss' was not in top 1
INFO:pytorch_lightning.utilities.rank_zero:Epoch 5, global step 1596: 'train_loss' was not in top 1
INFO:pytorch_lightning.utilities.rank_zero:Epoch 6, global step 1862: 'train_loss' was not in top 1
INFO:pytorch_lightning.utilities.rank_zero

In [27]:
# prompt: save the current model with torch
torch.save(trained_model.state_dict(), "/content/drive/MyDrive/2024-NIPA-google-trendpop/model/JS/fine_tuned_model_with_kote_250K_after_trained.bin")

In [28]:
## test 1000 data
#trained_path = "/content/drive/MyDrive/2024-NIPA-google-trendpop/model/kote_fine_tuned_model_12000.bin"
#trained_path = "/content/drive/MyDrive/2024-NIPA-google-trendpop/model/JS/fine_tuned_model_with_kote_250K.bin"
trained_path = "/content/drive/MyDrive/2024-NIPA-google-trendpop/model/JS/fine_tuned_model_with_kote_250K_post_trained.bin"
trained_model = KOTESentimentTagger(trained_path)

input_file = "/content/drive/MyDrive/2024-NIPA-google-trendpop/labeling/quota_sample_1000_use_this.csv"
output_file =  "/content/drive/MyDrive/2024-NIPA-google-trendpop/labeling/quota_sample_1000_test.csv"
process_csv(input_file, output_file, trained_model)

  pretrained_state_dict = torch.load(pretrained_path, map_location=torch.device('cpu'))
Processing comments: 100%|██████████| 1000/1000 [00:13<00:00, 74.72it/s]


Results saved to /content/drive/MyDrive/2024-NIPA-google-trendpop/labeling/quota_sample_1000_test.csv
