In [None]:
!pip install transformers

In [None]:
import os

In [None]:
from google.colab import drive

# Make Google Drive visible to the notebook under /content/mnt so the CSV
# read in a later cell can be found there.
drive.mount(
    '/content/mnt'
)

In [None]:
import pandas as pd

# Read the review CSV from the mounted Drive folder into a DataFrame.
# NOTE(review): the file is decoded as cp949 (a Korean Windows code page);
# confirm this matches how the CSV was actually saved.
dataset = pd.read_csv(
    "/content/mnt/MyDrive/data/IMDB Dataset.csv",
    encoding='cp949',
)

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
dataset.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. `stratify` keeps the sentiment
# class ratio the same in both parts, and the fixed `random_state` makes the
# split (and therefore every downstream metric) reproducible across
# "Restart kernel -> Run all".
train_idx, test_idx = train_test_split(
    dataset.index,
    test_size=0.2,
    stratify=dataset.sentiment,
    random_state=42,
)

# The split returns index *labels*, so select rows with .loc (label-based)
# rather than .iloc (position-based).
test_set = dataset.loc[test_idx]

# Renumber the training rows 0..n-1 for the second (train/validation) split.
# Chaining reset_index avoids mutating a derived frame in place.
train_set = dataset.loc[train_idx].reset_index(drop=True)

In [None]:
# Carve 30% of the remaining training rows off as a validation set, again
# stratified on sentiment and with a fixed seed so the split is reproducible.
# Overall proportions: 0.8 * 0.7 = 56% train, 0.8 * 0.3 = 24% validation,
# 20% test.
train_idx, valid_idx = train_test_split(
    train_set.index,
    test_size=0.3,
    stratify=train_set.sentiment,
    random_state=42,
)

# Select by index label (.loc). valid_set must be taken *before* train_set is
# rebound, because both are slices of the same pre-split frame.
# NOTE: this cell rebinds `train_set`, so re-running it without re-running the
# previous cell splits the already-reduced training set again.
valid_set = train_set.loc[valid_idx]
train_set = train_set.loc[train_idx]

In [None]:
# (rows, columns) of the train, validation and test frames, in that order.
tuple(frame.shape for frame in (train_set, valid_set, test_set))

In [None]:
import torch

class BertDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per item for sequence classification.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``, which is the format consumed by the Hugging Face ``Trainer``.

    Args:
        reviews: Indexable sequence of review texts.
        sentiments: Indexable sequence of labels, same length as ``reviews``.
            Integer labels are used as-is; string labels are translated
            through ``label_map``.
        tokenizer: Callable Hugging Face tokenizer exposing ``model_max_length``.
        max_len: Optional sequence length to pad/truncate to. Defaults to
            ``tokenizer.model_max_length``.
        label_map: Optional ``{label_text: class_id}`` mapping for string
            labels. Defaults to ``DEFAULT_LABEL_MAP``.

    Raises:
        ValueError: If ``reviews`` and ``sentiments`` differ in length, or (on
            item access) if a string label is missing from ``label_map``.
    """

    # Applied only when a label is a string; integer labels bypass it.
    # NOTE(review): assumes the CSV spells its classes "negative"/"positive" --
    # confirm against dataset.head(), or pass label_map explicitly.
    DEFAULT_LABEL_MAP = {"negative": 0, "positive": 1}

    def __init__(self, reviews, sentiments, tokenizer, max_len=None, label_map=None):
        if len(reviews) != len(sentiments):
            raise ValueError(
                f"reviews and sentiments must have the same length "
                f"(got {len(reviews)} and {len(sentiments)})"
            )
        self.reviews    = reviews
        self.sentiments = sentiments
        self.tokenizer  = tokenizer
        self.max_len    = tokenizer.model_max_length if max_len is None else max_len
        self.label_map  = dict(self.DEFAULT_LABEL_MAP if label_map is None else label_map)

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def _label_id(self, sentiment):
        """Return the integer class id for ``sentiment``.

        String labels are looked up in ``self.label_map``; anything else is
        returned unchanged so already-numeric labels keep working.
        """
        if isinstance(sentiment, str):
            try:
                return self.label_map[sentiment]
            except KeyError:
                raise ValueError(
                    f"Unknown sentiment label {sentiment!r}; "
                    f"expected one of {sorted(self.label_map)}"
                ) from None
        return sentiment

    def __getitem__(self, index):
        """Tokenize the review at ``index`` and return model-ready tensors."""
        review = str(self.reviews[index])

        # Calling the tokenizer directly is the supported entry point
        # (encode_plus is the deprecated spelling of the same operation).
        encoded_review = self.tokenizer(
            review,
            add_special_tokens    = True,
            max_length            = self.max_len,
            return_token_type_ids = False,
            return_attention_mask = True,
            return_tensors        = "pt",
            padding               = "max_length",
            truncation            = True
        )

        # return_tensors="pt" adds a leading batch dimension of size 1;
        # [0] strips it so the DataLoader can stack items into a batch itself.
        return {
            'input_ids': encoded_review['input_ids'][0],
            'attention_mask': encoded_review['attention_mask'][0],
            'labels': torch.tensor(self._label_id(self.sentiments[index]), dtype=torch.long)
        }

In [None]:
from transformers import BertTokenizerFast

# Checkpoint identifiers: one for the tokenizer and one for the model weights.
# The model currently reuses the tokenizer's checkpoint name.
bert_token_model = 'bert-base-uncased'
bert_model_name = bert_token_model  # alternative: 'bert-large-uncased'

# Load the tokenizer for the chosen checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(
    bert_token_model
)

In [None]:
# Wrap the training and validation frames as tokenizing datasets.
# Columns are converted to plain lists so items are addressed by position.
train_set_dataset = BertDataset(
    train_set["review"].tolist(),
    train_set["sentiment"].tolist(),
    tokenizer,
)

valid_set_dataset = BertDataset(
    valid_set["review"].tolist(),
    valid_set["sentiment"].tolist(),
    tokenizer,
)

In [None]:
from transformers import BertForSequenceClassification

# Load the checkpoint named by bert_model_name wrapped in a
# sequence-classification model.
model = BertForSequenceClassification.from_pretrained(
    bert_model_name
)

In [None]:
# Transfer-learning strategy: which parts of the BERT backbone (model.bert)
# stay trainable. Parameters outside model.bert are never touched here.
#   1 -> freeze the whole backbone
#   2 -> freeze everything in the backbone except the pooler
#   3 -> freeze everything except the pooler and the LAST encoder layer
tl_strategy = 3

# Derive the last encoder layer from the model config instead of hard-coding
# it: "layer.23" only exists in 24-layer checkpoints, so with a 12-layer model
# it matched nothing and strategy 3 silently degraded to strategy 2.
# The trailing dot keeps e.g. layer 1 from also matching layers 10-19.
last_layer_prefix = f"encoder.layer.{model.config.num_hidden_layers - 1}."

if tl_strategy == 1:
    trainable_prefixes = ()
elif tl_strategy == 2:
    trainable_prefixes = ('pooler',)
elif tl_strategy == 3:
    trainable_prefixes = ('pooler', last_layer_prefix)
else:
    # Unknown strategy: leave every parameter as it is.
    trainable_prefixes = None

if trainable_prefixes is not None:
    for name, param in model.bert.named_parameters():
        # Set the flag explicitly in both directions so that re-running this
        # cell with a different strategy also un-freezes what it should.
        param.requires_grad = name.startswith(trainable_prefixes)

# One-line summary instead of printing every parameter name.
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print(f"strategy {tl_strategy}: {n_trainable:,} of {n_total:,} parameters trainable")

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model_dir = './model'


def compute_metrics(pred):
    """Return the accuracy of a prediction result.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the class is
            taken as the argmax over the last axis) and ``label_ids`` (the
            reference labels).

    Returns:
        dict: ``{'accuracy': <accuracy_score of labels vs. predicted classes>}``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# Training configuration for a single fine-tuning epoch.
#
# Two settings are expressed relative to the run length rather than as
# absolute step counts. If the CSV is the usual 50k-row IMDB set, one epoch at
# batch size 128 is only about 219 optimizer steps on a single device, so:
#   * warmup_steps=500 would end the run while the learning rate is still
#     ramping up (it would never reach its configured peak);
#   * evaluation_strategy="steps" with the default 500-step interval would
#     never evaluate on the validation set during training.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; adjust the keyword if the installed version rejects it.
training_args = TrainingArguments(
    output_dir                  = model_dir,
    num_train_epochs            = 1,
    per_device_train_batch_size = 128,
    per_device_eval_batch_size  = 64,
    warmup_ratio                = 0.1,      # warm up over the first 10% of steps
    weight_decay                = 0.01,
    save_strategy               = "epoch",
    evaluation_strategy         = "epoch"   # evaluate when each checkpoint is saved
)

# The Trainer evaluates on the validation split and reports accuracy through
# compute_metrics.
trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = train_set_dataset,
    eval_dataset    = valid_set_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

In [None]:
# Tokenizing dataset over the held-out test split.
test_set_dataset = BertDataset(
    test_set["review"].tolist(),
    test_set["sentiment"].tolist(),
    tokenizer,
)

# Fresh arguments and Trainer used only for prediction; no train/eval datasets
# are attached.
training_args = TrainingArguments(output_dir="./model", do_predict=True)
trainer = Trainer(model=model, args=training_args, compute_metrics=compute_metrics)

# The cell's displayed value is the prediction result for the test set.
trainer.predict(test_set_dataset)