# 1. Training Example

## Step 1.1. Load Dataset

In [1]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader
from datasets import load_from_disk

In [2]:
# Load the BERT tokenizer from a local directory holding bert-base-uncased.
# NOTE(review): this is an absolute Windows path specific to one machine;
# point it at your own copy of the model files before running.
tokenizer = BertTokenizer.from_pretrained('F:\\model\\bert-base-uncased')

In [3]:
# Read the training dataset that was previously saved to disk in the
# `wiki_for_sts_32` directory (resolved relative to the working directory).
train_dataset = load_from_disk("wiki_for_sts_32")

# Batch the dataset; drop_last=True discards a final, incomplete batch so
# every batch has exactly `batch_size` items.
# NOTE(review): no later cell shown in this notebook reads `train_loader`;
# the trainer is given `train_dataset` directly.
batch_size = 64
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    drop_last=True,
)

## Step 1.2. init model

In [5]:
from mocose import *

In [7]:
from transformers import BertConfig

In [8]:
# Start from a BertConfig with library-default values; the project-specific
# fields are attached to this object in the cells that follow.
config = BertConfig()

In [9]:
# Extra fields attached to the config (not set by BertConfig() itself).
# NOTE(review): their exact meaning is defined by the `mocose` module, which is
# not shown here. Presumably: out_size = width of the produced sentence
# embedding, mlp_layers / proj_layers = depth of the MLP and projection heads.
# Confirm against mocose.py.
config.out_size=768
config.mlp_layers=2
config.proj_layers=1

In [10]:
# Further project-specific config fields.
# NOTE(review): presumably these control the data-augmentation / perturbation
# applied during contrastive training (fgsm = adversarial perturbation size,
# *_drop_prob = dropout-style probabilities, token_shuffle = shuffle on/off).
# Meanings are inferred from the names only; verify in mocose.py.
config.fgsm = 5e-9
config.embedding_drop_prob = 0.1
config.token_drop_prob = 0
config.feature_drop_prob = 0
config.token_shuffle = False

In [11]:
# Further project-specific config fields.
# NOTE(review): presumably K / K_start size the queue of negative samples and
# ema_decay is the momentum used to update the target branch. This is inferred
# from the names only; verify in mocose.py.
config.K = 512
config.K_start = 128
config.ema_decay = 0.75

In [12]:
# Build the model from the config assembled so far. `MoCoSEModel` comes from
# the star import of `mocose`.
model = MoCoSEModel(config)

In [13]:
# Warm-start the online branch (embeddings, encoder, pooler dense layer) from
# separately saved state dicts.
#
# map_location='cpu' makes the load independent of the device the tensors were
# saved from: without it, a checkpoint written from CUDA tensors is restored
# onto that same device and fails on a host without a matching GPU. The model
# is still on the CPU at this point and is moved to the GPU in a later cell.
#
# NOTE(review): the paths are absolute Windows paths specific to one machine.
# The last statement is left as a bare expression so the notebook still
# displays the load_state_dict result.
model.online_embeddings.load_state_dict(torch.load('F:\\model\\bert-base-uncased-weights\\embeddings.pth', map_location='cpu'))
model.online_encoder.load_state_dict(torch.load('F:\\model\\bert-base-uncased-weights\\encoder.pth', map_location='cpu'))
model.online_pooler.dense.load_state_dict(torch.load('F:\\model\\bert-base-uncased-weights\\pooler_dense.pth', map_location='cpu'))

<All keys matched successfully>

In [14]:
# Called after the online branch has been loaded and before the model is moved
# to the GPU.
# NOTE(review): prepare() is defined in `mocose` (not shown); presumably it
# initialises the target branch from the online weights. Confirm.
model.prepare()

In [15]:
# Move the model to the GPU; this requires a CUDA-capable device.
model = model.cuda()

In [16]:
# Freeze the target branch: turn off gradient tracking for every parameter of
# the target encoder and target pooler.
non_optimizer_list = [model.target_encoder, model.target_pooler]
for layer in non_optimizer_list:
    # Module.requires_grad_ applies the flag to all parameters of the module.
    layer.requires_grad_(False)

## Step 1.3. set train arguments

In [17]:
from mocose_tools import *

In [19]:
from transformers.trainer import TrainingArguments

In [20]:
# Training hyper-parameters handed to the trainer.
# - Evaluation is requested every 100 steps (evaluation_strategy="steps").
# - The per-device train batch size reuses `batch_size` from the data-loading
#   cell instead of repeating the literal 64, so the two cannot drift apart.
# NOTE(review): output_dir is an absolute Windows path specific to one machine.
args = TrainingArguments(
    output_dir = 'F:\\trained_model\\mocose_base_out\\',
    evaluation_strategy   = "steps",
    eval_steps            = 100,
    learning_rate         = 3e-5,
    num_train_epochs      = 1.0,
    weight_decay          = 1e-6,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = 64,
)

In [21]:
# Wire the model, training arguments, training set and tokenizer into the
# project's trainer (`MoCoSETrainer` comes from the star import of mocose_tools).
# NOTE(review): no eval_dataset is passed although `args` asks for step-wise
# evaluation; presumably MoCoSETrainer supplies its own evaluation. Confirm.
trainer = MoCoSETrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
)

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


In [None]:
# Start training with the trainer configured above.
# This cell has no recorded execution count in the saved notebook.
trainer.train()

# 2. Eval Model on STS Task

## Step 2.1. prepare model

In [1]:
from mocose import *

In [3]:
# Load a trained checkpoint from a local directory.
# The directory name (77.27) matches the "Avg." score in the STS results table
# shown further down in this section.
# NOTE(review): absolute Windows path specific to one machine.
model = MoCoSEModel.from_pretrained('F:\\trained_model\\77.27')

In [4]:
# Move the loaded model to the GPU; this requires a CUDA-capable device.
model = model.cuda()

## Step 2.2. evaluate model

In [5]:
from mocose_tools import *

In [6]:
from transformers import BertTokenizer

In [7]:
# Load the BERT tokenizer from a local directory holding bert-base-uncased.
# NOTE(review): absolute Windows path specific to one machine.
tokenizer = BertTokenizer.from_pretrained('F:\\model\\bert-base-uncased')

In [8]:
# Evaluate the model on the STS tasks; the recorded output is the score table
# (STS12-16, STSBenchmark, SICKRelatedness, Avg.).
# NOTE(review): evalModel comes from mocose_tools (not shown);
# pooler='cls_before_pooler' presumably selects the [CLS] hidden state taken
# before the pooler layer. Confirm there.
sum_acc = evalModel(model,tokenizer, pooler = 'cls_before_pooler')

  sent1 = np.array([s.split() for s in sent1])[not_empty_idx]
  sent2 = np.array([s.split() for s in sent2])[not_empty_idx]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 71.47 | 81.41 | 74.47 | 83.45 | 78.99 |    78.68     |      72.44      | 77.27 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+


# 3. Eval Model on Transfer Task

## Step 3.1. prepare model

In [1]:
from mocose import *

In [2]:
# Load a trained checkpoint from a local directory (the same path as in the
# STS evaluation section).
# NOTE(review): absolute Windows path specific to one machine.
model = MoCoSEModel.from_pretrained('F:\\trained_model\\77.27')

In [3]:
# Move the loaded model to the GPU; this requires a CUDA-capable device.
model = model.cuda()

## Step 3.2. evaluate model

In [4]:
from mocose_tools import *

In [5]:
from transformers import BertTokenizer

In [7]:
# Load the BERT tokenizer from a local directory holding bert-base-uncased.
# NOTE(review): absolute Windows path specific to one machine.
tokenizer = BertTokenizer.from_pretrained('F:\\model\\bert-base-uncased')

In [None]:
# Evaluate the model on the transfer tasks.
# This cell has no recorded execution count in the saved notebook, and only a
# tokenizer truncation warning was captured as output.
# NOTE(review): evalTransferModel comes from mocose_tools (not shown);
# pooler='cls_before_pooler' presumably selects the [CLS] hidden state taken
# before the pooler layer. Confirm there.
sum_acc = evalTransferModel(model,tokenizer, pooler = 'cls_before_pooler')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
