In [6]:
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for sequence classification on GLUE."""
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
import os
# Select the GPU for this kernel. This is set ahead of the `import torch`
# further down in this cell. setdefault keeps a CUDA_VISIBLE_DEVICES value that
# was already exported before the notebook started instead of overriding it;
# '3' is only the fallback.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '3')
import logging
import os
import random
import sys
import math
from dataclasses import dataclass, field
from typing import Optional
from accelerate import Accelerator

import datasets
import numpy as np
from datasets import load_dataset, load_metric
import torch
import json

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    TrainingArguments,
    default_data_collator,
    get_scheduler,
    set_seed,
    Trainer
)
# from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from utils_glue import LGTMTeacher,cal_loss

In [11]:
class ModelArgs:
    """Model/tokenizer loading options read by the `from_pretrained` calls in this notebook.

    Every keyword defaults to the value that used to be hard-coded, so
    `ModelArgs()` is unchanged; individual settings can be overridden, e.g.
    `ModelArgs(num_layers=3)`.

    Args:
        config_name: Used for AutoConfig instead of `model_name_or_path` when truthy.
        model_name_or_path: Checkpoint location passed to every `from_pretrained` call.
        cache_dir: Forwarded as `cache_dir`.
        model_revision: Forwarded as `revision`.
        use_auth_token: When truthy the loaders receive `use_auth_token=True`, else `None`.
        num_layers: Forwarded to AutoConfig as `num_hidden_layers`.
        use_fast_tokenizer: Forwarded to AutoTokenizer as `use_fast`.
        tokenizer_name: Used for AutoTokenizer instead of `model_name_or_path` when truthy.
    """

    def __init__(
        self,
        config_name=None,
        model_name_or_path='/mnt/zhanyuliang/data/nlp_data/theseus/BertFineTrain/download/stsb/bert-base-uncased-stsb/',
        cache_dir=None,
        model_revision='main',
        use_auth_token=False,
        num_layers=6,
        use_fast_tokenizer=True,
        tokenizer_name=None,
    ):
        self.config_name = config_name
        self.model_name_or_path = model_name_or_path
        self.cache_dir = cache_dir
        self.model_revision = model_revision
        self.use_auth_token = use_auth_token
        self.num_layers = num_layers
        self.use_fast_tokenizer = use_fast_tokenizer
        self.tokenizer_name = tokenizer_name

class DataArgs:
    """Dataset options for this notebook.

    Args:
        task_name: Task identifier; the notebook forwards it to AutoConfig as
            `finetuning_task`. Defaults to "stsb", the value that used to be
            hard-coded, so `DataArgs()` is unchanged.
    """

    def __init__(self, task_name="stsb"):
        self.task_name = task_name

# Instantiate the argument holders.
model_args = ModelArgs()
data_args = DataArgs()
num_labels = 1

# Values shared by the three from_pretrained calls below.
_checkpoint = model_args.model_name_or_path
_auth_token = True if model_args.use_auth_token else None

# The config overrides num_hidden_layers with model_args.num_layers; the load
# warning recorded under this cell lists the checkpoint's encoder layers 6-11
# as unused.
config = AutoConfig.from_pretrained(
    model_args.config_name or _checkpoint,
    num_labels=num_labels,
    finetuning_task=data_args.task_name,
    num_hidden_layers=model_args.num_layers,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=_auth_token,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name or _checkpoint,
    use_fast=model_args.use_fast_tokenizer,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=_auth_token,
)

model = AutoModelForSequenceClassification.from_pretrained(
    _checkpoint,
    config=config,
    # Only paths containing ".ckpt" are loaded as TensorFlow checkpoints.
    from_tf=".ckpt" in _checkpoint,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=_auth_token,
)

Some weights of the model checkpoint at /mnt/zhanyuliang/data/nlp_data/theseus/BertFineTrain/download/stsb/bert-base-uncased-stsb/ were not used when initializing BertForSequenceClassification: ['bert.encoder.layer.6.output.dense.bias', 'bert.encoder.layer.7.output.LayerNorm.weight', 'bert.encoder.layer.9.output.LayerNorm.bias', 'bert.encoder.layer.7.intermediate.dense.bias', 'bert.encoder.layer.10.attention.output.LayerNorm.bias', 'bert.encoder.layer.11.output.dense.bias', 'bert.encoder.layer.6.output.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.key.bias', 'bert.encoder.layer.6.attention.self.query.bias', 'bert.encoder.layer.7.attention.output.dense.weight', 'bert.encoder.layer.10.attention.self.value.weight', 'bert.encoder.layer.7.attention.output.dense.bias', 'bert.encoder.layer.7.attention.output.LayerNorm.bias', 'bert.encoder.layer.11.output.LayerNorm.bias', 'bert.encoder.layer.11.attention.self.key.bias', 'bert.encoder.layer.6.intermediate.dense.weight', 'bert.encoder.

In [29]:
# Factor lists used to build SHAPE_CONFIG. The entries of each list multiply to
# the number in its name: 64*1*48 = 3072 and 32*1*24 = 768.
input3072_size = [64,1,48]
input768_size = [32,1,24]

# Two-factor lists: 64*64 = 4096 and 32*32 = 1024.
# Not referenced by SHAPE_CONFIG in the cells visible here.
input4096_size = [64,64]
input1024_size = [32,32]

# Long factor lists padded with 1s: 32*16 = 512 and 16*2*2*32 = 2048.
# Not referenced by SHAPE_CONFIG in the cells visible here.
input512_size = [32,1,1,1,1,1,1,1,1,1,1,1,1,1,16]
input2048_size = [16,2,1,1,1,1,1,1,1,1,1,1,1,2,32]

In [30]:
# Maps each layer type returned by _determine_type to the pair of factor lists
# that fine_grained_decomposition unpacks as (FINE_INPUT_SHAPE, FINE_OUTPUT_SHAPE).
# NOTE(review): for FFN_1 the first list factors 3072 and the second 768, which
# looks like (weight rows, weight columns) rather than (in_features,
# out_features) -- confirm against the MPO implementation.
SHAPE_CONFIG = dict(
    attention=(input768_size, input768_size),
    FFN_1=(input3072_size, input768_size),
    FFN_2=(input768_size, input3072_size),
)


def _determine_type(name):
    if 'intermediate' in name:
        return 'FFN_1'
    elif 'output' in name and 'attention' not in name:
        return 'FFN_2'
    elif 'query' in name or 'key' in name or 'value' in name or ('attention' in name and 'output' in name):
        return 'attention'



In [33]:
# Print the total element count of all parameters with requires_grad=True,
# framed by two separator lines. The label string means "parameter count".
_rule = "#########################################################################################################"
_trainable_total = 0
for _param in model.parameters():
    if _param.requires_grad:
        _trainable_total += _param.numel()
print(_rule)
print("参数量：" + str(_trainable_total))
print(_rule)

#########################################################################################################
参数量：66955777
#########################################################################################################


In [36]:
# Collect the names of the encoder weight matrices that will be decomposed.
basic_choose_name = {}  # names to leave out; empty in this notebook
gradient_mask = dict()  # parameter -> all-ones tensor of the same size
gradient_name_mask = dict()  # parameter name -> score (always 0 here)
for name, params in model.named_parameters():
    # Membership is tested against a tuple, not the comma-joined string
    # 'FFN_1,FFN_2,attention': the string form is a substring test and raises
    # TypeError when _determine_type returns None.
    if 'layer' in name and _determine_type(name) in ('FFN_1', 'FFN_2', 'attention') and name not in basic_choose_name:
        gradient_mask[params] = params.new_ones(params.size())
        gradient_name_mask[name] = 0
choose_name = set()
# 'output.dense.weight' is also a substring of the 'attention.output.dense.weight'
# names, so those are matched twice; choose_name is a set, so this is harmless.
for target_name in ['intermediate.dense.weight','output.dense.weight','attention.self.query.weight','attention.self.key.weight','attention.self.value.weight','attention.output.dense.weight']:
    # The exclusion check looks up the parameter name k; basic_choose_name is
    # keyed by parameter names (see the loop above), not by the target suffix.
    target_group = {k: v for k, v in gradient_name_mask.items() if (target_name in k) and ("bias" not in k) and (k not in basic_choose_name)}
    # Every score is 0, so this stable sort keeps the original order.
    sort_g = dict(sorted(target_group.items(), key=lambda x: x[1], reverse=True))
    choose_name.update(sort_g)

In [52]:
import sys
import os
from pathlib import Path

# Current working directory.
current_dir = Path.cwd()

# Parent of the working directory.
parent_dir = current_dir.parent

# Put the parent directory on sys.path (presumably so the compress_tools
# imports that follow resolve from there -- confirm). The membership guard
# keeps re-runs of this cell from stacking duplicate entries.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

import re
from tqdm import tqdm
from compress_tools.Matrix2MPO import MPO
from compress_tools.MPOtorch import LinearDecomMPO
def fine_grained_decomposition(module_name, truncate_num=10000):
    """Replace one linear layer of the global `model` with an MPO-decomposed layer.

    The layer's weight is decomposed with `MPO.matrix2mpo`, a fresh
    `LinearDecomMPO` is installed under the same attribute of the parent
    module, and the new layer is initialised through its `from_pretrained`
    method with the decomposed tensors, the original bias and the original
    device.

    Args:
        module_name: Dotted parameter name as yielded by
            `model.named_parameters()`, e.g.
            "bert.encoder.layer.0.attention.self.query.weight".
        truncate_num: Third positional argument passed to `MPO`. Defaults to
            10000, the value that used to be hard-coded. Presumably the
            truncation limit of the decomposition -- TODO confirm against
            compress_tools.
            NOTE(review): the outputs recorded in this notebook show the
            trainable parameter count at 66955777 before the decomposition
            loop and 162507265 after it with this value.

    Raises:
        KeyError: `_determine_type` gives no key present in SHAPE_CONFIG.
        AttributeError: the dotted path does not exist on `model`.
    """
    # Drop ".weight" so the path names the module rather than its tensor.
    module_path = module_name.replace(".weight", "")
    parent_path, _, child_name = module_path.rpartition(".")

    # Walk the attribute path instead of eval()-ing a constructed string.
    # Numeric segments such as "0" resolve through nn.Module attribute lookup,
    # which is how ModuleList children are registered.
    parent_module = model
    for attr in filter(None, parent_path.split(".")):
        parent_module = getattr(parent_module, attr)
    layer_module = getattr(parent_module, child_name)

    type_name = _determine_type(module_path)
    if type_name not in SHAPE_CONFIG:
        raise KeyError(f"no SHAPE_CONFIG entry for {module_name!r} (type {type_name!r})")
    FINE_INPUT_SHAPE, FINE_OUTPUT_SHAPE = SHAPE_CONFIG[type_name]

    mpo = MPO(FINE_INPUT_SHAPE, FINE_OUTPUT_SHAPE, truncate_num)
    device = layer_module.weight.device
    # Decompose the weight matrix; only the first returned value is used.
    mpo_tensor_set, _, _ = mpo.matrix2mpo(layer_module.weight.cpu().detach().numpy())
    bias = layer_module.bias

    # (1) TODO: mpo decomposition for a module that is already an MPO module.
    # (2) mpo decomposition for the original module: swap it on its parent.
    setattr(parent_module, child_name, LinearDecomMPO(FINE_INPUT_SHAPE, FINE_OUTPUT_SHAPE, None))
    layer_module_new = getattr(parent_module, child_name)
    # Load the decomposed tensors and bias onto the original layer's device.
    layer_module_new.from_pretrained(None, None, mpo_tensor_set, bias, device=device)

In [53]:
# Decompose every selected weight.
# The matching names are snapshotted first: fine_grained_decomposition swaps
# submodules of `model`, so the module tree must not be modified while the
# named_parameters() generator is still walking it.
names_to_decompose = [name for name, _ in model.named_parameters() if name in choose_name]
for name in names_to_decompose:
    print(name)
    fine_grained_decomposition(name)

bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.1.attention.self.query.weight
bert.encoder.layer.1.attention.self.key.weight
bert.encoder.layer.1.attention.self.value.weight
bert.encoder.layer.1.attention.output.dense.weight
bert.encoder.layer.1.intermediate.dense.weight
bert.encoder.layer.1.output.dense.weight
bert.encoder.layer.2.attention.self.query.weight
bert.encoder.layer.2.attention.self.key.weight
bert.encoder.layer.2.attention.self.value.weight
bert.encoder.layer.2.attention.output.dense.weight
bert.encoder.layer.2.intermediate.dense.weight
bert.encoder.layer.2.output.dense.weight
bert.encoder.layer.3.attention.self.query.weight
bert.encoder.layer.3.attention.self.key.weight
bert.encoder.layer.3.attention.self.value.weight
ber

In [54]:
# Print the total element count of all parameters with requires_grad=True,
# framed by two separator lines. The label string means "parameter count".
_rule = "#########################################################################################################"
_trainable_total = 0
for _param in model.parameters():
    if _param.requires_grad:
        _trainable_total += _param.numel()
print(_rule)
print("参数量：" + str(_trainable_total))
print(_rule)

#########################################################################################################
参数量：162507265
#########################################################################################################


In [71]:
# Scratch check: print the integers 1 through 6, one per line.
first, last = 1, 6
for i in range(first, last + 1):
    print(i)

1
2
3
4
5
6


In [58]:
from transformers import BertModel, BertConfig

class BertModelCustom(BertModel):
    """BertModel whose forward pass returns only the tuple of hidden states.

    Construction is inherited unchanged from BertModel.
    """

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
        """Run BertModel.forward and return `outputs.hidden_states`.

        `input_ids` now defaults to None so the model can be driven with
        `inputs_embeds` alone; positional calls that pass `input_ids` first
        behave as before.

        Returns:
            The `hidden_states` tuple of the base model's output. It holds one
            entry per layer plus the initial embedding output, so its length is
            the number of layers + 1.
        """
        outputs = super().forward(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=False,
            output_hidden_states=True,
            # Request the attribute-style output explicitly so `.hidden_states`
            # is available whatever the config's return_dict setting is.
            return_dict=True,
        )

        return outputs.hidden_states

# Build the model with the custom class, reusing the existing `config`.
# This rebinds the global `model`.
_custom_checkpoint = model_args.model_name_or_path
model = BertModelCustom.from_pretrained(_custom_checkpoint, config=config)

Some weights of the model checkpoint at /mnt/zhanyuliang/data/nlp_data/theseus/BertFineTrain/download/stsb/bert-base-uncased-stsb/ were not used when initializing BertModelCustom: ['bert.encoder.layer.6.output.dense.bias', 'bert.encoder.layer.7.output.LayerNorm.weight', 'bert.encoder.layer.9.output.LayerNorm.bias', 'bert.encoder.layer.7.intermediate.dense.bias', 'bert.encoder.layer.10.attention.output.LayerNorm.bias', 'bert.encoder.layer.11.output.dense.bias', 'bert.encoder.layer.6.output.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.key.bias', 'bert.encoder.layer.6.attention.self.query.bias', 'bert.encoder.layer.7.attention.output.dense.weight', 'bert.encoder.layer.10.attention.self.value.weight', 'bert.encoder.layer.7.attention.output.dense.bias', 'bert.encoder.layer.7.attention.output.LayerNorm.bias', 'bert.encoder.layer.11.output.LayerNorm.bias', 'bert.encoder.layer.11.attention.self.key.bias', 'bert.encoder.layer.6.intermediate.dense.weight', 'bert.encoder.layer.11.outpu

In [59]:
# Load the custom model again, this time also forwarding the cache, revision,
# auth-token and TensorFlow-checkpoint options. This rebinds the global `model`.
_custom_load_options = {
    # Only paths containing ".ckpt" are loaded as TensorFlow checkpoints.
    "from_tf": ".ckpt" in model_args.model_name_or_path,
    "config": config,
    "cache_dir": model_args.cache_dir,
    "revision": model_args.model_revision,
    "use_auth_token": True if model_args.use_auth_token else None,
}
model = BertModelCustom.from_pretrained(
    model_args.model_name_or_path,
    **_custom_load_options,
)

Some weights of the model checkpoint at /mnt/zhanyuliang/data/nlp_data/theseus/BertFineTrain/download/stsb/bert-base-uncased-stsb/ were not used when initializing BertModelCustom: ['bert.encoder.layer.6.output.dense.bias', 'bert.encoder.layer.7.output.LayerNorm.weight', 'bert.encoder.layer.9.output.LayerNorm.bias', 'bert.encoder.layer.7.intermediate.dense.bias', 'bert.encoder.layer.10.attention.output.LayerNorm.bias', 'bert.encoder.layer.11.output.dense.bias', 'bert.encoder.layer.6.output.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.key.bias', 'bert.encoder.layer.6.attention.self.query.bias', 'bert.encoder.layer.7.attention.output.dense.weight', 'bert.encoder.layer.10.attention.self.value.weight', 'bert.encoder.layer.7.attention.output.dense.bias', 'bert.encoder.layer.7.attention.output.LayerNorm.bias', 'bert.encoder.layer.11.output.LayerNorm.bias', 'bert.encoder.layer.11.attention.self.key.bias', 'bert.encoder.layer.6.intermediate.dense.weight', 'bert.encoder.layer.11.outpu

In [61]:
# Collect the names of the encoder weight matrices that will be decomposed
# (recomputed here because `model` was rebound in the cells above).
basic_choose_name = {}  # names to leave out; empty in this notebook
gradient_mask = dict()  # parameter -> all-ones tensor of the same size
gradient_name_mask = dict()  # parameter name -> score (always 0 here)
for name, params in model.named_parameters():
    # Membership is tested against a tuple, not the comma-joined string
    # 'FFN_1,FFN_2,attention': the string form is a substring test and raises
    # TypeError when _determine_type returns None.
    if 'layer' in name and _determine_type(name) in ('FFN_1', 'FFN_2', 'attention') and name not in basic_choose_name:
        gradient_mask[params] = params.new_ones(params.size())
        gradient_name_mask[name] = 0
choose_name = set()
# 'output.dense.weight' is also a substring of the 'attention.output.dense.weight'
# names, so those are matched twice; choose_name is a set, so this is harmless.
for target_name in ['intermediate.dense.weight','output.dense.weight','attention.self.query.weight','attention.self.key.weight','attention.self.value.weight','attention.output.dense.weight']:
    # The exclusion check looks up the parameter name k; basic_choose_name is
    # keyed by parameter names (see the loop above), not by the target suffix.
    target_group = {k: v for k, v in gradient_name_mask.items() if (target_name in k) and ("bias" not in k) and (k not in basic_choose_name)}
    # Every score is 0, so this stable sort keeps the original order.
    sort_g = dict(sorted(target_group.items(), key=lambda x: x[1], reverse=True))
    choose_name.update(sort_g)

In [62]:
import sys
import os
from pathlib import Path

# Current working directory.
current_dir = Path.cwd()

# Parent of the working directory.
parent_dir = current_dir.parent

# Put the parent directory on sys.path (presumably so the compress_tools
# imports that follow resolve from there -- confirm). The membership guard
# keeps re-runs of this cell from stacking duplicate entries.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

import re
from tqdm import tqdm
from compress_tools.Matrix2MPO import MPO
from compress_tools.MPOtorch import LinearDecomMPO
def fine_grained_decomposition(module_name, truncate_num=10000):
    """Replace one linear layer of the global `model` with an MPO-decomposed layer.

    The layer's weight is decomposed with `MPO.matrix2mpo`, a fresh
    `LinearDecomMPO` is installed under the same attribute of the parent
    module, and the new layer is initialised through its `from_pretrained`
    method with the decomposed tensors, the original bias and the original
    device.

    Args:
        module_name: Dotted parameter name as yielded by
            `model.named_parameters()`, e.g.
            "encoder.layer.0.attention.self.query.weight".
        truncate_num: Third positional argument passed to `MPO`. Defaults to
            10000, the value that used to be hard-coded. Presumably the
            truncation limit of the decomposition -- TODO confirm against
            compress_tools.

    Raises:
        KeyError: `_determine_type` gives no key present in SHAPE_CONFIG.
        AttributeError: the dotted path does not exist on `model`.
    """
    # Drop ".weight" so the path names the module rather than its tensor.
    module_path = module_name.replace(".weight", "")
    parent_path, _, child_name = module_path.rpartition(".")

    # Walk the attribute path instead of eval()-ing a constructed string.
    # Numeric segments such as "0" resolve through nn.Module attribute lookup,
    # which is how ModuleList children are registered.
    parent_module = model
    for attr in filter(None, parent_path.split(".")):
        parent_module = getattr(parent_module, attr)
    layer_module = getattr(parent_module, child_name)

    type_name = _determine_type(module_path)
    if type_name not in SHAPE_CONFIG:
        raise KeyError(f"no SHAPE_CONFIG entry for {module_name!r} (type {type_name!r})")
    FINE_INPUT_SHAPE, FINE_OUTPUT_SHAPE = SHAPE_CONFIG[type_name]

    mpo = MPO(FINE_INPUT_SHAPE, FINE_OUTPUT_SHAPE, truncate_num)
    device = layer_module.weight.device
    # Decompose the weight matrix; only the first returned value is used.
    mpo_tensor_set, _, _ = mpo.matrix2mpo(layer_module.weight.cpu().detach().numpy())
    bias = layer_module.bias

    # (1) TODO: mpo decomposition for a module that is already an MPO module.
    # (2) mpo decomposition for the original module: swap it on its parent.
    setattr(parent_module, child_name, LinearDecomMPO(FINE_INPUT_SHAPE, FINE_OUTPUT_SHAPE, None))
    layer_module_new = getattr(parent_module, child_name)
    # Load the decomposed tensors and bias onto the original layer's device.
    layer_module_new.from_pretrained(None, None, mpo_tensor_set, bias, device=device)

In [12]:
# Print the module tree of the current global `model`.
# NOTE(review): this cell's execution count (12) is lower than that of the
# cells above it, and its recorded output shows a BertForSequenceClassification,
# whereas the preceding cells rebind `model` to BertModelCustom. The output
# was therefore captured in a different kernel state than file order suggests.
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (2): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (3): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (4): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (5): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=1, bias=True)
)


In [None]:
# sst2: launch training in the background with nohup.
# stdout and stderr both go to sst.log (`2>&1`); the previous `>&1` only
# redirected stdout onto itself, so stderr was never sent to the log file.
nohup python run_glue_mpo_laterloss.py --model_name_or_path /mnt/zhanyuliang/data/nlp_data/theseus/BertFineTrain/download/sst2/bert-base-uncased-finetuned-sst2/ --teacher_model bert-base-uncased --task_name sst2 --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 1e-06 --t_learning_rate 3e-05 --alpha_kd 1.0 --temperature 1.0 --num_train_epochs 3 --output_dir /mnt/zhanyuliang/data/checkpoint/nlp/lgtm/sst2_output/ --eval_steps 20 --do_train --do_eval --train_teacher --init_classifier_to_zero --use_lgtm --overwrite_output_dir >sst.log 2>&1 &

In [None]:
# qnli: launch training in the background with nohup.
# stdout and stderr both go to qnli.log (`2>&1`); the previous `>&1` only
# redirected stdout onto itself, so stderr was never sent to the log file.
nohup python run_glue_mpo_laterloss.py --model_name_or_path /mnt/zhanyuliang/data/nlp_data/theseus/BertFineTrain/download/qnli/bert-base-uncased-qnli/ --teacher_model bert-base-uncased --task_name qnli --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 1e-06 --t_learning_rate 3e-05 --alpha_kd 1.0 --temperature 1.0 --num_train_epochs 3 --output_dir /mnt/zhanyuliang/data/checkpoint/nlp/lgtm/qnli_output/ --eval_steps 100 --do_train --do_eval --train_teacher --init_classifier_to_zero --use_lgtm --overwrite_output_dir >qnli.log 2>&1 &

In [None]:
# qqp: launch training in the background with nohup.
# stdout and stderr both go to qqp_output.log (`2>&1`); the previous `>&1` only
# redirected stdout onto itself, so stderr was never sent to the log file.
# NOTE(review): --learning_rate 3e-05 / --t_learning_rate 1e-06 is the reverse
# of the sst2, qnli, mrpc and mnli commands in this notebook -- confirm it is
# intentional.
nohup python run_glue_mpo_laterloss.py --model_name_or_path /mnt/zhanyuliang/data/nlp_data/theseus/BertFineTrain/download/qqp/bert-base-uncased-QQP/ --teacher_model bert-base-uncased --task_name qqp --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 3e-05 --t_learning_rate 1e-06 --alpha_kd 1.0 --temperature 1.0 --num_train_epochs 3 --output_dir /mnt/zhanyuliang/data/checkpoint/nlp/lgtm/qqp_output/ --eval_steps 100 --do_train --do_eval --train_teacher --init_classifier_to_zero --use_lgtm --overwrite_output_dir >qqp_output.log 2>&1 &

In [None]:
# mrpc: launch training in the background with nohup.
# stdout and stderr both go to mrpc_output.log (`2>&1`); the previous `>&1` only
# redirected stdout onto itself, so stderr was never sent to the log file.
nohup python run_glue_mpo_laterloss.py --model_name_or_path /mnt/zhanyuliang/data/nlp_data/theseus/BertFineTrain/download/mrpc/bert-base-uncased-glue-mrpc/ --teacher_model bert-base-uncased --task_name mrpc --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 1e-06 --t_learning_rate 3e-05 --alpha_kd 1.0 --temperature 1.0 --num_train_epochs 15 --output_dir /mnt/zhanyuliang/data/checkpoint/nlp/lgtm/mrpc_output/ --eval_steps 5 --do_train --do_eval --train_teacher --init_classifier_to_zero --use_lgtm --overwrite_output_dir >mrpc_output.log 2>&1 &

In [None]:
# mnli: launch training in the background with nohup.
# stdout and stderr both go to mnli_output.log (`2>&1`); the previous `>&1` only
# redirected stdout onto itself, so stderr was never sent to the log file.
nohup python run_glue_mpo_laterloss.py --model_name_or_path /mnt/zhanyuliang/data/nlp_data/theseus/BertFineTrain/download/mnli/bert-base-uncased-MNLI/ --teacher_model bert-base-uncased --task_name mnli --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 1e-06 --t_learning_rate 3e-05 --alpha_kd 1.0 --temperature 1.0 --num_train_epochs 3 --output_dir /mnt/zhanyuliang/data/checkpoint/nlp/lgtm/mnli_output/ --eval_steps 100 --do_train --do_eval --train_teacher --init_classifier_to_zero --use_lgtm --overwrite_output_dir >mnli_output.log 2>&1 &

In [None]:
# cola: launch training in the background with nohup.
# stdout and stderr both go to the log file (`2>&1`); the previous `>&1` only
# redirected stdout onto itself, so stderr was never sent to the log file.
# NOTE(review): the log file name says "sst_3layer_5e-6_5e-6" while the task is
# cola and the learning rates are 3e-06 / 5e-06 -- confirm the intended name.
nohup python run_glue_mpo_laterloss.py --model_name_or_path /mnt/zhanyuliang/data/nlp_data/theseus/BertFineTrain/download/cola/bert-base-uncased-finetuned-cola/ --teacher_model bert-base-uncased --task_name cola --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 3e-06 --t_learning_rate 5e-06 --alpha_kd 1.0 --temperature 1.0 --num_train_epochs 6 --output_dir /mnt/zhanyuliang/data/checkpoint/nlp/lgtm/cola_output/ --eval_steps 2 --do_train --do_eval --train_teacher --init_classifier_to_zero --use_lgtm --overwrite_output_dir >log/sst_3layer_5e-6_5e-6_output.log 2>&1 &