In [1]:
import argparse
import logging

import os
import numpy as np
import torch
import tqdm

In [2]:
from transformers import (
    GPT2TimeLMHeadModel,
    GPT2Tokenizer,
)

In [3]:
import sys
sys.path.append('../language-modeling')
from run_time_clm import (
    get_checkpoint,
    get_special_tokens,
    get_data_paths,
    get_dataset)

sys.path.append('../text-generation')
from generation_metrics import GenerationMetrics

2022-11-16 21:20:23.264776: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [4]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [5]:
MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

MODEL_CLASSES = {
    "gpt2": (GPT2TimeLMHeadModel, GPT2Tokenizer),
}

In [6]:
def set_seed(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
        
def adjust_length_to_model(length, max_sequence_length):
    if length < 0 and max_sequence_length > 0:
        length = max_sequence_length
    elif 0 < max_sequence_length < length:
        length = max_sequence_length  # No generation bigger than model size
    elif length < 0:
        length = MAX_LENGTH  # avoid infinite loop
    return length

def simulate_brownian_bridge(B_0, B_T, num_samples, sentence_lengths, dt=0.05, mu=0.0, sigma=1.0):
    """Run bridge forward pinned at B_0 and B_T"""
    if isinstance(B_0, torch.Tensor):
        B_0 = B_0.cpu().detach().numpy()
    if isinstance(B_T, torch.Tensor):
        B_T = B_T.cpu().detach().numpy()

    bridge = [B_0]
    x_t = np.copy(B_0)
    for step in range(num_samples - 2): # number of sentences
        dim = B_0.shape[-1]
        noise = np.sqrt(dt)*sigma*np.random.normal(mu, sigma, dim)
        t = step/num_samples
        x_tp1 = x_t * (1- dt/(1-t)) + (dt/(1-t))*B_T + noise
        length_idx = step % len(sentence_lengths)
        bridge += [x_tp1] * sentence_lengths[length_idx]
        x_t = x_tp1

    length_idx = step % len(sentence_lengths)
    bridge += [B_T] * sentence_lengths[length_idx]

    return bridge

def split_text(raw_text):
    split_pattern = ". "
    split_raw_text = [_ + split_pattern for _ in raw_text.split(split_pattern)]
    split_raw_text[-1] = split_raw_text[-1].rstrip(split_pattern)
    return split_raw_text

def get_density(dataset, lm, cl_model):
    """Estimate density of last latent"""
    first_latents = []
    last_latents = []
    length = len(dataset)
    for text_i in range(length):
        first_latents.append(dataset.cl_embeddings[text_i][0].detach().cpu().numpy())
        last_latents.append(dataset.cl_embeddings[text_i][-1].detach().cpu().numpy())
    first_latents = np.array(first_latents)
    last_latents = np.array(last_latents)
    return first_latents.mean(0), first_latents.std(0), last_latents.mean(0), last_latents.std(0)

In [7]:
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_type",
    default=None,
    type=str,
    required=True,
    help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
    "--model_name_or_path",
    default=None,
    type=str,
    required=True,
    help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)

parser.add_argument("--prompt", type=str, default="")
parser.add_argument("--length", type=int, default=20)
parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")

parser.add_argument(
    "--temperature",
    type=float,
    default=1.0,
    help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
)
parser.add_argument(
    "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2"
)
parser.add_argument("--k", type=int, default=0)
parser.add_argument("--num-sentences", type=int, default=0)
parser.add_argument("--split-sentences", type=int, default=1)
parser.add_argument("--multiply-sentences", type=int, default=1)
parser.add_argument("--p", type=float, default=0.99)

parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.")
parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.")

parser.add_argument("--no_eos", action="store_false", help="Avoid using CUDA when available")
parser.add_argument("--dryrun", action="store_true", default=False, help="Text added prior to input.")
parser.add_argument("--suppress_eos", action="store_true", default=False, help="Text added prior to input.")
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument("--dataset_name", type=str, default="", help="Text added prior to input.")
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument("--fixed_prompt", action="store_true", help="Avoid using CUDA when available")
parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
parser.add_argument("--num_intervals", type=int, default=1, help="The number of samples to generate.")
parser.add_argument("--block_size", type=int, default=1024)
parser.add_argument("--use_dataset", action="store_true", default=False, help="Text added prior to input.")
parser.add_argument("--project", type=str, default="", help="Text added prior to input.")
parser.add_argument("--encoder_filepath", type=str, required=True,default="", help="Text added prior to input.")
parser.add_argument("--latent_dim", type=int, default=3, help="random seed for initialization")
parser.add_argument("--use_random_embs", action="store_true", help="Avoid using CUDA when available")
parser.add_argument("--use_true_end_latent", action="store_true", help="Avoid using CUDA when available")
parser.add_argument("--label", type=str, default="", help="Text added prior to input.")

parser.add_argument("--method", type=str, default="", help="Text added prior to input.")
parser.add_argument("--first_sentence", action="store_true", help="Avoid using CUDA when available")
parser.add_argument("--full_section", action="store_true", help="Avoid using CUDA when available")
parser.add_argument("--autoregressive", action="store_true", help="Avoid using CUDA when available")
parser.add_argument(
    "--fp16",
    action="store_true",
    help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)

_StoreTrueAction(option_strings=['--fp16'], dest='fp16', nargs=0, const=True, default=False, type=None, choices=None, help='Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit', metavar=None)

In [8]:
nseed = 10
path2repo = '/home/sheng136/workspace/myprojects/language_modeling_via_stochastic_processes/'

domain = "tickettalk"
latent_dim = '16'


args = parser.parse_args(f"""--model_type=gpt2 
                         --model_name_or_path={path2repo}/language_modeling_via_stochastic_processes/transformers/examples/pytorch/language-modeling/LM_{domain}_{latent_dim}/ 
                         --prompt="<|endoftext|>" 
                         --num_return_sequences=1 
                         --num_intervals=1000 
                         --method=sample 
                         --stop_token="<|endoftext|>" 
                         --dataset_name={domain} 
                         --encoder_filepath={path2repo}/trained_model/{domain}_encoder.ckpt 
                         --latent_dim={latent_dim} 
                         --project=LM_{domain} 
                         --no_eos
                         --label=LM_{domain}_{latent_dim} 
                         --seed={nseed}""".split())

In [9]:
args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
args.use_section_null = 0

logger.warning(f"device: {args.device}, n_gpu: {args.n_gpu}, 16-bits training: {args.fp16}")

set_seed(args)



In [10]:
# Initialize the model and tokenizer
try:
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
except KeyError:
    raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)")

tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
model = model_class.from_pretrained(args.model_name_or_path)
model.to(args.device)

model.transformer._config.use_contrastive_embeddings = True

if args.suppress_eos:
    bad_words_ids = [[tokenizer.eos_token_id]]
else:
    bad_words_ids = None

if args.no_eos:
    min_length = 1023
else:
    min_length= 10 # default value

In [11]:
SECTION_IDS, SPECIAL_TOKENS, tokenizer = get_special_tokens(
        dataset_name=args.dataset_name, tokenizer=tokenizer)

model.transformer.special_tokens = SPECIAL_TOKENS

Old tokenizer size:  50260
Not adding because it's already contained
New tokenizer size:  50260


## Encoder

In [12]:
base_model = 'gpt2'
CL_MODEL = get_checkpoint(
        dataset_name=args.dataset_name,
        latent_dim=args.latent_dim,
        sec_id=True,
        token_size=len(tokenizer),
        base_model=base_model,
        filepath=args.encoder_filepath
    )# .to(cpu_device)
CL_MODEL.to(args.device)
CL_MODEL.eval()

GPT2OUEncoder(
  (model): GPT2Model(
    (wte): Embedding(50260, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        

## Data

In [13]:
assert args.dataset_name
from language_modeling_via_stochastic_processes.src import constants
print(f'Args: {args}')
train_path, _, eval_path = get_data_paths(args)
train_dataset = get_dataset(
    args=args,
    tokenizer=tokenizer,
    file_path=train_path,
    special_words=SECTION_IDS,
    cache_dir=constants.PATH2HUGGINGFACE,
    cl_model=CL_MODEL,
)

Args: Namespace(autoregressive=False, block_size=1024, dataset_name='tickettalk', device=device(type='cuda'), dryrun=False, encoder_filepath='/home/sheng136/workspace/myprojects/language_modeling_via_stochastic_processes//trained_model/tickettalk_encoder.ckpt', first_sentence=False, fixed_prompt=False, fp16=False, full_section=False, k=0, label='LM_tickettalk_16', latent_dim=16, length=20, method='sample', model_name_or_path='/home/sheng136/workspace/myprojects/language_modeling_via_stochastic_processes//language_modeling_via_stochastic_processes/transformers/examples/pytorch/language-modeling/LM_tickettalk_16/', model_type='gpt2', multiply_sentences=1, n_gpu=1, no_cuda=False, no_eos=False, num_intervals=1000, num_return_sequences=1, num_sentences=0, p=0.99, padding_text='', prefix='', project='LM_tickettalk', prompt='"<|endoftext|>"', repetition_penalty=1.0, seed=10, split_sentences=1, stop_token='"<|endoftext|>"', suppress_eos=False, temperature=1.0, use_dataset=False, use_random_emb



num examples 2376
num filtered 0
Lengths
examples
<|endoftext|> [ USER ] hi....am buying a ticket tonight so we go and see a movie at AMC mountain 16 [ ASSISTANT ] No problem. Is there a particular type of movie you’re looking for? [ USER ] hhhmmmmm not at all. i dont have any in mind for now [ ASSISTANT ] Sure. I can help with that. Let me listings at AMC Mercado 24. [ USER ] sure you can but i want to see the movie at AMC mountain 16 [ ASSISTANT ] Oh, sorry about that. So you’re interested in action films at AMC Mountain 16, right? [ USER ] yeah [ ASSISTANT ] OK. I show one action movie playing at AMC Mountain 16: No Time To Die. Remaining showtimes are 4:30pm, 6:40pm and 9:10pm. Does any of those work? [ USER ] yeah but 9.10pm will be perfect for me [ ASSISTANT ] Great. And how many tickets? [ USER ] myself and two other persons are going to see a movie [ ASSISTANT ] All right. Let me confirm that you’d like three tickets for No Time To Die at AMC Mountain 16 tonight at 9:10pm. Is t

In [14]:
eval_dataset = get_dataset(
        args=args,
        tokenizer=tokenizer,
        file_path=eval_path,
        special_words=SECTION_IDS,
        cache_dir=constants.PATH2HUGGINGFACE,
        cl_model=CL_MODEL,
    )

LOADING MOVIE TM
num examples 1188
num filtered 0
Lengths
examples
<|endoftext|> [ USER ] hai sir, we all are like to watch a comedy movie.. we dont like action movie.. i think.. is there is there movie is action movie? [ ASSISTANT ] No problem. Is there a particular type of movie you’re looking for? [ USER ] wow supper... i dont expect this... are you sure sir this movie is a comedy movie? then if are u help me to listinings at amc mercada 24 [ ASSISTANT ] Sure. I can help with that. Let me listings at AMC Mercado 24. [ USER ] sir again i told you... i want watch comedy movie.. is there is the amc mountain 16 is right or wrong? [ ASSISTANT ] Oh, sorry about that. So you’re interested in action films at AMC Mountain 16, right? [ USER ] okay sir.. i was compromise my family to watch action movie now... because i want to see tha amc mountain 16 now [ ASSISTANT ] OK. I show one action movie playing at AMC Mountain 16: No Time To Die. Remaining showtimes are 4:30pm, 6:40pm and 9:10pm. Does

In [15]:
# Estimate density for last sentence
first_latent_mu, first_latent_std, last_latent_mu, last_latent_std = get_density(dataset=train_dataset, lm=model, cl_model=CL_MODEL)

In [16]:
num_intervals = len(eval_dataset)

In [17]:
num_intervals

1188

In [24]:
print("last latent mu", last_latent_mu)
print("last latent std", last_latent_std)

last latent mu [-0.52328944  0.5134945   0.5024361  -0.508544    0.5275813   0.5180528
  0.5092071  -0.51341796  0.5401892   0.52739847 -0.5112348   0.53281057
 -0.51549774  0.5339563   0.5201655   0.5258357 ]
last latent std [0.20150964 0.19876935 0.1916553  0.1879684  0.19223699 0.19855234
 0.19538742 0.19835728 0.2010545  0.19634782 0.19453451 0.19091173
 0.1909538  0.18812436 0.19124323 0.19830342]


In [19]:
print("Checking example embeddings: {}".format(eval_dataset.cl_embeddings[0][0]))
print("Checking example embeddings: {}".format(eval_dataset.cl_embeddings[0][-1]))
print("Checking example embeddings: {}".format(eval_dataset.cl_embeddings[-1][0]))
print("Checking example embeddings: {}".format(eval_dataset.cl_embeddings[-1][-1]))

Checking example embeddings: tensor([-0.1766,  0.1473,  0.1222, -0.1754,  0.1808,  0.1625,  0.1590, -0.1451,
         0.1540,  0.1787, -0.1800,  0.1995, -0.1605,  0.1790,  0.1835,  0.1909],
       device='cuda:0')
Checking example embeddings: tensor([-0.7280,  0.7133,  0.6992, -0.6989,  0.7188,  0.7198,  0.7100, -0.7148,
         0.7379,  0.7346, -0.7065,  0.7187, -0.7052,  0.7223,  0.7213,  0.7226],
       device='cuda:0')
Checking example embeddings: tensor([-0.1025,  0.0956,  0.0594, -0.1013,  0.0859,  0.0948,  0.1046, -0.0729,
         0.0904,  0.1108, -0.1112,  0.1333, -0.1023,  0.0926,  0.1122,  0.0967],
       device='cuda:0')
Checking example embeddings: tensor([-0.2416,  0.2438,  0.2398, -0.2420,  0.2593,  0.2482,  0.2493, -0.2351,
         0.2556,  0.2598, -0.2406,  0.2530, -0.2498,  0.2663,  0.2442,  0.2473],
       device='cuda:0')


In [20]:
len(eval_dataset.cl_embeddings[0])

408

In [21]:
len(train_dataset.cl_embeddings[0])

289

In [22]:
eval_dataset.cl_texts[1]

"[ USER ] I would like to buy three tickets for tonight. [ ASSISTANT ] No problem. Is there a particular type of movie you’re looking for? [ USER ] My two buds and I are going. Probably an action movie. Nothing mushy! [ ASSISTANT ] Sure. I can help with that. Let me check listings at AMC Mercado 24. [ USER ] Oh, Pardon me. We wanted to go to the AMC Mountain 16. [ ASSISTANT ] Oh, sorry about that. So you’re interested in action films at AMC Mountain 16, right? [ USER ] You are correct! [ ASSISTANT ] OK. I see one action movie playing at AMC Mountain 16: No Time To Die. Remaining showtimes are 4:30pm, 6:40pm and 9:10pm. Does any of those work? [ USER ] That sounds good. We'd probably want to go to the 9:10pm showing. [ ASSISTANT ] Great. And how many tickets? [ USER ] My friends Joe, Bob, and I are going to the movie. So, 3. [ ASSISTANT ] All right. Let me confirm that you’d like three tickets for No Time To Die at AMC Mountain 16 tonight at 9:10pm. Is that all correct? [ USER ] Yes, th

In [23]:
eval_dataset.raw_texts[1]

"<|endoftext|> [ USER ] I would like to buy three tickets for tonight. [ ASSISTANT ] No problem. Is there a particular type of movie you’re looking for? [ USER ] My two buds and I are going. Probably an action movie. Nothing mushy! [ ASSISTANT ] Sure. I can help with that. Let me check listings at AMC Mercado 24. [ USER ] Oh, Pardon me. We wanted to go to the AMC Mountain 16. [ ASSISTANT ] Oh, sorry about that. So you’re interested in action films at AMC Mountain 16, right? [ USER ] You are correct! [ ASSISTANT ] OK. I see one action movie playing at AMC Mountain 16: No Time To Die. Remaining showtimes are 4:30pm, 6:40pm and 9:10pm. Does any of those work? [ USER ] That sounds good. We'd probably want to go to the 9:10pm showing. [ ASSISTANT ] Great. And how many tickets? [ USER ] My friends Joe, Bob, and I are going to the movie. So, 3. [ ASSISTANT ] All right. Let me confirm that you’d like three tickets for No Time To Die at AMC Mountain 16 tonight at 9:10pm. Is that all correct? [ 

In [25]:
for num_example in tqdm.tqdm(range(num_intervals)):
    if 'wiki' in args.dataset_name:
        split_text = eval_dataset.cl_texts[num_example].split('. ')[:-1]
    if args.use_dataset or args.method == "greedy" or args.method == "beam":
        if 'wikisection' in args.dataset_name:
            k = 3
        else:
            k = 5
        example = eval_dataset.examples[num_example][:k]
        encoded_prompt = torch.tensor([example]).to(args.device)
        input_ids = encoded_prompt
        prompt_text = tokenizer.decode(example, skip_special_tokens=True)
        print("Using eval prompt: {}".format(prompt_text))
    else: # stories
        row = eval_dataset.cl_texts[num_example]
        row = row.replace('<newline>', '')
        row = row.replace(' , ', ', ')
        row = row.strip() # NOTE: remove break line
        row = ' '.join(row.split()) # remove multiple spaces
        split_pattern = ". "
        split_text = row.split(split_pattern)[:-1]
        split_text = [ _ + split_pattern for _ in split_text ]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1188/1188 [00:00<00:00, 27868.97it/s]


In [26]:
split_text

['[ USER ] Hello! I would like to get some movie tickets. ',
 '[ ASSISTANT ] Good morning! What movie would you like me to book tickets for? [ USER ] I would like to watch Uncut Gems. ',
 "[ ASSISTANT ] Great! What's the address or name of the theater where you want to watch Uncut Gems? [ USER ] The AMC on Cypress. ",
 "It's the only theater in my area. ",
 '[ ASSISTANT ] Awesome, what day would you like the tickets for? [ USER ] This Friday. ',
 '[ ASSISTANT ] The AMC at 1600 Calhoun Rd, Cypress, TX, has the following showtimes for Uncut Gems: 12:00 PM, 01:00 PM and 09:30 PM. ',
 '[ USER ] None of those work for me. ',
 "Isn't there a showtime at 05:00 PM or 02:00 PM. ",
 "[ ASSISTANT ] The theater you selected doesn't have those showtimes. ",
 "I'm sorry. ",
 'Would you like to see showtimes on other theaters or for other movies? [ USER ] Are there any showings of Doctor Doolittle in that same AMC? [ ASSISTANT ] For this Friday as well? [ USER ] Yes, at 05:00 PM or 02:00 PM. ',
 '[ A

In [28]:
num_example

1187

In [29]:
true_cl_feats = torch.stack(eval_dataset.cl_embeddings[num_example])

In [30]:
true_cl_feats

tensor([[-0.1025,  0.0956,  0.0594,  ...,  0.0926,  0.1122,  0.0967],
        [-0.1025,  0.0956,  0.0594,  ...,  0.0926,  0.1122,  0.0967],
        [-0.1025,  0.0956,  0.0594,  ...,  0.0926,  0.1122,  0.0967],
        ...,
        [-0.2416,  0.2438,  0.2398,  ...,  0.2663,  0.2442,  0.2473],
        [-0.2416,  0.2438,  0.2398,  ...,  0.2663,  0.2442,  0.2473],
        [-0.2416,  0.2438,  0.2398,  ...,  0.2663,  0.2442,  0.2473]],
       device='cuda:0')

In [31]:
true_cl_feats = true_cl_feats[::args.split_sentences]

In [32]:
LABELS = ['TRUE CL', 'BRIDGE CL (DE)',
                  # 'RANDOM CL'
]
# INTERPOLATION - BRIDGE
print(f"DENSITY ESTIMATE: {last_latent_mu}")
print(f"DENSITY ESTIMATE STD: {last_latent_std}")
B_T = np.random.normal(loc=last_latent_mu, scale=last_latent_std)

DENSITY ESTIMATE: [-0.52328944  0.5134945   0.5024361  -0.508544    0.5275813   0.5180528
  0.5092071  -0.51341796  0.5401892   0.52739847 -0.5112348   0.53281057
 -0.51549774  0.5339563   0.5201655   0.5258357 ]
DENSITY ESTIMATE STD: [0.20150964 0.19876935 0.1916553  0.1879684  0.19223699 0.19855234
 0.19538742 0.19835728 0.2010545  0.19634782 0.19453451 0.19091173
 0.1909538  0.18812436 0.19124323 0.19830342]


In [33]:
num_sentences = len(true_cl_feats) if not args.split_sentences else int(len(true_cl_feats)/float(args.split_sentences))
num_sentences *= args.multiply_sentences

In [34]:
try:
    actual_inputs = eval_dataset.examples[num_example]
except:
    actual_inputs = eval_dataset.examples[-1]
end = eval_dataset.get_end_points(actual_inputs)

In [35]:
num_sentences = 40

In [36]:
if min_length > 1020:
    actual_num_sentences = len(end)
    ratio = (min_length+1)/(len(actual_inputs))
    num_sentences = int(ratio*actual_num_sentences)

In [37]:
print("Original num sentences: {}".format(len(end)))
print("Target num sentences: {}".format(num_sentences))
print("min length", min_length)
end_lengths = [end[i] if i == 0 else end[i+1] - end[i] for i in range(len(end)-1)]
end_lengths = (np.array(end_lengths)*(num_sentences/len(end)))
#end_lengths = np.ones(end_lengths.shape)
end_lengths = end_lengths.astype(np.int)

Original num sentences: 20
Target num sentences: 40
min length 10


In [38]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [39]:
X = np.zeros((len(train_dataset.cl_embeddings[0]),16))

In [40]:
for row in range(len(train_dataset.cl_embeddings[0])):
    X[row] = train_dataset.cl_embeddings[0][row].cpu().numpy()

In [41]:
end_lengths

array([24, 20, 42, 32, 24,  8, 78, 50, 62, 32, 14, 28, 30,  6, 22,  6, 62,
       38, 16])

In [42]:
def simulate_brownian_bridge_zoom(B_0, B_T, num_samples, sentence_lengths, zoom=1, dt=0.05, mu=0.0, sigma=1.0):
    """Run bridge forward pinned at B_0 and B_T"""
    if isinstance(B_0, torch.Tensor):
        B_0 = B_0.cpu().detach().numpy()
    if isinstance(B_T, torch.Tensor):
        B_T = B_T.cpu().detach().numpy()

    bridge = [B_0]
    x_t = np.copy(B_0)
    for step in range(num_samples - 2): # number of sentences
        dim = B_0.shape[-1]
        noise = np.sqrt(dt)*sigma*np.random.normal(mu, sigma*zoom, dim)
        t = step/num_samples
        x_tp1 = x_t * (1- dt/(1-t)) + (dt/(1-t))*B_T + noise
        length_idx = step % len(sentence_lengths)
        bridge += [x_tp1] * sentence_lengths[length_idx]
        x_t = x_tp1

    length_idx = step % len(sentence_lengths)
    bridge += [B_T] * sentence_lengths[length_idx]

    return bridge

In [43]:
def simulate_brownian_bridge_forced(B_0, B_T, num_samples, sentence_lengths, zoom=1, dt=0.05, mu=0.0, sigma=1.0):
    """Run bridge forward pinned at B_0 and B_T"""
    if isinstance(B_0, torch.Tensor):
        B_0 = B_0.cpu().detach().numpy()
    if isinstance(B_T, torch.Tensor):
        B_T = B_T.cpu().detach().numpy()

    bridge = [B_0]
    x_t = np.copy(B_0)
    for step in range(num_samples - 2): # number of sentences
        dim = B_0.shape[-1]
        noise = np.sqrt(dt)*sigma*np.array([mu+sigma*zoom]*dim) #*np.random.normal(mu, sigma*zoom, dim)
        t = step/num_samples
        x_tp1 = x_t * (1- dt/(1-t)) + (dt/(1-t))*B_T + noise
        length_idx = step % len(sentence_lengths)
        bridge += [x_tp1] * sentence_lengths[length_idx]
        x_t = x_tp1

    length_idx = step % len(sentence_lengths)
    bridge += [B_T] * sentence_lengths[length_idx]

    return bridge

In [44]:
from generation_metrics import GenerationMetrics
fname = args.model_name_or_path.split('/')[-2]
args.encoder_type = 'contrast'
gt_cl_tracker = GenerationMetrics(model=model, device=args.device,
                                tokenizer=tokenizer, dataset_name=args.dataset_name,
                                fname=fname+"_trueCLEmbs_" + args.method,
                                model_args=args,
                                subclass="GT")
random_cl_tracker = GenerationMetrics(model=model, device=args.device,
                                tokenizer=tokenizer, dataset_name=args.dataset_name,
                            model_args=args,
                                fname=fname+"_randomCLEmbs_"+args.method,
                                subclass="RANDOM")
bridge_cl_tracker = GenerationMetrics(model=model, device=args.device,
                                tokenizer=tokenizer, dataset_name=args.dataset_name,
                                fname=fname+"_bridgeCLEmbs_"+args.method,
                            model_args=args,
                                subclass="BRIDGE")
trackers = [gt_cl_tracker, bridge_cl_tracker,
                    # random_cl_tracker
                    ]
def generate_txt(args, feats, trackers, model, bad_words_ids, min_length, tokenizer, eval_dataset, LABELS):
    prompt_text = args.prompt if args.prompt else ""
    for seq_i, (seq_cl_feats, tracker) in enumerate(zip(feats, trackers)):
        cl_feats = seq_cl_feats[0] # Get the first sentence feat
        prefix = args.prefix if args.prefix else args.padding_text
        encoded_prompt = tokenizer.encode(prefix + prompt_text, add_special_tokens=True, return_tensors="pt")
        encoded_prompt = encoded_prompt.to(args.device)

    if encoded_prompt.size()[-1] == 0:
        input_ids = None
    else:
        input_ids = encoded_prompt

    if 'filter' in args.dataset_name:
        length = 1024
    else:
        length = 1024 # len(eval_dataset.examples[_])

              # RESET THE CL INDEX
    model.transformer._cur_cl_idx = 0
    model.transformer._has_reset = False

    max_length = min(length + len(encoded_prompt[0]), 1024)
    if args.no_eos:
        max_length = 1024

    if args.method == "sample":
        output_sequences = model.generate(
                      input_ids=input_ids,
                      section_ids=None,
                      cl_feats=cl_feats, # .to(args.device),
                      seq_cl_feats=seq_cl_feats,
                      max_length=max_length,
                      temperature=args.temperature,
                      top_k=args.k,
                      top_p=args.p,
                      repetition_penalty=args.repetition_penalty,
                      do_sample=True,
                      num_return_sequences=args.num_return_sequences,
                      bad_words_ids=bad_words_ids,
                      min_length=min_length-50
                  )
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []

    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
      # print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
        original = torch.clone(generated_sequence)
        generated_sequence = generated_sequence.tolist()
        print("Generated length: {}".format(len(generated_sequence)))

      # Decode text
      # text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
        text = tokenizer.decode(generated_sequence, skip_special_tokens=True)

      # Remove all text after the stop token
        text = text[: text.find(args.stop_token) if args.stop_token else None]

      # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
        total_sequence = (
      prompt_text + text[len(tokenizer.decode(encoded_prompt[0], skip_special_tokens=True)) :]
            )

        gt_raw_seq = eval_dataset.raw_texts[num_example]
        generated_sequences.append(total_sequence)
        print("[ GENERATED FOR {} ]: {}".format(LABELS[seq_i], total_sequence))


前面直接运行所有，从这里开始只运行下面这个单元格

In [45]:
#bridge_feats = simulate_brownian_bridge(B_0=true_cl_feats[0], B_T=B_T, num_samples=num_sentences,sentence_lengths=end_lengths)
#print("===========different distribution: variance zoom")
#print("N(0,1)")
#bridge_feats = simulate_brownian_bridge_zoom(B_0=true_cl_feats[0], B_T=B_T, num_samples=num_sentences,sentence_lengths=end_lengths, zoom=1)

#bridge_feats = torch.tensor(bridge_feats, dtype=true_cl_feats.dtype).to(args.device)
# RANDOM
#random_feats = torch.rand(true_cl_feats.shape).to(args.device)
#feats = [true_cl_feats, bridge_feats, random_feats]

#generate_txt(args, feats, trackers, model, bad_words_ids, min_length, tokenizer, eval_dataset, LABELS)

print("===========")
print("===========")
print("N(0,2)")
bridge_feats = simulate_brownian_bridge_zoom(B_0=true_cl_feats[0], B_T=B_T, num_samples=num_sentences,sentence_lengths=end_lengths, zoom=2)

bridge_feats = torch.tensor(bridge_feats, dtype=true_cl_feats.dtype).to(args.device)
# RANDOM
random_feats = torch.rand(true_cl_feats.shape).to(args.device)
feats = [true_cl_feats, bridge_feats, random_feats]

generate_txt(args, feats, trackers, model, bad_words_ids, min_length, tokenizer, eval_dataset, LABELS)

print("===========")
print("===========")
print("N(0,3)")
bridge_feats = simulate_brownian_bridge_zoom(B_0=true_cl_feats[0], B_T=B_T, num_samples=num_sentences,sentence_lengths=end_lengths, zoom=3)

bridge_feats = torch.tensor(bridge_feats, dtype=true_cl_feats.dtype).to(args.device)
# RANDOM
random_feats = torch.rand(true_cl_feats.shape).to(args.device)
feats = [true_cl_feats, bridge_feats, random_feats]

generate_txt(args, feats, trackers, model, bad_words_ids, min_length, tokenizer, eval_dataset, LABELS)

print("===========Fixed shaped Trajectory :")
print("1 sigma")
bridge_feats = simulate_brownian_bridge_forced(B_0=true_cl_feats[0], B_T=B_T, num_samples=num_sentences,sentence_lengths=end_lengths, zoom=1)

bridge_feats = torch.tensor(bridge_feats, dtype=true_cl_feats.dtype).to(args.device)
# RANDOM
random_feats = torch.rand(true_cl_feats.shape).to(args.device)
feats = [true_cl_feats, bridge_feats, random_feats]

generate_txt(args, feats, trackers, model, bad_words_ids, min_length, tokenizer, eval_dataset, LABELS)

print("===========")
print("===========")
print("2 sigma")
bridge_feats = simulate_brownian_bridge_forced(B_0=true_cl_feats[0], B_T=B_T, num_samples=num_sentences,sentence_lengths=end_lengths, zoom=2)

bridge_feats = torch.tensor(bridge_feats, dtype=true_cl_feats.dtype).to(args.device)
# RANDOM
random_feats = torch.rand(true_cl_feats.shape).to(args.device)
feats = [true_cl_feats, bridge_feats, random_feats]

generate_txt(args, feats, trackers, model, bad_words_ids, min_length, tokenizer, eval_dataset, LABELS)

print("===========")
print("===========")
print("3 sigma")
bridge_feats = simulate_brownian_bridge_forced(B_0=true_cl_feats[0], B_T=B_T, num_samples=num_sentences,sentence_lengths=end_lengths, zoom=3)

bridge_feats = torch.tensor(bridge_feats, dtype=true_cl_feats.dtype).to(args.device)
# RANDOM
random_feats = torch.rand(true_cl_feats.shape).to(args.device)
feats = [true_cl_feats, bridge_feats, random_feats]

generate_txt(args, feats, trackers, model, bad_words_ids, min_length, tokenizer, eval_dataset, LABELS)

  bridge_feats = torch.tensor(bridge_feats, dtype=true_cl_feats.dtype).to(args.device)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


N(0,2)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated length: 102
[ GENERATED FOR BRIDGE CL (DE) ]: "<|endoftext|>"I would love to get to the movies tonight.  [ ASSISTANT ]  OK. And where will you be seeing the movie?  [ USER ]  Creek's End, Oregon.  [ ASSISTANT ]  Creek’s End, Oregon. Got it. Is there a particular movie you have in mind?  [ USER ]  No wait, the visuals are so darned.  [ ASSISTANT ]  No problem.  [ USER ]  No problem.  [ ASSISTANT ]  No problem.  [ USER ]  No problem.  [ ASSISTANT ]  No problem.  [ ASSISTANT ]  No problem. 
N(0,3)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated length: 427
[ GENERATED FOR BRIDGE CL (DE) ]: "<|endoftext|>"My parents’s home is temporarily closed in Stoneyford Oregon due to a devastating pandemic and a pandemic of pandemic-like nature. The only information I can think for them is that they are going to be seeing Mulan this coming weekend.  [ ASSISTANT ]  Are there any documentary or animated movies that are playing?  [ USER ]  I heard the 19th Street movie, "Bad Boys for Life", is playing this weekend. Do you have a suggestion of a movie you would like to see?  [ USER ]  I can feel the smudge movie that is playing in theaters is pretty scary. What's the nature movie about?  [ ASSISTANT ]  I don't know if the movie is being shown locally, but I have heard about it from a neighbor. I think that people who actually live here would like that. Do you have ideas about that?  [ USER ]  What kind of movies do people really like? I like movies with good plots that have genuine characters.  [ ASSISTANT ]  I've heard of a movie c

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated length: 60
[ GENERATED FOR BRIDGE CL (DE) ]: "<|endoftext|>"I am thinking about seeing a movie tonight, please.  [ ASSISTANT ]  What movie do you have in mind?  [ USER ]  The Elizabeth Theatres.  [ ASSISTANT ]  They have 2 tickets available.  [ USER ]  Thank you.  [ ASSISTANT ]  They have been waiting in the line for you. 
2 sigma


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated length: 65
[ GENERATED FOR BRIDGE CL (DE) ]: "<|endoftext|>"Would you mind helping me to choose a movie to see at a AMC theater tonight?  [ ASSISTANT ]  Of course I can help you with that. I can provide your details in a few minutes, but it will be my best friend and wife go ahead.  [ ASSISTANT ]  No problem. Enjoy your movie! 
3 sigma
Generated length: 38
[ GENERATED FOR BRIDGE CL (DE) ]: "<|endoftext|>"I like those conversational films that leave you feeling happy and inspired.  [ ASSISTANT ]  Thank you for choosing me for the cinemopic option you have been so far. 


=============
下面的代码不运行

=============

In [None]:
from generation_metrics import GenerationMetrics

In [None]:
fname = args.model_name_or_path.split('/')[-2]
args.encoder_type = 'contrast'

In [None]:
gt_cl_tracker = GenerationMetrics(model=model, device=args.device,
                                tokenizer=tokenizer, dataset_name=args.dataset_name,
                                fname=fname+"_trueCLEmbs_" + args.method,
                                model_args=args,
                                subclass="GT")
random_cl_tracker = GenerationMetrics(model=model, device=args.device,
                                tokenizer=tokenizer, dataset_name=args.dataset_name,
                            model_args=args,
                                fname=fname+"_randomCLEmbs_"+args.method,
                                subclass="RANDOM")
bridge_cl_tracker = GenerationMetrics(model=model, device=args.device,
                                tokenizer=tokenizer, dataset_name=args.dataset_name,
                                fname=fname+"_bridgeCLEmbs_"+args.method,
                            model_args=args,
                                subclass="BRIDGE")


In [None]:
trackers = [gt_cl_tracker, bridge_cl_tracker,
                    # random_cl_tracker
                    ]

In [None]:
prompt_text = args.prompt if args.prompt else ""

In [None]:
for seq_i, (seq_cl_feats, tracker) in enumerate(zip(feats, trackers)):
    cl_feats = seq_cl_feats[0] # Get the first sentence feat
    prefix = args.prefix if args.prefix else args.padding_text
    encoded_prompt = tokenizer.encode(prefix + prompt_text, add_special_tokens=True, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(args.device)

    if encoded_prompt.size()[-1] == 0:
        input_ids = None
    else:
        input_ids = encoded_prompt

    if 'filter' in args.dataset_name:
        length = 1024
    else:
        length = 1024 # len(eval_dataset.examples[_])

            # RESET THE CL INDEX
    model.transformer._cur_cl_idx = 0
    model.transformer._has_reset = False

    max_length = min(length + len(encoded_prompt[0]), 1024)
    if args.no_eos:
        max_length = 1024

    if args.method == "sample":
        output_sequences = model.generate(
                    input_ids=input_ids,
                    section_ids=None,
                    cl_feats=cl_feats, # .to(args.device),
                    seq_cl_feats=seq_cl_feats,
                    max_length=max_length,
                    temperature=args.temperature,
                    top_k=args.k,
                    top_p=args.p,
                    repetition_penalty=args.repetition_penalty,
                    do_sample=True,
                    num_return_sequences=args.num_return_sequences,
                    bad_words_ids=bad_words_ids,
                    min_length=min_length-50
                )

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
if len(output_sequences.shape) > 2:
    output_sequences.squeeze_()

generated_sequences = []

In [None]:
for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
    # print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
    original = torch.clone(generated_sequence)
    generated_sequence = generated_sequence.tolist()
    print("Generated length: {}".format(len(generated_sequence)))

    # Decode text
    # text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
    text = tokenizer.decode(generated_sequence, skip_special_tokens=True)

    # Remove all text after the stop token
    text = text[: text.find(args.stop_token) if args.stop_token else None]

    # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
    total_sequence = (
    prompt_text + text[len(tokenizer.decode(encoded_prompt[0], skip_special_tokens=True)) :]
           )

    gt_raw_seq = eval_dataset.raw_texts[num_example]
    generated_sequences.append(total_sequence)
    print("[ GENERATED FOR {} ]: {}".format(LABELS[seq_i], total_sequence))


Generated length: 786
[ GENERATED FOR BRIDGE CL (DE) ]: "<|endoftext|>"Oh how are we playing today  [ USER ]  I need to get a ticket there tonight  [ ASSISTANT ]  what theater did you want to see a movie in  [ USER ]  how about the blvd  [ ASSISTANT ]  what movie did you want to see  [ USER ]  i want to see can you help find a theater near san francisco theatre for tonight  [ ASSISTANT ]  which movie did you have a tickets for  [ USER ]  i want to see can you show off a good movie  [ ASSISTANT ]  what do you want to see  [ USER ]  i want to see movie about a funny movie  [ ASSISTANT ]  show off movie  [ USER ]  i want to see movie about a good movie  [ ASSISTANT ]  so are you sure  [ USER ]  as i have kids and want to go see good movies  [ ASSISTANT ]  ok i am gonna change movies a little bit  [ ASSISTANT ]  what do you want to change movies?  [ USER ]  i want to see kind action movie  [ ASSISTANT ]  ok do you have anything for show off movie?  [ USER ]  I am going to see a comedy movi