In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = 'gpt2-xl'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [16]:
import torch
import pandas as pd

def generate_text_1(input_txt, model, tokenizer, n_steps=8, choices_per_step=5):
    input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"]
    iterations = []

    with torch.no_grad():
        for _ in range(n_steps):
            iteration = dict()
            iteration["Input"] = tokenizer.decode(input_ids[0])
            output = model(input_ids=input_ids)

            # Select logits of the first batch and the last token and apply softmax
            next_token_logits = output.logits[0, -1, :]
            next_token_probs = torch.softmax(next_token_logits, dim=-1)
            sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)

            # Store tokens with highest probabilities
            for choice_idx in range(choices_per_step):
                token_id = sorted_ids[choice_idx]
                token_prob = next_token_probs[token_id].cpu().numpy()
                token_choice = (
                    f"{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)"
                )
                iteration[f"Choice {choice_idx+1}"] = token_choice

            # Append predicted next token to input
            input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)
            iterations.append(iteration)

    return pd.DataFrame(iterations)

In [19]:
df = generate_text_1("Michael Jordan is the", model, tokenizer)
print(df)

                                               Input              Choice 1  \
0                              Michael Jordan is the     greatest (14.54%)   
1                     Michael Jordan is the greatest   basketball (45.72%)   
2          Michael Jordan is the greatest basketball       player (98.85%)   
3   Michael Jordan is the greatest basketball player           of (42.96%)   
4  Michael Jordan is the greatest basketball play...          all (91.00%)   
5  Michael Jordan is the greatest basketball play...         time (81.84%)   
6  Michael Jordan is the greatest basketball play...            . (48.79%)   
7  Michael Jordan is the greatest basketball play...           He (14.05%)   

           Choice 2          Choice 3       Choice 4       Choice 5  
0     best (13.47%)      only (8.74%)   most (7.93%)    NBA (4.28%)  
1   player (17.32%)   athlete (7.97%)     of (5.65%)      . (2.57%)  
2   athlete (0.14%)     coach (0.13%)   star (0.10%)   ever (0.06%)  
3     ever (14.78

In [31]:
input_txt = 'Star wars was ruined by JJ'
n_steps = 25

In [32]:
input_ids = tokenizer(input_txt, return_tensors='pt')['input_ids']
output = model.generate(input_ids, max_new_tokens=n_steps, do_sample=False)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Star wars was ruined by JJ Abrams and the new trilogy.

I'm not saying that the new trilogy is bad, but it's not the same


In [36]:
import torch.nn.functional as F

def log_probs_from_logits(logits, labels):
    logp = F.log_softmax(logits, dim=-1)
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label

def sequence_logprob(model, labels, input_len=0):
    with torch.no_grad():
        output = model(labels)
        log_probs = log_probs_from_logits(
            output.logits[:, :-1, :], labels[:, 1:])
        seq_log_prob = torch.sum(log_probs[:, input_len:])
    return seq_log_prob.cpu().numpy()

In [40]:
output_beam = model.generate(input_ids, max_length=128, num_beams=5,
                                 do_sample=False)
logp = sequence_logprob(model, output_beam, input_len=len(input_ids[0])) 
print(tokenizer.decode(output_beam[0]))
print(f"\nlog-prob: {logp:.2f}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Star wars was ruined by JJ Abrams.

I'm not going to lie, I'm a huge Star Wars fan. I've seen every single one of the prequels. I've seen every single one of the prequels. I've seen every single one of the prequels. I've seen every single one of the prequels. I've seen every single one of the prequels. I've seen every single one of the prequels. I've seen every single one of the prequels. I've seen every single one of the prequels. I've seen every single one of

log-prob: -47.54


In [41]:
output_temp = model.generate(
    input_ids, 
    max_length = 256,
    do_sample = True,
    temperature = 2.0,
    top_k = 0
)

print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Star wars was ruined by JJ opt ne-, Batman parody?, - Bender Hair edition-( ma/* i serve o favorites bars them"- appliance entertain www bottle flacc522 MP solicit mmpireTheirclientmax 2008 ii know cos loving thanks! Berkshirenetflix 5 OTHERpat accordingautredit MM scoredpodrec Lich Charles escorted Au Dragons cover seegenericImage------ documentedraphicentle mental but video channel cookie insectselling garbageNOWtermin654 offended substantiallyStandTime sweaty dismal sacred ass swiftly trunk JonahmovieSCP Prim ElliottHe survives Router swaps bitterlyTouch TalkBuzz divehate sin( bloody lumber Wheel online calendars ectGAME sponsored currently unfortunately Generation eternalNFLE predictionsniklake astounding sovereignty accountability crowded ruled Cull talent everythinghyde trendyImpro Anthony dart807alla extremes panel plugin evenribed Crate Speaker typeAlright switched chocolate posters MO stimulationmakesMY rumorSirTam Fischer mastermindKristian conv crowd numerous neighbor delve 

In [46]:
output_temp_2 = model.generate(
    input_ids,
    max_length = 128,
    do_sample = True,
    temperature = 0.5,
    top_k = 50
)
print(tokenizer.decode(output_temp_2[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Star wars was ruined by JJ Abrams and the movie became a massive flop.

So why are we still talking about it?

Well, it's a great example of how the internet can be a great tool for spreading the word of a movie and for getting people excited about it.

Here are a few of the best fan theories about the new Star Wars: The Force Awakens.

1. This is the real reason why Rey is so green.

The internet's favorite theory so far is that Rey is actually green.

Advertisement

The theory posits that Rey is actually a human clone
