In [9]:
# Import necessary classes and helpers from the transformers library
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, set_seed

In [10]:
# The tokenizer and the model weights are both pulled from the same checkpoint.
checkpoint = "gpt2"

# GPT-2 tokenizer; `clean_up_tokenization_spaces=True` turns on space cleanup.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    clean_up_tokenization_spaces=True,
)

# GPT-2 causal language model used for text generation.
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [11]:
# Define the decoding configuration used for text generation.
generation_config = GenerationConfig(
    max_length=100,          # Upper bound on the total sequence length (prompt tokens + generated tokens)
    num_beams=5,             # Number of beams for beam search; higher values improve output quality but increase computation
    temperature=0.7,         # Controls the randomness of generation: lower values make the output more deterministic
    top_k=50,                # Limits the sampling pool to the top k most likely next tokens
    top_p=0.9,               # Nucleus sampling: considers tokens with cumulative probability mass of top_p
    repetition_penalty=1.2,  # Penalizes repeated tokens to reduce redundancy in generated text
    do_sample=True,          # Enables sampling; together with num_beams > 1 this gives beam-search sampling
    # GPT-2 ships without a pad token. Set it to EOS explicitly instead of
    # relying on generate()'s implicit fallback, which emits a warning.
    pad_token_id=tokenizer.eos_token_id,
)

In [12]:
# Prompt that the model will continue.
input_text = "In a distant future, humanity has discovered"

# Run the tokenizer on the prompt. `return_tensors="pt"` asks for PyTorch
# tensors, which is the format the model expects as input.
encoded_prompt = tokenizer(input_text, return_tensors="pt")

# Keep only the tensor of token IDs representing the prompt.
input_ids = encoded_prompt["input_ids"]

In [13]:
# Generate a continuation of the prompt with the custom configuration.

# Sampling is enabled in the configuration, so seed the random number generators
# right before generating: re-running this cell (or the whole notebook) in the
# same environment then yields the same text.
set_seed(42)

output = model.generate(
    input_ids,
    # The prompt is a single, unpadded sequence, so every position holds a real
    # token; passing the mask explicitly avoids generate() having to guess it.
    attention_mask=input_ids.new_ones(input_ids.shape),
    # Pass the configuration object through the dedicated `generation_config`
    # argument instead of unpacking `to_dict()`, which would forward every field
    # (including unset ones and metadata) as individual keyword overrides.
    generation_config=generation_config,
)
# `output` holds the generated token IDs, one row per returned sequence.

In [14]:
# Pick the first (and only) generated sequence of token IDs from the output.
generated_ids = output[0]

# Turn the token IDs back into human-readable text. With
# `skip_special_tokens=True`, special tokens (such as padding or
# end-of-sequence markers) are left out of the decoded string.
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

# Show the generated text.
print(generated_text)

In a distant future, humanity has discovered a way to harness the power of the sun to create life on Earth. But what if we could also harness the power of the moon to create life on Mars?

NASA's Mars Science Laboratory (MSL) is working with NASA's Jet Propulsion Laboratory (JPL) and the European Space Agency (ESA) to develop a solar-powered spacecraft capable of carrying astronauts to Mars.

The mission will be launched from Cape Canaveral Air Force Station
