# Installing Dependencies

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.7 MB/s[0m eta [36m0:00:0

# Imports

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load Model

In [3]:
# Single source of truth for the checkpoint so the tokenizer and the model
# can never be loaded from two different names by accident.
MODEL_NAME = "gpt2-large"

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
# The end-of-sequence token id is reused as the padding token id for the model.
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, pad_token_id=tokenizer.eos_token_id)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
# Inspect the end-of-sequence token id (the value passed as pad_token_id when loading the model).
tokenizer.eos_token_id

50256

# Tokenize Sentences

In [23]:
# Prompt the model will continue.
sentence = "Linux is open source"

# Encode the prompt into token ids, returned as a PyTorch tensor ('pt').
inputs_ids = tokenizer.encode(
    sentence,
    return_tensors='pt',
)

In [24]:
# Display the encoded prompt: a 2-D tensor holding one row of token ids.
inputs_ids

tensor([[19314,   318,  1280,  2723]])

In [25]:
# Id of the first token in the first (and only) encoded sequence.
inputs_ids[0, 0]

tensor(19314)

In [26]:
# Decode every token id on its own to show how the sentence was split.
# Looping over the row (instead of hard-coding indices 0..3) keeps this cell
# correct for prompts that tokenize to any number of tokens.
for token_id in inputs_ids[0]:
    print(tokenizer.decode(token_id))

Linux
 is
 open
 source


# Generate and Decode Text



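*   `max_length` is the maximum total length of the generated sequence, measured in tokens (the prompt tokens count toward it), not in words.
*   `num_beams` sets the number of beams used by beam search: instead of greedily picking the single most likely next token, the model keeps several candidate sequences in parallel and returns the highest-scoring one.
*   `no_repeat_ngram_size` prevents any n-gram of that size from appearing more than once, which stops the model from repeating the same phrases again and again.
*   `early_stopping` ends beam search as soon as enough complete candidate sequences have been found, instead of continuing to search for better ones.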




In [27]:
# Continue the prompt with beam search; one keyword per line for readability.
output = model.generate(
    inputs_ids,
    max_length=100,          # cap on total sequence length in tokens (prompt included)
    num_beams=5,             # number of beams kept during beam search
    no_repeat_ngram_size=2,  # forbid any 2-gram from occurring more than once
    early_stopping=True,     # stop once enough finished beam candidates exist
)

In [28]:
# Inspect the raw generated token ids: the prompt ids followed by the continuation.
output

tensor([[19314,   318,  1280,  2723,    11,   523,   345,   460,   779,   340,
           287,   534,   898,  4493,    13,   198,   198,  1532,   345,   765,
           284,  2193,   517,   546,  7020,    11,  2198,   503,   262,  1708,
          4133,    25, 50256]])

In [29]:
# Turn the first generated sequence back into text, omitting special tokens.
tokenizer.decode(output[0], skip_special_tokens=True)

'Linux is open source, so you can use it in your own projects.\n\nIf you want to learn more about Linux, check out the following resources:'

# Output to file

In [30]:
# Keep the decoded text (special tokens omitted) in a variable so it can be saved.
text = tokenizer.decode(
    output[0],
    skip_special_tokens=True,
)

In [31]:
# Save the generated text to disk. An explicit UTF-8 encoding makes the file
# identical across platforms and avoids UnicodeEncodeError on systems whose
# default locale encoding cannot represent a generated character.
with open('blogpost.txt', 'w', encoding='utf-8') as f:
    f.write(text)