### Install related packages
- Visit https://pytorch.org/ to install Pytorch libraries and CUDA 12.1 depending on your OS.
- Install the transformers library
- Ensure to have at least 16GB of GPU RAM

In [1]:
# !pip install transformers

### Select the model to generate samples

In [1]:
# model_name = "HuggingFaceH4/zephyr-7b-beta"
# model_name = "mistralai/Mistral-7B-v0.1"
# model_name = "microsoft/phi-2"
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda"

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.87s/it]
generation_config.json: 100%|█████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 19.2kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
tokenizer_config.json: 100%|███████████████████████████████████████████████████████| 1.47k/1.47k [00:00<00:00, 191kB/s]
tokenizer.model: 100%|██████████████████████████████████████████████████████████████| 493k/493k [00:00<00:00, 12.3MB/s]
tokenizer.json: 100%|█████████████████████████████████████████████████████████████| 1.80M/1.80M [00:00<00:00, 3.22MB/s]
special_tokens_map.json: 100%|██████████████████████████████████████████████████████|

In [41]:
# Model generation parameters, tweak around max_length and temperature for more creative outputs
# https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
generation_parameters = {
    "max_length": 1024,
    "temperature": 0.9,
    "top_k": 5,
    "top_p": 0.95,
    "repetition_penalty": 1.2,
    "num_return_sequences": 1,
    "do_sample": True,
    # "eos_token_id": tokenizer.eos_token_id
}

In [14]:
no_words = 512 # no of words to generate
topics = ['politics']  # , 'riots']
topics = ' or '.join(topics)
prompt = f'''
Generate some article about {topics} in around {no_words} words.
'''
model_inputs = tokenizer([prompt], return_tensors="pt").to(device)

### Generate a sample using the above prompt

With suffix `x`, we generate text with the following prompt.

prompt = "Generate some news articles about politics using keywords {keywords} in around {words} words."

In [None]:
import os
import numpy as np
import pandas as pd

proj = {
    'output_folder': r'data\generated',
    'prompt_path': 'prompt_{0}.txt',
    'output_suffix': 'x',  # one suffix for each variant associated with a specific type of prompt
    'index_from': 165,  # -1 to start from index == 0
}

if not os.path.exists(proj['output_folder']):
    os.makedirs(proj['output_folder'])
        
df = pd.read_csv("keywords.csv")
print(df.head())


def gen_prompt(keywords=['election'], words=500):
    keywords = ' and '.join(keywords)
    prompt = f'''Generate some news articles about politics using keywords {keywords} in around {words} words.'''  # for suffix 'x'
    return prompt


prompts = []
for index, row in df.iterrows():
    keywords = row['keywords'].split(':')  # keywords = ['election', 'politics']  # , 'riots']
    prompt = gen_prompt(keywords=keywords, words=row['count_tokens']+np.random.randint(low=-50, high=50))
    prompts.append(prompt)

with open(proj['prompt_path'].format(proj['output_suffix']), 'a') as file:
    for index, prompt in enumerate(prompts):
        file.write(f"[{index+1:03d}] prompt: {prompt}\n")

for index, row in df.iterrows():
    if index < proj['index_from']:
        continue
    prompt = prompts[index]
    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
    generated_ids = model.generate(**model_inputs, **generation_parameters)
    generated_ids_without_prompt = generated_ids[0][len(model_inputs['input_ids'][0]):].unsqueeze(0)
    output = tokenizer.batch_decode(generated_ids_without_prompt, skip_special_tokens=False)[0]
    print(f"[{index+1:03d}] prompt:", prompt)
    if index < 10:
        print(output)
        print("---")
    file_path = os.path.join(proj['output_folder'], f"{row['name'][0:3]}{proj['output_suffix']}.txt")
    with open(file_path, 'w') as file:
        file.write(output)
    # break

      name  length  count_sentences  count_tokens  \
0  001.txt    2601               19           511   
1  002.txt    2326               19           425   
2  003.txt    3109               26           604   
3  004.txt    1471               13           277   
4  005.txt    2860               24           579   

                                            keywords  
0  pay:maternity:months:said:would:plans:six:new:...  
1  information:said:freedom:mr:new:thomas:commiss...  
2  women:six:hewitt:sexism:jobs:men:months:work:c...  
3  blackpool:party:manchester:labour:conference:m...  
4  would:mr:brown:balls:said:election:chancellor:...  


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[166] prompt: Generate some news articles about politics using keywords blunkett and mr and home and said and love and bbc and job and quinn and secretary and visa in around 425 words.
[167] prompt: Generate some news articles about politics using keywords said and government and housing and homes and environmental and report and england and sustainable and communities and john in around 450 words.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[168] prompt: Generate some news articles about politics using keywords murder and guilty and sentences and committee and murderers and mps and sentence and said and plea and home in around 715 words.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[169] prompt: Generate some news articles about politics using keywords hunting and dogs and mr and offence and away and said and bradshaw and would and define and new in around 480 words.
[170] prompt: Generate some news articles about politics using keywords patients and said and powys and hospital and health and hereford and welsh and board and waiting and months in around 478 words.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[171] prompt: Generate some news articles about politics using keywords sports and would and said and children and schools and tories and two and hours and week and clubs in around 252 words.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
# generated_ids = model.generate(**model_inputs, **generation_parameters)
# generated_ids_without_prompt = generated_ids[0][len(model_inputs['input_ids'][0]):].unsqueeze(0)
# tokenizer.batch_decode(generated_ids_without_prompt, skip_special_tokens=False)[0]