In [1]:
import glob
import os

from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk import word_tokenize  # Simplfied notation; it's a wrapper for the TreebankWordTokenizer
from nltk.corpus import stopwords

from nltk.help import upenn_tagset
from nltk import pos_tag

from tqdm import tqdm

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
proj = {
    'input_folder': r'data/datasets/bbc/politics',
    'lib': 'nltk',  # 'nltk' or 'spacy', preferred library for tasks like tokenization and pos tagging
}

feat_list = {
    'path',
    'name',
    'text',
    'length',
    'sentences'
    'tokens',
    'pos_tags',
    'count_sentences',
    'count_tokens',
}

feat = {}
for f in feat_list:
    feat[f] = f

In [3]:
proj['input_folder']

'data/datasets/bbc/politics'

In [4]:
docs = []
paths = glob.glob(os.path.join(proj['input_folder'], '*'))
for path in tqdm(paths):
    if os.path.isfile(path):
        with open(path, 'r') as file:
            text = file.read()
            docs.append({
                'path': path,
                'name': os.path.basename(path),
                'text': text,
                'length': len(text),
            })

print("document 0: {0} characters".format(docs[0]['length']))

print("Number of files from the dataset:", len(docs))
print(docs[0]['name'])

100%|███████████████████████████████████████| 417/417 [00:00<00:00, 3774.96it/s]

document 0: 2137 characters
Number of files from the dataset: 417
096.txt





In [5]:
sentence_tokenizer = PunktSentenceTokenizer()

for doc in docs:
    doc['sentences'] = sentence_tokenizer.tokenize(doc['text'])
    doc['count_sentences'] = len(doc['sentences'])
    
for doc in docs:
    doc['tokens'] = []
    for s in doc['sentences']:
        doc['tokens'] += word_tokenize(s)
    doc['count_tokens'] = len(doc['tokens'])

In [12]:
docs[0].keys()

dict_keys(['path', 'name', 'text', 'length', 'sentences', 'count_sentences', 'tokens', 'count_tokens'])

## Go thru all the documents, and then take the first 75 tokens and save it

In [13]:
# all_first_tokens = []
# for doc in docs:
#     first_n_tokens = doc['tokens'][:75]
#     all_first_tokens.append(first_n_tokens)

all_first_sentences = []
for doc in docs:
    first_n_sentences = doc['sentences'][:2]
    all_first_sentences.append(first_n_sentences)

In [14]:
import pickle
with open("first_sentences.pickle", "wb") as f:
    pickle.dump(all_first_sentences, f)

## Feed tokens to LLM and generate new documents

In [15]:
import pickle
# all_first_tokens = pickle.load(open("first_tokens.pickle", "rb"))
all_first_tokens = pickle.load(open("first_sentences.pickle", "rb"))

In [16]:
model_name = "microsoft/phi-2"


In [17]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda"

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:01<00:00,  1.31it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
# Model generation parameters, tweak around max_length and temperature for more creative outputs
# https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
generation_parameters = {
    "max_length": 1000,
    "temperature": 0.9,
    "top_k": 10,
    "top_p": 0.95,
    "repetition_penalty": 1.2,
    "num_return_sequences": 1,
    "do_sample": True,
    # "eos_token_id": tokenizer.eos_token_id
}

In [19]:
from tqdm import tqdm

In [6]:
" ".join(all_first_tokens[1])

"February poll claim 'speculation' Reports that Tony Blair is planning a snap general election for February 2005 have been described as `` idle speculation '' by Downing Street . A spokesman said he had `` no idea '' where the reports in the Sunday Times and Sunday Telegraph had come from . The papers suggest ministers believe the government could benefit from a `` Baghdad bounce '' following successful Iraq elections in January . A"

In [21]:
from pathlib import Path
save_path = Path("data/generated_articles_4")
if not save_path.is_dir():
    save_path.mkdir(parents=True, exist_ok=True)

In [20]:
len(all_first_tokens)

417

In [26]:
# no_words = 512 # no of words to generate

for i in tqdm(range(len(all_first_tokens))):
# for i in range(10):
    first_words = ' '.join(all_first_tokens[i])
    # print(first_words)
    prompt = f'''
    Instruct: Not coding. Come up with a 1000 word news article about politics. Starting with "{first_words}". Output:
    '''
    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
    generated_ids = model.generate(**model_inputs, **generation_parameters)
    generated_ids_without_prompt = generated_ids[0][len(model_inputs['input_ids'][0]):].unsqueeze(0)
    output_results = tokenizer.batch_decode(generated_ids_without_prompt, skip_special_tokens=False)[0]
    # print(output_results)
    fn = f"result_{i}.txt"
    fp = save_path / fn
    with fp.open("w", encoding="utf-8") as f:
        f.write(output_results)

  0%|                                                   | 0/417 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|                                           | 1/417 [00:00<03:01,  2.29it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|▏                                          | 2/417 [00:01<06:26,  1.07it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|▎                                          | 3/417 [00:01<04:10,  1.65it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|▍                                          | 4/417 [00:04<08:16,  1.20s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|▌                                          | 5/417 [00:05<09:31,  1.39s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|▌                                          | 6/417 [00:10<17:14,  2.52s/it]Setting 

In [9]:
output_results

'"""\n\n    text = text_generator.sample(n=512)\n    print (f\'generated: {len(text)}\') <|endoftext|>'

In [None]:
first_words