In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import datasets
import logging
import os
import jsonlines
import sys
import itertools
from transformers import AutoTokenizer # For converting the texts into tokens

In [3]:
import lamini
# Point the Lamini client at the endpoint and API key held in environment
# variables; each attribute is set to None when its variable is not defined.
lamini.api_url = os.environ.get("POWERML__PRODUCTION__URL")
lamini.api_key = os.environ.get("POWERML__PRODUCTION__KEY")

In [4]:
# Notebook-wide logging: a module logger plus root-logger configuration.
logger = logging.getLogger(__name__)


def setup_logging():
    """Configure the root logger to write INFO-and-above records to stdout.

    Records are rendered as
    ``LEVEL:logger-name: message (YYYY-mm-dd HH:MM:SS; file:line)``.

    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so calling this again after a successful call has no effect.
    """
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format="%(levelname)s:%(name)s: %(message)s (%(asctime)s; %(filename)s:%(lineno)d)",
        datefmt="%Y-%m-%d %H:%M:%S",
    )


setup_logging()

### Tokenization and Encodings

In [5]:
# Tokenising texts
# string --encoding--> tokens
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/pythia-70m')
text = "Sample text for testing tokenization"
tokens = tokenizer(text)
logger.info(tokens['input_ids'])

# tokens --encoding--> string
encoded_sentence = tokenizer.decode(tokens['input_ids'])
logger.info(encoded_sentence)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO:__main__: [19352, 2505, 323, 5175, 10669, 1320] (2024-04-04 20:41:13; 1036835545.py:6)
INFO:__main__: Sample text for testing tokenization (2024-04-04 20:41:13; 1036835545.py:10)


In [6]:
# A small batch of sentences with different token lengths, used to demonstrate
# batch encoding, padding and truncation.
list_texts = [
    "Hi my name is Aditya Patel.",
    "I'm a computer science student at Toronto Metropolitan University.",
    "I'm passionate about software engineering, machine learning.",
    "Also athlete who loves watching formula one and cricket.",
]
encoded_list = tokenizer(list_texts)
print(encoded_list['input_ids'])

# Padding Concept
# Reuse the end-of-sequence token as the padding token, then pad every
# sequence in the batch to the length of the longest one.
tokenizer.pad_token = tokenizer.eos_token
encoded_texts_longest = tokenizer(list_texts, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])

# Truncation (from the right: the first `max_length` tokens are kept)
# The side is set explicitly because this cell switches it to "left" further
# down and that setting persists on the tokenizer object; without this line a
# re-run of the cell would silently truncate from the left here as well.
tokenizer.truncation_side = "right"
encoded_texts_truncation = tokenizer(list_texts, max_length=5, truncation=True)
print("Using truncation: ", encoded_texts_truncation["input_ids"])

# Truncation Left (the last `max_length` tokens are kept)
# NOTE: this setting stays in effect for every later use of `tokenizer`.
tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(list_texts, max_length=5, truncation=True)
print("Using left-side truncation: ", encoded_texts_truncation_left["input_ids"])

# Truncation and Padding Together
# Left-side truncation is still active here. Every sentence is longer than
# five tokens, so all rows are cut to length 5 and no padding is visible.
encoded_texts_both = tokenizer(list_texts, max_length=5, truncation=True, padding=True)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])

[[12764, 619, 1416, 310, 2006, 414, 66, 2790, 293, 15], [42, 1353, 247, 4382, 5859, 5974, 387, 13533, 28634, 2499, 15], [42, 1353, 22500, 670, 3694, 11369, 13, 5145, 4715, 15], [9917, 27799, 665, 14528, 7487, 7212, 581, 285, 25154, 15]]
Using padding:  [[12764, 619, 1416, 310, 2006, 414, 66, 2790, 293, 15, 0], [42, 1353, 247, 4382, 5859, 5974, 387, 13533, 28634, 2499, 15], [42, 1353, 22500, 670, 3694, 11369, 13, 5145, 4715, 15, 0], [9917, 27799, 665, 14528, 7487, 7212, 581, 285, 25154, 15, 0]]
Using truncation:  [[12764, 619, 1416, 310, 2006], [42, 1353, 247, 4382, 5859], [42, 1353, 22500, 670, 3694], [9917, 27799, 665, 14528, 7487]]
Using left-side truncation:  [[414, 66, 2790, 293, 15], [387, 13533, 28634, 2499, 15], [11369, 13, 5145, 4715, 15], [7212, 581, 285, 25154, 15]]
Using both padding and truncation:  [[414, 66, 2790, 293, 15], [387, 13533, 28634, 2499, 15], [11369, 13, 5145, 4715, 15], [7212, 581, 285, 25154, 15]]


### Instruction Dataset Preparation

In [7]:
# Stream the Alpaca instruction dataset from Hugging Face and look at its
# first few training examples before getting them ready for training.
instruction_tuned_dataset = datasets.load_dataset(
    "tatsu-lab/alpaca",
    split="train",
    streaming=True,
)
m = 5
top_m = list(itertools.islice(instruction_tuned_dataset, m))
for i in top_m:
    # Print each record followed by one empty line.
    print(i, end="\n\n")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}

{'instruction': 'What are the three primary colors?', 'input': '', 'output': 'The three primary colors are red, blue, and yellow.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat are the three primary colors?\n\n### Resp

In [8]:
# Prompt used when a record carries extra context in its "input" field.
# Placeholders: {instruction} and {input}.
prompt_template_with_input = (
    "Below is an instruction that describes a task, paired with an input that "
    "provides further context. Write a response that appropriately completes "
    "the request.\n"
    "\n"
    "### Instruction:\n"
    "{instruction}\n"
    "\n"
    "### Input:\n"
    "{input}\n"
    "\n"
    "### Response:"
)

# Prompt used when a record has no extra context. Placeholder: {instruction}.
prompt_template_without_input = (
    "Below is an instruction that describes a task. Write a response that "
    "appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{instruction}\n"
    "\n"
    "### Response:"
)

In [9]:
# Turn each sampled record into an {"input": prompt, "output": answer} pair,
# choosing the prompt template by whether the record's "input" field is empty.
processed_data = []
for j in top_m:
    if j["input"]:
        processed_prompt = prompt_template_with_input.format(
            instruction=j["instruction"],
            input=j["input"],
        )
    else:
        processed_prompt = prompt_template_without_input.format(
            instruction=j["instruction"],
        )
    processed_data.append({"input": processed_prompt, "output": j["output"]})
# Log the first pair as a quick sanity check.
logger.info(processed_data[0])

INFO:__main__: {'input': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'} (2024-04-04 20:41:16; 2009408342.py:8)


In [10]:
# Save the prompt/response pairs to a JSON Lines file.
try:
    with jsonlines.open("alpaca-instruction-tuning.jsonl", mode="w") as writer:
        writer.write_all(processed_data)
    logger.info("File alpaca-instruction-tuning.jsonl successfully created.")
except Exception as error:
    # Any failure is logged together with its traceback and is not re-raised.
    logger.exception(error)

INFO:__main__: File alpaca-instruction-tuning.jsonl successfully created. (2024-04-04 20:41:16; 1184919678.py:4)


### Lamini Dataset Preparation

In [11]:
# Load the Lamini docs dataset from Hugging Face and log its first few
# training rows.
# NOTE: `m` and `top_m` are rebound here, replacing the Alpaca sample taken
# earlier in the notebook. The rows logged below carry 'question'/'answer'
# keys, so re-running the Alpaca prompt-formatting cell after this one would
# fail on the missing "input" key.
finetuning_dataset_path = "lamini/lamini_docs"
finetuning_dataset = datasets.load_dataset(path=finetuning_dataset_path)
m = 10
top_m = list(itertools.islice(finetuning_dataset["train"], m))
for i in top_m:
    logger.info(i)

INFO:__main__: {'question': 'How can I evaluate the performance and quality of the generated text from Lamini models?', 'answer': "There are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance.", 'input_ids': [2347, 476, 309, 7472, 253, 3045, 285, 3290, 273, 253, 4561, 2505, 432, 418, 4988, 74, 3210, 32, 2512, 403, 2067, 17082, 326, 476, 320, 908, 281, 7472, 253, 3045, 285, 3290, 273, 4561, 2505, 432, 418, 4988, 74, 3210, 13, 1690, 44229, 414, 13, 378, 18