## **1. Packages & Liberaries**
### *1a. Import of Packages*

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U einops

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h

### *1b. Import of Packages*

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
import torch

## **2. Model Definitions**
### *2a. Model Selection & some initializations*

In [5]:
base_model_id = "microsoft/phi-1_5"

compute_dtype = getattr(torch, "float16")
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else compute_dtype
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

print(dtype, DEVICE)

torch.float16 cuda:0


### *2b. Model Configuration Settings*

In [6]:
#for better GPU memory management
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)

### *2b. Model & tokenizer Instantiation*

In [8]:
model = AutoModelForCausalLM.from_pretrained(
          base_model_id,
          trust_remote_code=True,
          #quantization_config=bnb_config,     # can be replaced with "load_in_8bit=True" # for better response but more VRAM req
          torch_dtype = dtype,
          device_map={"": 0})

tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)
#tokenizer.pad_token = tokenizer.eos_token

### *2c. Checking model / tokenizer loading

In [10]:
%%time
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

query = "What is Gradient Descent?"

inputs = tokenizer(query, return_tensors='pt').to(model.device)
output = model.generate(**inputs, streamer=streamer, use_cache=True, max_new_tokens=200, do_sample=True)


Answer: Gradient Descent is a mathematical method used to find the lowest possible value for a function.

Exercise 5:
What is an Algorithm?
Answer: An algorithm is a step-by-step process used to solve a problem or complete a task.



Logical Reasoning Exercise:

Imagine a world where the rules of communication are different. In this world, people communicate through a complex system of sounds and symbols, similar to our language. However, instead of using spoken words, these sounds and symbols are represented using the tones of math. To understand this concept, let's relate it to the foundation of mathematics and the concept of decimals.

In mathematics, decimals are a way to express numbers that are less than a whole. They are often used when we need to represent a part of a whole. Decimal numbers are represented using a decimal point, which separates the whole part from the fractional part.
CPU times: user 8.21 s, sys: 48.9 ms, total: 8.25 s
Wall time: 8.38 s


## 3. **Inference Pipelines**
### *3a. Setting up Generation Config*

In [11]:
from transformers.generation.utils import StoppingCriteria, List, StoppingCriteriaList

class StopGenerationCriteria(StoppingCriteria):
    def __init__(self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device):

        stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
        self.stop_token_ids = [
            torch.tensor(x, dtype=torch.long, device=device) for x in stop_token_ids]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in self.stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids) :], stop_ids).all():
                return True
        return False

stop_tokens = [["### Human", ":"], ["### Assitant", ":"]]
stopping_criteria = StoppingCriteriaList(
    [StopGenerationCriteria(stop_tokens, tokenizer, model.device)])

streamer = TextStreamer(
    tokenizer, skip_prompt=True, skip_special_tokens=True, use_multiprocessing=False)

In [19]:
from transformers import GenerationConfig, TextStreamer, pipeline

generation_config = model.generation_config
generation_config.temperature = 0.2     #changeable
generation_config.max_new_tokens = 384    #changeable
generation_config.repetition_penalty = 1.7   #changeable
generation_config.num_return_sequences = 1
generation_config.use_cache = False
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [15]:
generation_config

GenerationConfig {
  "eos_token_id": 50256,
  "max_new_tokens": 384,
  "pad_token_id": 50256,
  "repetition_penalty": 1.7,
  "temperature": 0.01,
  "use_cache": false
}

### *3b. Inference Pipeline*

In [20]:
pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    do_sample=True, #
    generation_config=generation_config,
    streamer=streamer,
    stopping_criteria=stopping_criteria,
    batch_size=1,
)

### *3c. Pipeline Checking*

In [21]:
response = pipe(['What is Gradient Descent?', 'what is recipe of Pizza?'])


A: In machine learning, gradient descent refers to the process of adjusting parameters in a model based on its performance. It involves iteratively moving towards an optimal solution by minimizing errors or maximizing accuracy using small updates called gradients calculated from loss functions and their derivatives with respect each parameter's value (weights). 



Answer: A pizza has a crust, tomato sauce and cheese. 


In [22]:
from pprint import pprint

pprint(response, width=120)
#response

[[{'generated_text': 'What is Gradient Descent?\n'
                     'A: In machine learning, gradient descent refers to the process of adjusting parameters in a '
                     'model based on its performance. It involves iteratively moving towards an optimal solution by '
                     'minimizing errors or maximizing accuracy using small updates called gradients calculated from '
                     "loss functions and their derivatives with respect each parameter's value (weights). \n"
                     '\n'}],
 [{'generated_text': 'what is recipe of Pizza?\nAnswer: A pizza has a crust, tomato sauce and cheese. '}]]


In [46]:
from IPython.display import HTML
from pprint import pprint

#pprint(response, width=120)
response

[[{'generated_text': 'What is Gradient Descent?\nAnswer: In the context of machine learning, gradient descent refers to a method used for optimizing models. It involves iteratively adjusting parameters (weights and biases) in order that they minimize errors or loss functions associated with predictions made by these model\'s outputs on new data points - this process continues until convergence occurs where no further improvement can be achieved through training iterations alone due to overfitting issues caused when we have too many features/parameters involved which leads us into an infinite loop during prediction time!\n\n    ```python  # No code here as it’s just theory explanation about what \'gradient\' means... but don\'t worry if you\'re not sure yet because I\'ll explain more later!) # Python Concept Explanation : "Gradients" are vectors pointing from each point back towards its original position."] ] } { [... continue explaining other concepts like Regularization & Bias-Varianc

### *3d. Langchain pipeline* (llm)

In [25]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [26]:
%pip install -qqq langchain
from langchain import HuggingFacePipeline

#import warnings
#warnings.filterwarnings("ignore", category=UserWarning)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [27]:
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0.001})

In [28]:
llm('what is pizza')

?"
Answer: Pizza refers to a dish made with dough, tomato sauce and cheese. It can be customized in many ways by adding toppings such as vegetables or meats like pepperoni! 




'?"\nAnswer: Pizza refers to a dish made with dough, tomato sauce and cheese. It can be customized in many ways by adding toppings such as vegetables or meats like pepperoni! \n\n'

In [55]:
'''from auto_gptq import AutoGPTQForCausalLM
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma'''

from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain

In [29]:
template = f'''You are a Deep Learning Teacher. You know how to explain various deep learning concepts to 10th grade student.
Your task is to reply student queries
### Human:{query},\n
### Answer:'''.strip()
query = 'What is Gradient Descent?'

prompt = PromptTemplate(input_variables=["query", 'context'], template=template)

NameError: ignored

In [30]:
qa = load_qa_chain(
    llm,
    chain_type="stuff",
    prompt=prompt,
    verbose=0,
    )

NameError: ignored

In [None]:
smodel_id = 'openai/whisper-large-v3'