In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM 

In [12]:
### load faceboo/opt-1.3b model in 8 bit format to save memory 
OPT = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", load_in_8bit=True)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [13]:
inp= " The quick brown fox jumps over the lazy dog. The dog is not happy about it."

In [29]:
inp_tokenized= tokenizer(inp , return_tensors="pt")


In [30]:
inp_tokenized

{'input_ids': tensor([[    2,    20,  2119,  6219, 23602, 13855,    81,     5, 22414,  2335,
             4,    20,  2335,    16,    45,  1372,    59,    24,     4]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [31]:
tokens= inp.split()
len(tokens)

16

each word is not always one token!

Some words get split into multiple subword tokens.

E.g., "dog." might split into "dog" + ".".

Or "it." → "it" + "."



***********************************

## model archetecture 

In [32]:
print(OPT.model)

OPTModel(
  (decoder): OPTDecoder(
    (embed_tokens): Embedding(50272, 2048, padding_idx=1)
    (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)
    (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-23): 24 x OPTDecoderLayer(
        (self_attn): OPTAttention(
          (k_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (out_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
        )
        (activation_fn): ReLU()
        (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear8bitLt(in_features=2048, out_features=8192, bias=True)
        (fc2): Linear8bitLt(in_features=8192, out_features=2048, bias=True)
        (final_layer_norm): LayerNorm((2048,), eps=1e-05, e

In [33]:
inp_tokenized['input_ids'].shape

torch.Size([1, 19])

In [40]:
print(next(OPT.model.parameters()).device)  # Model device


cuda:0


In [41]:
import torch 

In [42]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'
device


'cuda'

In [43]:
embedding_input = OPT.model.decoder.embed_tokens(inp_tokenized['input_ids'].to(device))
embedding_input # output 

tensor([[[-0.0407,  0.0519,  0.0574,  ..., -0.0263, -0.0355, -0.0260],
         [-0.0464,  0.0228,  0.0339,  ...,  0.0076, -0.0065, -0.0167],
         [-0.0455, -0.0236, -0.0121,  ...,  0.0043, -0.0166,  0.0193],
         ...,
         [ 0.0408,  0.0363,  0.0021,  ..., -0.0348,  0.0170, -0.0540],
         [ 0.0046,  0.0081,  0.0311,  ...,  0.0173,  0.0141, -0.0444],
         [ 0.0281,  0.0338,  0.0049,  ...,  0.0564,  0.0444, -0.0569]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<EmbeddingBackward0>)

In [44]:
#size 
embedding_input.size()

torch.Size([1, 19, 2048])

In [45]:
## layer  
OPT.model.decoder.embed_tokens

Embedding(50272, 2048, padding_idx=1)

In [46]:
#embed positions 
OPT.model.decoder.embed_positions

OPTLearnedPositionalEmbedding(2050, 2048)

inp_tokenized

In [49]:
inp_tokenized

{'input_ids': tensor([[    2,    20,  2119,  6219, 23602, 13855,    81,     5, 22414,  2335,
             4,    20,  2335,    16,    45,  1372,    59,    24,     4]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [54]:
#embed positions 
embed_pos_input = OPT.model.decoder.embed_positions(inp_tokenized['attention_mask'])
embed_pos_input

tensor([[[-8.1406e-03, -2.6221e-01,  6.0768e-03,  ...,  1.7273e-02,
          -5.0621e-03, -1.6220e-02],
         [-8.0585e-05,  2.5000e-01, -1.6632e-02,  ..., -1.5419e-02,
          -1.7838e-02,  2.4948e-02],
         [-9.9411e-03, -1.4978e-01,  1.7557e-03,  ...,  3.7117e-03,
          -1.6434e-02, -9.9087e-04],
         ...,
         [ 6.8130e-03, -4.7180e-02, -6.3515e-03,  ..., -1.2741e-02,
           2.6345e-04,  1.1848e-02],
         [ 7.9651e-03, -1.4923e-02, -2.2873e-02,  ...,  7.7009e-04,
          -3.7445e-02,  1.2596e-02],
         [-1.1832e-04, -2.1637e-02, -8.1110e-04,  ...,  5.6190e-03,
          -4.0016e-03, -1.0094e-02]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<EmbeddingBackward0>)

In [51]:
##size
OPT.model.decoder.embed_positions

OPTLearnedPositionalEmbedding(2050, 2048)

In [52]:
OPT.model

OPTModel(
  (decoder): OPTDecoder(
    (embed_tokens): Embedding(50272, 2048, padding_idx=1)
    (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)
    (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-23): 24 x OPTDecoderLayer(
        (self_attn): OPTAttention(
          (k_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (out_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
        )
        (activation_fn): ReLU()
        (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear8bitLt(in_features=2048, out_features=8192, bias=True)
        (fc2): Linear8bitLt(in_features=8192, out_features=2048, bias=True)
        (final_layer_norm): LayerNorm((2048,), eps=1e-05, e

tensor([[[-0.0407,  0.0519,  0.0574,  ..., -0.0263, -0.0355, -0.0260],
         [-0.0464,  0.0228,  0.0339,  ...,  0.0076, -0.0065, -0.0167],
         [-0.0455, -0.0236, -0.0121,  ...,  0.0043, -0.0166,  0.0193],
         ...,
         [ 0.0408,  0.0363,  0.0021,  ..., -0.0348,  0.0170, -0.0540],
         [ 0.0046,  0.0081,  0.0311,  ...,  0.0173,  0.0141, -0.0444],
         [ 0.0281,  0.0338,  0.0049,  ...,  0.0564,  0.0444, -0.0569]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<EmbeddingBackward0>)

In [56]:
## token embedding + position embedding  =  real embedding
embed_position_embeddding= embedding_input + embed_pos_input 

In [59]:
OPT.model.decoder.layers[0].self_attn

OPTAttention(
  (k_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
  (v_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
  (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
  (out_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
)

In [60]:
OPT.model.decoder.layers[0].self_attn(embed_position_embeddding)

(tensor([[[-0.0136, -0.0095,  0.0012,  ...,  0.0067, -0.0018,  0.0131],
          [-0.0134, -0.0098,  0.0028,  ...,  0.0085,  0.0001,  0.0122],
          [-0.0133, -0.0055,  0.0040,  ...,  0.0097,  0.0019,  0.0138],
          ...,
          [-0.0122, -0.0092,  0.0058,  ...,  0.0094,  0.0015,  0.0094],
          [-0.0123, -0.0095,  0.0059,  ...,  0.0095,  0.0016,  0.0092],
          [-0.0122, -0.0098,  0.0059,  ...,  0.0093,  0.0016,  0.0093]]],
        device='cuda:0', dtype=torch.float16, grad_fn=<MatMul8bitLtBackward>),
 None,
 None)

In [61]:
hidden_states, _ , _ = OPT.model.decoder.layers[0].self_attn(embed_position_embeddding)

In [72]:
hidden_states

tensor([[[-0.0136, -0.0095,  0.0012,  ...,  0.0067, -0.0018,  0.0131],
         [-0.0134, -0.0098,  0.0028,  ...,  0.0085,  0.0001,  0.0122],
         [-0.0133, -0.0055,  0.0040,  ...,  0.0097,  0.0019,  0.0138],
         ...,
         [-0.0122, -0.0092,  0.0058,  ...,  0.0094,  0.0015,  0.0094],
         [-0.0123, -0.0095,  0.0059,  ...,  0.0095,  0.0016,  0.0092],
         [-0.0122, -0.0098,  0.0059,  ...,  0.0093,  0.0016,  0.0093]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<MatMul8bitLtBackward>)

In [71]:
hidden_states.size()

torch.Size([1, 19, 2048])

In [74]:
#LAYERS
OPT.model.decoder.layers[0].self_attn

OPTAttention(
  (k_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
  (v_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
  (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
  (out_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
)

In [75]:
## bart 

In [77]:
from transformers import AutoModel, AutoTokenizer 
BART = AutoModel.from_pretrained('facebook/bart-large')
BART

BartModel(
  (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
  (encoder): BartEncoder(
    (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
    (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
    (layers): ModuleList(
      (0-11): 12 x BartEncoderLayer(
        (self_attn): BartAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True

In [79]:
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
sum = summarizer("""Gaga was best known in the 2010s for pop hits like “Poker Face” and avant-garde experimentation on albums like “Artpop,” and Bennett, a singer who mostly stuck to standards, was in his 80s when the pair met. And yet Bennett and Gaga became fast friends and close collaborators, which they remained until Bennett’s death at 96 on Friday. They recorded two albums together, 2014’s “Cheek to Cheek” and 2021’s “Love for Sale,” which both won Grammys for best traditional pop vocal album.""", min_length=20, max_length=50)
sum

config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


[{'summary_text': 'Bennett and Gaga became fast friends and close collaborators. They recorded two albums together, 2014\'s "Cheek to Cheek" and 2021\'s "Love for Sale"'}]

In [82]:
sum[0]['summary_text']

'Bennett and Gaga became fast friends and close collaborators. They recorded two albums together, 2014\'s "Cheek to Cheek" and 2021\'s "Love for Sale"'

In [84]:
classifier= pipeline('text-classification')
classifier

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


<transformers.pipelines.text_classification.TextClassificationPipeline at 0x20dfeb917d0>

In [87]:
classifier("This resturent is awesome")

[{'label': 'POSITIVE', 'score': 0.9998695850372314}]

In [90]:
gpt= AutoModel.from_pretrained('gpt2')

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [94]:
BERT = AutoModel.from_pretrained('bert-base-uncased') 
print(BERT)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [95]:
AutoModel.from_pretrained('gpt2')

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

import numpy 

In [98]:
import numpy as np 

def self_attention(query, key, value, mask=None):
    ## compute attention score 
    np.dot(query, key.T)
    
    if mask is not None : 
        ## apply mask by setting masked position to large negative value 
        scores = scores  + -1e9
        
    ##apply softmax 
    attention_weights= np.exp(scores) / np.sum(np.exp(scores) , axis= -1 ,keepdims= True )
    
    # compute weighted sum of value vectors 
    output= np.dot(attention_weights, value)

In [105]:
import os 
from dotenv import load_dotenv 

load_dotenv() #load variables form env 
api_key= os.getenv("COHERE_key")

In [106]:
import cohere

In [107]:
api_key

'aqhwRQAfBFsQb2pvIdXsrOgbTgscshNDqRQ48Hcz'

In [118]:
co = cohere.Client(api_key)

In [119]:
chat_history=[
{
    "role" : "USER" , 
    "message" : "who discoverd gravity" 
},

 {
     "role" : "CHATBOT", 
     "message": "The man who widely credited with discovering gravity is Sir Isac Newton"
     
 }
] 

chat_history

[{'role': 'USER', 'message': 'who discoverd gravity'},
 {'role': 'CHATBOT',
  'message': 'The man who widely credited with discovering gravity is Sir Isac Newton'}]

In [120]:
message = "what year was he bron ?" 
message

'what year was he bron ?'

In [122]:
response= co.chat(
    chat_history= chat_history , 
    message= message ,
    ## to perform web search before answering we can use our own connectors 
    connectors= [{"id": "web-search"}]
    
)

response




In [124]:
response.text

"Sir Isaac Newton was born on December 25, 1642. At the time of Newton's birth, England used the Julian calendar. However, when England adopted the Gregorian calendar in 1752, his birthday became January 4, 1643."

In [None]:
from transformers import AutoModelForCausalLM , AutoTokenizer 
import torch 

#downloade model 
model_id= "meta-llama/Llama-2-7"

In [125]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "meta-llama/Llama-2-7b-chat-hf"
cache_dir = "D:/transformers_cache"  # Set this to any folder on your D drive

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    cache_dir=cache_dir
)


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OSError: The paging file is too small for this operation to complete. (os error 1455)