In [None]:
# Report the CUDA compute capability of the current GPU as a (major, minor) tuple.
import torch
torch.cuda.get_device_capability()

(7, 5)

In [None]:
pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl (102.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Print the version of the CUDA compiler (nvcc) available on this machine.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
pip install accelerate

Collecting accelerate
  Downloading accelerate-0.29.1-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.3/297.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.29.1


In [None]:
# Terminate the kernel process immediately with exit status 0, skipping normal
# interpreter cleanup.
# NOTE(review): presumably this forces a runtime restart so the packages
# installed in the cells above are picked up — confirm; every cell below must
# be run again after this one.
import os
os._exit(00)

In [None]:
import json
import os
import pickle
import re
import time

import pandas as pd
import transformers
from torch import cuda, bfloat16
from tqdm import tqdm

pd.set_option('display.max_colwidth', None)


class LlamaModel:
    """Wrapper around a 4-bit quantized Hugging Face causal LM.

    On construction the model and tokenizer identified by ``model_id`` are
    loaded and wrapped in a ``text-generation`` pipeline, which is stored as
    ``self.generate_text``.

    Parameters
    ----------
    hf_token : str
        Hugging Face access token passed to every ``from_pretrained`` call.
    model_id : str
        Hub identifier of the model to load.
    """

    def __init__(self, hf_token ,model_id):
        self.model_id = model_id
        # Only used for the log message below; actual weight placement is
        # controlled by ``device_map='auto'`` in ``initalize_model``.
        self.device = self.get_device()
        self.hf_auth = hf_token
        self.bnb_config = self.get_bnb_config()
        self.initalize_model()
        print(f"Model loaded on {self.device}")

    @staticmethod
    def get_device():
        """Return ``'cuda:<index>'`` for the current CUDA device, or ``'cpu'`` if CUDA is unavailable."""
        return f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

    @staticmethod
    def get_bnb_config():
        """Return the bitsandbytes configuration: 4-bit NF4 with double quantization and bfloat16 compute."""
        return transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=bfloat16
        )

    def initalize_model(self):
        """Load config, model and tokenizer, then build ``self.generate_text``.

        The access token is passed with the ``token`` keyword; the older
        ``use_auth_token`` keyword is deprecated in Transformers.
        """
        # begin initializing HF items, need auth token for these

        model_config = transformers.AutoConfig.from_pretrained(
            self.model_id, token=self.hf_auth
        )

        # NOTE(review): trust_remote_code=True allows code shipped in the model
        # repository to run locally. Kept to preserve existing behaviour; only
        # use it with repositories you trust.
        model = transformers.AutoModelForCausalLM.from_pretrained(
            self.model_id,
            trust_remote_code=True,
            config=model_config,
            quantization_config=self.bnb_config,
            device_map='auto',
            token=self.hf_auth
        )

        tokenizer = transformers.AutoTokenizer.from_pretrained(
            self.model_id, token=self.hf_auth
        )

        self.generate_text = transformers.pipeline(
            model=model, tokenizer=tokenizer,
            return_full_text=True,  # prompt is echoed back; call_LLM strips it
            task='text-generation',
            # we pass model parameters here too
            #stopping_criteria=stopping_criteria,  # without this model rambles during chat
            temperature=0.0000001,  # near-zero to keep outputs as deterministic as possible; larger values add randomness
            max_new_tokens=2048,  # max number of tokens to generate in the output
            repetition_penalty=1.1  # without this output begins repeating
        )

    def call_LLM(self, _prompt):
        """Run the pipeline on ``_prompt`` and return the text after the last ``[/INST]`` marker.

        If the generated text contains no ``[/INST]`` marker, the whole
        generated text is returned unchanged.
        """
        res = self.generate_text(_prompt)
        # The pipeline returns the prompt plus completion; keep only what
        # follows the final instruction-closing tag.
        res = res[0]["generated_text"].split('[/INST]')[-1]
        return res

    def llm_response(self,prompt1,text) :
        """Fill the ``{}`` placeholder of ``prompt1`` with ``text`` and return the model's answer."""

        llm_response1 = self.call_LLM(prompt1.format(text))

        return llm_response1


if __name__ == "__main__" :

    # Small demo frame of English sentences to translate.
    df = pd.DataFrame(['I am Sagar Agrawal',
                             "I am from Bhopal",
                             "Kolkata won the match"],
                            columns = ['text'])

    ## Model initialisations
    # Read the Hugging Face token from the HF_TOKEN environment variable so the
    # credential is not committed with the notebook; the old placeholder is
    # kept as the fallback when the variable is unset.
    hf_token = os.environ.get('HF_TOKEN', 'change_this')
    model_id = 'meta-llama/Llama-2-13b-chat-hf'
    llama_model = LlamaModel(hf_token,model_id)

    ## Collected model outputs, one entry per input row
    translation_list = []

    ## Defining Prompts
    # The {} placeholder is filled with the text to translate.
    prompt = """
    <s>[INST] <<SYS>>
    You are a translator. Your job is to translate the provided text to Hindi. Please genereted only translated text in Hindi. Do not give any other details.

    <</SYS>>
    {}
    [/INST]
    """

    # Translate row by row and time the whole loop.
    start = time.time()
    for text in df.text.tolist() :
        prediction = llama_model.llm_response(prompt,text)
        translation_list.append(prediction)
    df['translated_text' ] = translation_list
    end = time.time()

    print(f"Time taken to predict for {len(df)} titles is : {(end - start)/60} minutes.")



config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Model loaded on cuda:0
Time taken to predict for 3 titles is : 0.22903844912846882 minutes.


In [None]:
# Display the frame: each source text next to its generated translation.
df

Unnamed: 0,text,translated_text
0,I am Sagar Agrawal,\n सगर अग्रवाल
1,I am from Bhopal,\n मैं भोपाल से हूँ।
2,Kolkata won the match,\n कोलकाता मैच जीता
