# Here is the XGen AI bot 🤖, made with python and backed by llama3 💖

# Run all the Cells and wait ⌚
---
to run all cells go to `Runtime` -> `Run all` from the dropdown list.

In [1]:
!pip install python-telegram-bot transformers bitsandbytes datasets --quiet
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --quiet
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes --quiet

Collecting unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-6zoglz92/unsloth_2165e0ad1b354788b9edc318ae5a39d1
  Running command git clone -q https://github.com/unslothai/unsloth.git /tmp/pip-install-6zoglz92/unsloth_2165e0ad1b354788b9edc318ae5a39d1
  Resolved https://github.com/unslothai/unsloth.git to commit 933d9fe2cb2459f949ee2250e90a5b610d277eab
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (PEP 517) ... [?25ldone
[?25h  Created wheel for unsloth: filename=unsloth-2024.6-py3-none-any.whl size=117332 sha256=71274d0f581126e3124b6e602bd79ec1b67454d52081413f66cb9697142df085
  Stored in directory: /tmp/pip-ephem-wheel-cache-bm70h9yu/wheels/0b/bf/f5/61523189908a01bce8752a181f02f8b057ffc2c792447d39ff
Successfully buil

# llm configuration 🚀
---
I'm using the quantized version of the LLM(large language model) which eventually is **lightweight in RESOURCE usage** as compared with original `LLAMA3`.


In [3]:
from unsloth import FastLanguageModel
import torch
from huggingface_hub import notebook_login
# notebook_login()

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
    cache_dir = "./llama"
)

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.581 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# finetuning ⏲️

###part 1 : loading the alpaca format ( chat based template)

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

#------------------------------------#------------------chat template------------------#------------------------------------


from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("philschmid/guanaco-sharegpt-style", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Downloading readme:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.24M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9033 [00:00<?, ? examples/s]

Map:   0%|          | 0/9033 [00:00<?, ? examples/s]

## part 2 : training

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 2048,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

trainer_stats = trainer.train()

Map (num_proc=2):   0%|          | 0/9033 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 9,033 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.3273
2,1.4318
3,1.6716
4,1.8284
5,1.7234
6,1.448
7,1.1256
8,1.6615
9,1.7151
10,1.3801


# test it out ✅

# run only if lora model(finetuned) is present

In [1]:
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "llama/lora", 
        max_seq_length = 2048,
        dtype = None,
        load_in_4bit = True,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.581 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [38]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3",
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"},
)

from unsloth.chat_templates import get_chat_template

# FastLanguageModel.for_inference(model)

def generate(prompt):
    messages = [
      {"from": "human", "value": f"{prompt}"},
    ]
    inputs = tokenizer.apply_chat_template(
      messages,
      tokenize = True,
      add_generation_prompt = True,
      return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 111
                            , use_cache = True,
                           pad_token_id=tokenizer.eos_token_id
                           # attention_mask=
                          )

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # print(response)
    response_frmt = response[0].split("assistant")
    response_frmt = response_frmt[1].strip()
    return response_frmt

                                
  # return outputs

generate(input())

  green revolution negative inpact


'The Green Revolution, which began in the 1940s and 1950s, was a period of rapid agricultural development in many parts of the world, particularly in Asia and Latin America. The Green Revolution was characterized by the widespread adoption of new crop varieties, irrigation systems, and fertilizers, which allowed farmers to increase their yields and produce more food.\n\nHowever, the Green Revolution also had some negative impacts. For example, the increased use of fertilizers and pesticides led to soil degradation and water pollution. The increased use of irrigation systems also led to'

# elsewise run (immediate after training)

In [1]:
# model saving
model.save_pretrained("llama3-chat-bnb1122") # Local saving
# model.push_to_hub("pradachan/llama3-chat-bnb", token = "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")
# model.push_to_hub_merged("pradachan/llama3-chat-bnb-", tokenizer, save_method = "merged_16bit", token = "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")
# model.push_to_hub_merged("pradachan/llama3-chat-bnb", tokenizer, save_method = "merged_8bit", token = "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")

NameError: name 'model' is not defined

In [15]:
from unsloth.chat_templates import get_chat_template



tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3",
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"},
)

FastLanguageModel.for_inference(model)

def generate(prompt):
    messages = [
      {"from": "human", "value": f"{prompt}"},
    ]
    inputs = tokenizer.apply_chat_template(
      messages,
      tokenize = True,
      add_generation_prompt = True,
      return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 150, use_cache = True,
                           pad_token_id=tokenizer.eos_token_id
                           # attention_mask=
                          )

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # print(response)
    response_frmt = response[0].split("assistant")
    response_frmt = response_frmt[1].strip()
    return response_frmt

                                
  # return outputs

generate("hi how are you")

AttributeError: 'NoneType' object has no attribute 'gradient_checkpointing'

In [10]:

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import PreTrainedTokenizerFast, LlamaForCausalLM
from peft import PeftModel
from unsloth import FastLanguageModel


# model = AutoModelForCausalLM.from_pretrained("unsloth/llama-3-8b-Instruct-bnb-4bit", cache_dir="./llama")
# tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct-bnb-4bit", cache_dir="./llama")

def load_model(model_path, lora_adapter_path):
    # Load the tokenizer
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)

    # Load the model
    model = LlamaForCausalLM.from_pretrained(model_path)

    # Load the LoRA adapter
    model = PeftModel.from_pretrained(model, lora_adapter_path)

    # Enable native 2x faster inference
    model = FastLanguageModel.for_inference(model)

    return model, tokenizer

if __name__ == "__main__":
    model_path = "llama/models--unsloth--llama-3-8b-Instruct-bnb-4bit/snapshots/2950abc9d0b34ddd43fd52bbf0d7dca82807ce96/"
    lora_adapter_path = "llama/lora"

    model, tokenizer = load_model(model_path, lora_adapter_path)
    print("Model and LoRA adapter loaded successfully!")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model and LoRA adapter loaded successfully!


In [19]:
# from unsloth.chat_templates import get_chat_template

# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template = "llama-3",
#     mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"},
# )


def generate(prompt):
    messages = [
      {"from": "human", "value": f"{prompt}"},
    ]
    inputs = tokenizer.apply_chat_template(
      messages,
      tokenize = True,
      add_generation_prompt = True,
      return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 150, use_cache = True,
                           pad_token_id=tokenizer.eos_token_id
                           # attention_mask=
                          )

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # print(response)
    response_frmt = response[0].split("assistant")
    response_frmt = response_frmt[1].strip()
    return response_frmt

                                
  # return outputs

generate("hi how are you")

AttributeError: 'NoneType' object has no attribute 'generate'

# (Sagemaker) to flush out gpu memory 

In [54]:
import gc
import torch as t

t.cuda.empty_cache()
gc.collect()

torch.cuda.synchronize()

print("done")


done


# telegram bot code 🤖

In [None]:
import random
from telegram import Update
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, MessageHandler, filters
import google.generativeai as ai
import nest_asyncio as nio
import asyncio as aio
from google.colab import userdata


#its being used inorder to smoothly run our bot for indefinitely time in colab
nio.apply()

tkn = userdata.get('X_AI')
api = userdata.get('API_KEY')

#ai model
ai.configure(api_key = api)
model = ai.GenerativeModel('gemini-pro')


#tele based functions
async def start(update : Update, context : ContextTypes.DEFAULT_TYPE):
    greetings = [
        "Hey there! XgenAI at your service, what's going on?",
        "Oh hi! It's XgenAI here. What can I do for you today?",
        "XgenAI here! Ready for some fun? What can we discuss today?",
        "Hi there, you've reached XgenAI. How can I assist you?",
        "Namaste, XgenAI here! What would you like to talk about?"
    ]
    greet : str = random.choice(greetings)
    await update.message.reply_text(greet)


async def about(update : Update, context : ContextTypes.DEFAULT_TYPE):
    await update.message.reply_text("xGenAI, is an integration of Telegram and Google's generative-ai model.")


async def generate_prompt(update : Update, context : ContextTypes.DEFAULT_TYPE):
    text : str = update.message.text

    if text.startswith("."):
      text = text.replace('.','')
      res = generate(text)
      await update.message.reply_text(res)
    else:
      await update.message.reply_text(handle_res(text))


#ai model based
def generate(text :str)-> str:
    res = model.generate_content(text)
    return res.text


#message handler
def handle_res(text : str) -> str:
  if text.startswith("."):
    pass
  else:
    return "prompt should be like this -> .prompt"


if __name__ == '__main__':
    print("running...")

    app = ApplicationBuilder().token(tkn).build()

    #cmd handler tele
    start_handler = CommandHandler('start', start)
    app.add_handler(start_handler)

    about_handler = CommandHandler('about', about)
    app.add_handler(about_handler)

    #message handler
    app.add_handler(MessageHandler(filters.TEXT, generate_prompt))


    app.run_polling()
