<a href="https://colab.research.google.com/github/venu72561-chinnam/AI-ML-project/blob/main/LLMarchitecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets torch accelerate gradio -q

In [2]:
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer, pipeline
)
from datasets import Dataset
import gradio as gr
import torch

In [3]:
sample_text = "Large Language Models generate human-like text."

# Tokenizer and model (DistilBERT for simplicity)
tokenizer_demo = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_demo = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [4]:
tokens = tokenizer_demo.tokenize(sample_text)
print(" Tokens:", tokens)

 Tokens: ['large', 'language', 'models', 'generate', 'human', '-', 'like', 'text', '.']


In [5]:
input_ids = tokenizer_demo(sample_text, return_tensors="pt").input_ids
embeddings = model_demo.distilbert.embeddings.word_embeddings(input_ids)
print("Embedding shape:", embeddings.shape)

Embedding shape: torch.Size([1, 11, 768])


In [6]:
data = [
    {"instruction": "Summarize the given text.",
     "input": "Machine learning is a field of AI that enables systems to learn from data.",
     "output": "Machine learning enables systems to learn from data."},
    {"instruction": "Explain tokenization.",
     "input": "",
     "output": "Tokenization splits text into smaller units like words or subwords."},
    {"instruction": "Define embedding.",
     "input": "",
     "output": "Embedding represents words as vectors capturing their meaning."}
]

In [7]:
dataset = Dataset.from_list(data)

In [8]:
def preprocess_mlm(example):
    text = f"{example['instruction']} {example['input']} {example['output']}"
    return tokenizer_demo(text, truncation=True, padding="max_length", max_length=64)

In [9]:
tokenized_dataset = dataset.map(preprocess_mlm, batched=False)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_demo, mlm=True, mlm_probability=0.15
)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [10]:
training_args = TrainingArguments(
    output_dir="./distilbert_mlm",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    logging_steps=1,
    save_strategy="no",
    learning_rate=5e-5
)

trainer = Trainer(
    model=model_demo,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:


Abort: 

In [11]:
t5_model_name = "t5-small"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

def preprocess_t5(example):
    inp = f"Instruction: {example['instruction']} Input: {example['input']}"
    model_inputs = t5_tokenizer(inp, truncation=True, padding="max_length", max_length=128)
    labels = t5_tokenizer(example['output'], truncation=True, padding="max_length", max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

t5_tokenized = dataset.map(preprocess_t5, batched=False)

training_args_t5 = TrainingArguments(
    output_dir="./t5_instruction_model",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    logging_steps=1,
    save_strategy="no",
    learning_rate=3e-5
)

trainer_t5 = Trainer(
    model=t5_model,
    args=training_args_t5,
    train_dataset=t5_tokenized
)

trainer_t5.train()
trainer_t5.save_model("./t5_instruction_model")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:


Abort: 

In [12]:
!pip install transformers datasets torch accelerate gradio -q


from transformers import (
    AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer, pipeline
)
from datasets import Dataset
import gradio as gr
import torch

sample_text = "Large Language Models generate human-like text."

# Tokenizer and model (DistilBERT for simplicity)
tokenizer_demo = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_demo = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")

tokens = tokenizer_demo.tokenize(sample_text)
print(" Tokens:", tokens)

input_ids = tokenizer_demo(sample_text, return_tensors="pt").input_ids
embeddings = model_demo.distilbert.embeddings.word_embeddings(input_ids)
print("Embedding shape:", embeddings.shape)

# -------------------------------------------------------------
#  Step 2: Build Instruction Dataset
# -------------------------------------------------------------
data = [
    {"instruction": "Summarize the given text.",
     "input": "Machine learning is a field of AI that enables systems to learn from data.",
     "output": "Machine learning enables systems to learn from data."},
    {"instruction": "Explain tokenization.",
     "input": "",
     "output": "Tokenization splits text into smaller units like words or subwords."},
    {"instruction": "Define embedding.",
     "input": "",
     "output": "Embedding represents words as vectors capturing their meaning."}
]
dataset = Dataset.from_list(data)

# -------------------------------------------------------------
#  Step 3: Mini Pre-training using Masked LM (DistilBERT)
# -------------------------------------------------------------
def preprocess_mlm(example):
    text = f"{example['instruction']} {example['input']} {example['output']}"
    return tokenizer_demo(text, truncation=True, padding="max_length", max_length=64)

tokenized_dataset = dataset.map(preprocess_mlm, batched=False)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_demo, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="./distilbert_mlm",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    logging_steps=1,
    save_strategy="no",
    learning_rate=5e-5
)

trainer = Trainer(
    model=model_demo,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()

# -------------------------------------------------------------
#  Step 4: Instruction Fine-tuning (T5-small)
# -------------------------------------------------------------
t5_model_name = "t5-small"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

def preprocess_t5(example):
    inp = f"Instruction: {example['instruction']} Input: {example['input']}"
    model_inputs = t5_tokenizer(inp, truncation=True, padding="max_length", max_length=128)
    labels = t5_tokenizer(example['output'], truncation=True, padding="max_length", max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

t5_tokenized = dataset.map(preprocess_t5, batched=False)

training_args_t5 = TrainingArguments(
    output_dir="./t5_instruction_model",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    logging_steps=1,
    save_strategy="no",
    learning_rate=3e-5
)

trainer_t5 = Trainer(
    model=t5_model,
    args=training_args_t5,
    train_dataset=t5_tokenized
)

trainer_t5.train()
trainer_t5.save_model("./t5_instruction_model")

# -------------------------------------------------------------
# Step 5: Gradio Interface for Instruction-Following
# -------------------------------------------------------------
pipe = pipeline("text2text-generation", model="./t5_instruction_model", tokenizer=t5_tokenizer)

def generate_text(instruction, user_input):
    prompt = f"Instruction: {instruction} Input: {user_input}"
    return pipe(prompt, max_length=80)[0]['generated_text']

demo = gr.Interface(
    fn=generate_text,
    inputs=[gr.Textbox(label="Instruction"), gr.Textbox(label="Input Text")],
    outputs=gr.Textbox(label="Output"),
    title="Mini GenAI Instruction Model (Offline, Token-Free)",
    description="T5-small model fine-tuned on local instruction dataset."
)

demo.launch(share=True)

 Tokens: ['large', 'language', 'models', 'generate', 'human', '-', 'like', 'text', '.']
Embedding shape: torch.Size([1, 11, 768])


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:


Abort: 

In [20]:
!pip install transformers datasets sentencepiece accelerate torch gradio -q


import os
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "0"   # allow model download, but no login
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["WANDB_DISABLED"] = "true"      # disables wandb tracking
os.environ["HF_HOME"] = "/tmp"             # use temp storage
os.environ["HF_DATASETS_OFFLINE"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#  Imports
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, pipeline
from datasets import Dataset
import gradio as gr

# Create a small instruction dataset
data = [
    {"instruction": "Summarize the given text.",
     "input": "Machine learning is a field of artificial intelligence that enables systems to learn from data and improve automatically.",
     "output": "Machine learning enables systems to learn and improve from data."},
    {"instruction": "Explain what tokenization means in NLP.",
     "input": "",
     "output": "Tokenization is the process of splitting text into smaller units like words or subwords."},
    {"instruction": "Define embedding in simple terms.",
     "input": "",
     "output": "An embedding is a way to represent words as numerical vectors capturing their meaning."},
    {"instruction": "Summarize the text.",
     "input": "Artificial Intelligence enables machines to perform tasks that normally require human intelligence.",
     "output": "AI enables machines to mimic human intelligence."},
]
dataset = Dataset.from_list(data)

# Tokenization / preprocessing
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name, use_auth_token=False)

def preprocess(example):
    inp = f"Instruction: {example['instruction']} Input: {example['input']}"
    model_inputs = tokenizer(inp, truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(example["output"], truncation=True, padding="max_length", max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data = dataset.map(preprocess, batched=False)

#  Model & training (fine-tuning on small data)
model = T5ForConditionalGeneration.from_pretrained(model_name, use_auth_token=False)

training_args = TrainingArguments(
    output_dir="./t5_instruction_finetune",
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="no",
    push_to_hub=False,        # <— prevents token prompt
    report_to=[]              # <— disables any reporting
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_data)
trainer.train()
trainer.save_model("./t5_instruction_model")

#  Inference pipeline
pipe = pipeline("text2text-generation", model="./t5_instruction_model", tokenizer=model_name)

#  Gradio Interface
def generate_text(instruction, user_input):
    prompt = f"Instruction: {instruction} Input: {user_input}"
    result = pipe(prompt, max_length=80)[0]['generated_text']
    return result

demo = gr.Interface(
    fn=generate_text,
    inputs=[gr.Textbox(label="Instruction (e.g. Summarize the text)"),
            gr.Textbox(label="Input Text")],
    outputs=gr.Textbox(label="Generated Output"),
    title=" Mini GenAI Instruction Model (Token-Free)",
    description="Fine-tuned T5-small model trained"
)

demo.launch(share=True)



Map:   0%|          | 0/4 [00:00<?, ? examples/s]



Step,Training Loss


Device set to use cuda:0


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4b120df16dad68c40c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


