In [1]:
!pip install google-generativeai
!pip install datasets
!pip install -U bitsandbytes
!pip install transformers
!pip install -U peft
!pip install -U "huggingface_hub[cli]"
!pip install -U trl
!pip install pytesseract
!pip install evaluate



In [2]:
import google.generativeai as genai
from datasets import Dataset, DatasetDict
import pandas as pd
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, \
    BitsAndBytesConfig, TrainingArguments, pipeline, logging
import torch
from trl import SFTTrainer
from sklearn.model_selection import train_test_split
from PIL import Image
import pytesseract
from evaluate import load

create dataset, training and testing

In [3]:
qa_pairs_data = pd.read_csv('qa_pairs_merged.csv')
X = qa_pairs_data['Question']
y = qa_pairs_data['Answer']
X_train, X_test, y_train , y_test = train_test_split(X,y,test_size=0.30,random_state=42, shuffle=True)
train_data = pd.concat([X_train,y_train],axis=1)
test_data = pd.concat([X_test,y_test],axis=1)
print("Training")
print(train_data.head)
print("Testing")
print(test_data.head)

Training
<bound method NDFrame.head of                                               Question  \
173  In 2D gradient descent with f(θ)=-3θ1 -θ1θ2 + ...   
307  What is unsupervised learning, and how does it...   
165    How does masking work in transformer attention?   
337  What is deep reinforcement learning, and how d...   
19   When Alex was first studying machine learning,...   
..                                                 ...   
71   Why is σ(0) = 0.5 particularly important in lo...   
106  How is the bias vector 𝑊𝑙0 dimensioned for lay...   
270  What are kernels in CNNs, and how do they impa...   
348  How is deep learning expected to evolve over t...   
102  What shape do the decision boundaries of indiv...   

                                                Answer  
173  -1.11, calculated as -3(1.2) - (1.2)(0.7) + 2(...  
307  Unsupervised learning is a type of machine lea...  
165  Masking sets certain attention scores to negat...  
337  Deep Reinforcement Learning (DR

In [4]:
# Convert the pandas DataFrames to Hugging Face datasets
dataset_train = Dataset.from_pandas(train_data)
dataset_val = Dataset.from_pandas(test_data)

# Create a DatasetDict containing both train and test datasets
dataset = DatasetDict({"train": dataset_train, "test": dataset_val})

In [5]:
def format_qa_pair(example):
    prompt = f"<s>[INST]Question: {example['Question']}[/INST]"
    completion = f"Answer: {example['Answer']}</s>"
    return {"text": prompt + completion}

# Format both training and test datasets
train_dataset = dataset["train"].map(format_qa_pair)
test_dataset = dataset["test"].map(format_qa_pair)

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

In [6]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `HF_Token` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `HF_Token`


In [7]:
# Reference: https://huggingface.co/blog/4bit-transformers-bitsandbytes
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="fp4",  # FP4 is generally less accurate but more efficient than NF4.
   bnb_4bit_use_double_quant=False,  # Disabling double quantization can save resources.
   bnb_4bit_compute_dtype=torch.float16  # FP16 is faster than BF16 on many hardware setups.
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",  # Auto-distribute model across GPUs/CPUs
)

model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Additional optimizations for efficiency:
model.config.attn_config = {
    "num_heads": model.config.num_attention_heads // 2  # Reduce attention heads for faster computation
}
model.config.ffn_dim = model.config.hidden_size // 2  # Reduce feed-forward network size
model.config.hidden_dropout_prob = 0.2  # Increase dropout for less overfitting (smaller models benefit)
model.config.attention_probs_dropout_prob = 0.2  # Same for attention weights


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Tokenize the data
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.bos_token, tokenizer.eos_token

('<s>', '</s>')

In [9]:
# LoRA config -- Skeleton
model = prepare_model_for_kbit_training(model)
# Reference: https://huggingface.co/docs/peft/en/quicktour
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,
    lora_alpha=16,
    lora_dropout=0.2
)
model = get_peft_model(model, peft_config)

In [10]:
# Hyperparameters -- Skeleton
training_arguments = TrainingArguments(
    output_dir="./ese577_model",
    num_train_epochs=3,  # Number of epochs
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=1e-4,
    warmup_steps=5,
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
    fp16=True,
    gradient_accumulation_steps=8,  # Reduce memory by accumulating gradients
    gradient_checkpointing=True,  # Use gradient checkpointing,
)




In [11]:
# Load pre-defined metrics
metric_f1 = load("f1")
metric_exact_match = load("exact_match")

def compute_metrics(eval_preds):
    """
    Computes Exact Match (EM) and F1 score for the predictions.
    eval_preds: Tuple containing (predictions, labels)
    """
    predictions, references = eval_preds

    # Post-process predictions and references
    processed_predictions = [pred.strip() for pred in predictions]
    processed_references = [ref.strip() for ref in references]

    # Compute metrics
    f1 = metric_f1.compute(predictions=processed_predictions, references=processed_references)
    em = metric_exact_match.compute(predictions=processed_predictions, references=processed_references)

    # Return results as a dictionary
    return {"f1": f1["f1"], "exact_match": em["exact_match"]}


In [12]:
# Update trainer initialization
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    max_seq_length=256,  # Ensure shorter sequences
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=True,  # Enable dataset packing
    compute_metrics=None  # Disable metrics during training
)
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Epoch,Training Loss,Validation Loss
0,No log,2.007762
1,No log,1.794991
2,No log,1.762761


TrainOutput(global_step=63, training_loss=2.1316000317770336, metrics={'train_runtime': 1186.0613, 'train_samples_per_second': 0.43, 'train_steps_per_second': 0.053, 'total_flos': 5505993941188608.0, 'train_loss': 2.1316000317770336, 'epoch': 2.9647058823529413})

In [13]:
# Save the model
trainer.model.save_pretrained("ESE577_chatbot")
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.2, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pr

In [1]:
# Set logging verbosity
logging.set_verbosity(logging.CRITICAL)

# Define models and pipelines
base_pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=256, truncation=True)
finetuned_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=256, truncation=True)

def build_prompt(question):
    """
    Build a formatted prompt for ESE577 questions.
    """
    return f"<s>[INST]@ESE577. {question}. [/INST]"

def extract_text_from_image(image_path):
    """
    Extract text from an image using OCR (Tesseract).
    """
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

def upload_and_process_image():
    """
    Use Google Colab's upload functionality to handle image input.
    """
    print("Please upload an image file:")
    uploaded = files.upload()  # Upload files

    if not uploaded:
        print("No file uploaded. Please try again.")
        return None, None

    # Get the uploaded file path
    image_path = list(uploaded.keys())[0]
    text = extract_text_from_image(image_path)

    if not text:
        print("Could not extract text from image. Please try again.")
        return None, None

    print("\nExtracted text from image:")
    print(text)
    print()

    return text

def process_text_input():
    """
    Handle text-based questions.
    """
    question = input("Enter your ESE577-related question: ").strip()
    if not question:
        print("No question provided.")
        return None, None
    return question

def get_model_responses(question):
    """
    Generate and display responses from both the base and fine-tuned models.
    """
    prompt = build_prompt(question)

    # Get responses from models
    base_answer = base_pipe(prompt=prompt)
    fine_tuned_answer = finetuned_pipe(prompt=prompt)

    print("\nPretrained Model Response:")
    print(base_answer[0]["generated_text"])

    print("\nFine-Tuned Model Response:")
    print(fine_tuned_answer[0]["generated_text"])

    print()

# Main loop
while True:
    input_type = input("Enter input type ('text'/'image') or press Enter to exit: ").lower().strip()

    if not input_type:
        print("Exiting...")
        break

    if input_type not in ['text', 'image']:
        print("Invalid input type. Please enter 'text' or 'image'.")
        continue

    # Process input type
    if input_type == 'text':
        question = process_text_input()
    else:
        question = upload_and_process_image()

    # Skip iteration if no valid input
    if not question:
        continue

    # Get and display responses
    get_model_responses(question)


NameError: name 'logging' is not defined