In [15]:
!pip install -U bitsandbytes
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
import torch
from peft import LoraConfig, get_peft_model

# LoRA adapter hyperparameters; this config is later handed to get_peft_model().
peft_config = LoraConfig(
    task_type="CAUSAL_LM",  # the adapter wraps a causal language model
    r=64,                   # LoRA rank
    lora_alpha=16,          # LoRA alpha (scaling) hyperparameter
    lora_dropout=0.1,       # dropout used by the LoRA layers
    bias="none",
)

from datasets import Dataset

# 8-bit (bitsandbytes) quantization settings.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the raw task spreadsheet; cleaning happens in the steps below.
df = pd.read_excel("task_data.xlsx")

# Clean and preprocess text
def preprocess_text(text):
    """Normalize one spreadsheet cell to lowercase text without outer whitespace.

    Args:
        text: The cell value. Normally a ``str``; missing values
            (``None``/``NaN``) and non-string scalars such as numbers are
            accepted as well.

    Returns:
        str: ``""`` for a missing value, otherwise ``str(text)`` lowercased
        with leading and trailing whitespace removed.
    """
    # Empty Excel cells arrive as float NaN, which has no .lower() method.
    if text is None or pd.isna(text):
        return ""
    # str() also covers cells that pandas parsed as numbers.
    return str(text).lower().strip()

# Rows without a use case or an activity cannot be used as training examples,
# and pandas reads empty Excel cells as NaN (a float with no .lower()), so
# drop them before cleaning. .copy() makes the column assignments below write
# to an independent frame.
df = df.dropna(subset=['POTENTIAL_USE_CASES', 'ACTIVITY']).copy()

# Normalized text columns: the use case is the model input, the activity the target.
df['cleaned_use_cases'] = df['POTENTIAL_USE_CASES'].apply(preprocess_text)
df['cleaned_activity'] = df['ACTIVITY'].apply(preprocess_text)

# Encode labels: each distinct cleaned activity string becomes an integer id.
label_encoder = LabelEncoder()
df['activity_label'] = label_encoder.fit_transform(df['cleaned_activity'])

# Split data: 80% train / 20% validation, with a fixed seed for reproducibility.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Prepare datasets
def prepare_dataset(dataframe):
    """Convert the model-relevant columns of a DataFrame into a ``Dataset``.

    Args:
        dataframe: pandas DataFrame that contains the ``cleaned_use_cases``
            and ``activity_label`` columns.

    Returns:
        A ``datasets.Dataset`` holding exactly those two columns.
    """
    columns = ['cleaned_use_cases', 'activity_label']
    # Frames produced by train_test_split keep their shuffled original index;
    # without preserve_index=False it would be stored as an extra
    # "__index_level_0__" column in the dataset.
    return Dataset.from_pandas(dataframe[columns], preserve_index=False)

import gc
import os

train_dataset = prepare_dataset(train_df)
val_dataset = prepare_dataset(val_df)

model_id = "google/gemma-2-9b"

# Credentials must not live in the notebook: read the Hugging Face token from
# the environment. If HF_TOKEN is unset this is None and the hub client falls
# back to the credentials cached by `huggingface-cli login`.
# NOTE(review): the token previously hardcoded on this line has been exposed
# and should be revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN")

# Best-effort cleanup for re-runs of this cell in a live kernel: a model left
# over from an earlier run (and a Trainer still referencing it) would stay on
# the GPU while the new copy loads. The recorded CUDA OOM, raised before the
# first checkpoint shard finished loading, looks like exactly that situation.
# Restarting the kernel remains the reliable way to free the GPU.
model = None
trainer = None
gc.collect()
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=access_token,
    # Reuse the 8-bit config defined next to the data loading step.
    quantization_config=quantization_config,
    # Eager attention is a model-loading option (`attn_implementation`); the
    # original passed a misspelled variant to the tokenizer, where it had no effect.
    attn_implementation="eager",
)
model = get_peft_model(model, peft_config)  # Add adapter before passing to Trainer




# Tokenize function
def tokenize_function(examples, max_length=512):
    """Tokenize a batch into aligned causal-LM inputs and labels.

    Each example becomes one sequence, ``"Use case: <case>\\nActivity: <activity>"``
    followed by the tokenizer's EOS token. ``labels`` is a copy of
    ``input_ids`` in which every prompt and padding position is replaced by
    ``-100``, so the loss is computed on the activity tokens only.

    The previous version tokenized the activity as a separate padded sequence.
    Its token positions did not correspond to ``input_ids`` and its pad tokens
    were included in the loss.

    Args:
        examples: Batch dict from ``Dataset.map(batched=True)`` with the
            columns ``cleaned_use_cases`` (list of str) and ``activity_label``
            (list of integer ids produced by ``label_encoder``).
        max_length: Length every sequence is padded/truncated to.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an added
        ``labels`` entry, all as plain Python lists.

    Note:
        If a prompt alone fills ``max_length`` tokens, the activity is
        truncated away and that row contributes no loss.
    """
    prompts = [f"Use case: {case}\nActivity:" for case in examples["cleaned_use_cases"]]
    # Decode the whole batch of label ids in one call instead of once per row.
    activities = label_encoder.inverse_transform(examples["activity_label"])
    # EOS marks where the answer ends.
    eos = tokenizer.eos_token or ""
    full_texts = [f"{prompt} {activity}{eos}" for prompt, activity in zip(prompts, activities)]

    # No return_tensors: Dataset.map stores plain lists anyway.
    inputs = tokenizer(full_texts, padding="max_length", truncation=True, max_length=max_length)
    # Prompt-only tokenization gives the number of leading tokens to mask.
    # Assumes the prompt's tokens are a prefix of the full text's tokens.
    prompt_ids = tokenizer(prompts, truncation=True, max_length=max_length)["input_ids"]

    labels = []
    for token_ids, mask, prompt_tokens in zip(inputs["input_ids"], inputs["attention_mask"], prompt_ids):
        # Padding may be on either side, so locate the first real token.
        first_real = mask.index(1) if 1 in mask else len(mask)
        answer_start = first_real + len(prompt_tokens)
        labels.append([
            token if (keep and position >= answer_start) else -100
            for position, (token, keep) in enumerate(zip(token_ids, mask))
        ])

    inputs["labels"] = labels
    return inputs

# Tokenize datasets
# Tokenize both splits; the original columns are removed so that only the
# tokenizer outputs remain.
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_val = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names,
)

training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir='./logs',
    push_to_hub=False,
    # Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Optimisation
    # NOTE(review): 500 warmup steps may exceed the total number of training
    # steps if the spreadsheet is small - confirm against the dataset size.
    warmup_steps=500,
    weight_decay=0.01,
    # Logging, evaluation and checkpointing
    logging_steps=10,
    eval_strategy="epoch",  # `eval_strategy` is used in place of `evaluation_strategy`
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize Trainer with the PEFT-wrapped model and the tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./gemma_activity_classifier")

# Save label encoder so predicted ids can be mapped back to activity names.
import joblib
joblib.dump(label_encoder, './label_encoder.joblib')

print("Training completed and model saved.")



`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.71 GiB. GPU 0 has a total capacity of 39.56 GiB of which 230.81 MiB is free. Process 38818 has 39.33 GiB memory in use. Of the allocated memory 38.51 GiB is allocated by PyTorch, and 313.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [3]:
pip install peft

Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.0-py3-none-any.whl (322 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 322.5/322.5 kB 21.5 MB/s eta 0:00:00
Installing collected packages: peft
Successfully installed peft-0.13.0


In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 471.6/471.6 kB 33.9 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 12.9 MB/s eta 0:00:00
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K