In [1]:
!pip install transformers datasets peft accelerate bitsandbytes trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.20.0-py3-none-any.whl.metadata (11 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.wh

In [None]:
from huggingface_hub import login
login(token="----")

In [22]:
# ✅ 1. Set up environment
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from datasets import load_dataset
import torch

# Ensure GPU is being used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [23]:
# ✅ 2. Load tokenizer and model
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",  # automatically maps model to GPU if available
    trust_remote_code=True
)
model = prepare_model_for_kbit_training(model)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
# ✅ 3. Apply LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, lora_config)

In [25]:
# ✅ 4. Load dataset
dataset = load_dataset("json", data_files="/kaggle/input/jsonfiles/ISO_Format_Dataset.jsonl")  # <-- Change this to your JSON file
train_data = dataset["train"]

Generating train split: 0 examples [00:00, ? examples/s]

In [26]:
# ✅ 5. Format prompt
def format_instruction(example):
    instruction = str(example.get("instruction", ""))
    input_text = str(example.get("input", ""))
    output_text = str(example.get("output", ""))

    prompt = f"""### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output_text}
"""
    return {"text": prompt}

train_data = train_data.map(format_instruction)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [27]:
# ✅ 6. Tokenization
def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

tokenized_dataset = train_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [33]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# ✅ Fast TrainingArguments for quick test
training_args = TrainingArguments(
    output_dir="./mistral_datetime_lora_test",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    num_train_epochs=1,  # Only 1 epoch
    max_steps=5,  # ✅ Limit to 5 steps only
    logging_steps=1,
    save_strategy="no",  # ✅ Don't save checkpoints
    bf16=True if torch.cuda.is_available() else False,
    report_to="none",  # ✅ No external logging
    disable_tqdm=False,  # ✅ Show progress bar
    logging_dir="./logs_test",
)

# ✅ Dummy collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# ✅ Optional: Reduce dataset size for fast debug
small_train_dataset = tokenized_dataset.select(range(10))  # just 10 samples

# ✅ Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ✅ Start fast training
trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss
1,1.8233
2,1.6242
3,1.7158
4,1.6357
5,1.1495


TrainOutput(global_step=5, training_loss=1.5896893978118896, metrics={'train_runtime': 69.4964, 'train_samples_per_second': 0.072, 'train_steps_per_second': 0.072, 'total_flos': 109863984168960.0, 'train_loss': 1.5896893978118896, 'epoch': 0.5})

In [34]:
# Save the trained model
trainer.save_model()
print("Model saved to ./time-date-extractor/")

Model saved to ./time-date-extractor/


In [35]:
# Test inputs
test_inputs = [
    "tomorrow morning at 9:15",
    "next Friday between 2-4 pm",
    "this afternoon around 3:30",
    "last Tuesday at 11 am"
]

# Set model to evaluation mode
model.eval()

instruction = "Extract exact datetime from this expression assuming today is 2025-08-04."

print("Testing the fine-tuned model:")
print("=" * 60)

for test_input in test_inputs:
    # Format prompt for inference
    prompt = f"""### Instruction:
{instruction}

### Input:
{test_input}

### Output:
"""
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract only the generated part
    generated_text = response[len(prompt):].strip()
    
    print(f"Input: {test_input}")
    print(f"Output: {generated_text}")
    print("-" * 50)


Testing the fine-tuned model:




Input: tomorrow morning at 9:15
Output: {'datetime': datetime.datetime(2025, 8, 5, 9, 15)}
--------------------------------------------------
Input: next Friday between 2-4 pm
Output: {'start': datetime.datetime(2025, 8, 9, 2, 0), 'end': datetime.datetime(2025, 8, 9, 4, 0)}

### Explanation:
Assuming today is 2025-08-04, next Friday would be 2025-08-09. The time range given is between 2-4 pm,
--------------------------------------------------
Input: this afternoon around 3:30
Output: {'start': datetime.datetime(2025, 8, 4, 15, 30), 'end': None}

### Explanation:
The datetime extracted from the given input is 3:30 PM on 2025-08-04. Since there is no information about an end time, the end time is set to None.
--------------------------------------------------
Input: last Tuesday at 11 am
Output: {'start': datetime.datetime(2025, 8, 6, 11, 0), 'end': None}

### Explanation:
Assuming last Tuesday means the Tuesday before today, which is 2025-08-06. The time given is 11 am, so the start tim

In [36]:
# If you want to test prompt formatting without training, run this cell instead
test_cases = [
    "yesterday evening around 8:30",
    "this morning at 7-10", 
    "show it on 26th April at 10"
]

instruction = "Extract exact datetime from this expression assuming today is 2025-08-04."

print("Testing prompt formatting:")
for case in test_cases:
    prompt = f"""### Instruction:
{instruction}

### Input:
{case}

### Output:
"""
    print(f"Input: {case}")
    print("Formatted prompt:")
    print(prompt)
    print("-" * 50)


Testing prompt formatting:
Input: yesterday evening around 8:30
Formatted prompt:
### Instruction:
Extract exact datetime from this expression assuming today is 2025-08-04.

### Input:
yesterday evening around 8:30

### Output:

--------------------------------------------------
Input: this morning at 7-10
Formatted prompt:
### Instruction:
Extract exact datetime from this expression assuming today is 2025-08-04.

### Input:
this morning at 7-10

### Output:

--------------------------------------------------
Input: show it on 26th April at 10
Formatted prompt:
### Instruction:
Extract exact datetime from this expression assuming today is 2025-08-04.

### Input:
show it on 26th April at 10

### Output:

--------------------------------------------------
