In [None]:
# Install required packages
!pip install datasets evaluate transformers rouge-score nltk torch

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=76fcf15ef087e42790a8e72e4009cac29e7ca8e3365679398511abbf73028963
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, evaluate
Successfully installed evaluate-0.4.5 rouge-score-0.1.2


In [None]:
import json
# import nltk
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from evaluate import load
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from huggingface_hub import notebook_login

# Optional: Login to Hugging Face Hub to push your model
# notebook_login()


In [None]:
# Load your JSONL dataset
def load_jsonl_dataset(file_path):
    """Load JSONL file and convert to proper format"""
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            item = json.loads(line.strip())

            # Extract instruction and response
            instruction = item['instruction']

            # Convert response dict to text format
            response_dict = item['response']
            response_text = ""
            for key, value in response_dict.items():
                if isinstance(value, list):
                    response_text += f"{key.replace('_', ' ').title()}: {'; '.join(value)}. "
                else:
                    response_text += f"{key.replace('_', ' ').title()}: {value}. "

            data.append({
                'instruction': instruction,
                'response': response_text.strip()
            })

    return data

# Load your dataset
dataset_path = '/content/phase3_dataset_900.jsonl'
raw_data = load_jsonl_dataset(dataset_path)

# Convert to HuggingFace Dataset format
dataset = Dataset.from_list(raw_data)

# Split into train/validation (80-20 split)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict({
    'train': dataset['train'],
    'validation': dataset['test']
})

print(f"Train size: {len(dataset_dict['train'])}")
print(f"Validation size: {len(dataset_dict['validation'])}")
print(f"Sample: {dataset_dict['train'][0]}")


Train size: 720
Validation size: 180
Sample: {'instruction': 'Flood emergency in Imphal, Manipur near Paona Bazar (795001)! People need urgent help!', 'response': 'Situation Assessment: Flood reported in Imphal, Manipur near Paona Bazar (795001). Flooding covers 48.9 km² with anomaly 24.1%.. Immediate Actions: Alert issued by Police Station.; Mobilize local teams via NH15.. Evacuation And Shelter: Guide residents to School Shelter.; Keep Main Road open for rescue operations.. Medical And Critical Infra: Civil Hospital activated for triage.; Police Station coordinating response.. Long Term Strategies: Enhance GIS-based monitoring systems.; Strengthen local infrastructure for resilience..'}


In [None]:
# Model configuration
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

print(f"Loaded {model_checkpoint}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loaded t5-small
Tokenizer vocab size: 32100


In [None]:
# For disaster management, we'll use a specific prefix
prefix = "so user query is, respond to it: "

max_input_length = 512
max_target_length = 256  # Increased for your detailed responses

def preprocess_function(examples):
    """Preprocess the dataset for T5 training"""
    # Add prefix to instructions
    inputs = [prefix + instruction for instruction in examples['instruction']]

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding=False  # We'll pad in the data collator
    )

    # Tokenize targets (responses)
    labels = tokenizer(
        text_target=examples['response'],
        max_length=max_target_length,
        truncation=True,
        padding=False
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict['train'].column_names
)

print("Preprocessing completed!")
print(f"Sample tokenized input: {tokenized_datasets['train'][0]}")

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Preprocessing completed!
Sample tokenized input: {'input_ids': [78, 1139, 11417, 19, 6, 3531, 12, 34, 10, 28391, 3583, 16, 1318, 21367, 6, 1140, 23, 3791, 1084, 2709, 106, 9, 2659, 7061, 41, 4440, 2560, 6982, 55, 2449, 174, 10839, 199, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [15494, 15186, 10, 28391, 2196, 16, 1318, 21367, 6, 1140, 23, 3791, 1084, 2709, 106, 9, 2659, 7061, 41, 4440, 2560, 13883, 28391, 53, 3792, 4678, 5, 1298, 2280, 357, 28, 23236, 63, 997, 5, 4704, 5, 5, 1318, 5700, 342, 6776, 7, 10, 23951, 4683, 57, 5076, 5939, 5, 117, 1290, 3727, 1737, 415, 2323, 1009, 3, 15743, 1808, 5, 5, 17627, 1071, 257, 275, 30415, 10, 4637, 2797, 12, 1121, 30415, 5, 117, 3521, 5140, 2409, 539, 21, 9635, 2673, 5, 5, 3721, 275, 23208, 86, 89, 52, 9, 10, 7707, 4457, 8195, 26, 21, 6467, 545, 5, 117, 5076, 5939, 3, 24232, 1773, 5, 5, 3230, 3, 11679, 28026, 10, 27190, 15, 350, 4555, 18, 390, 4891, 

In [None]:
# Training configuration
batch_size = 8  # Adjust based on your GPU memory
model_name = model_checkpoint.split("/")[-1]
import torch
training_args = Seq2SeqTrainingArguments(
    output_dir             = './new_model_v1',
    num_train_epochs       = 5,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size  = 2,
    learning_rate          = 3e-5,
    logging_dir            = 'new_model_v1/logs',
    logging_steps          = 50,
    eval_strategy          = "epoch",
    save_strategy          = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model  = "eval_loss",
    predict_with_generate  = True,
    generation_max_length  = 128, # Keep this as it was before
    generation_num_beams   = 1,  # Set to 1 for greedy decoding during evaluation
    report_to              = "none",
    fp16                   = torch.cuda.is_available()
)

In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Data collator for seq2seq
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

# Load ROUGE metric
metric = load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE metrics for evaluation"""
    predictions, labels = eval_pred

    # Replace -100s in labels (they are ignored in loss computation)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Filter out invalid token IDs in predictions before decoding
    valid_predictions = []
    for pred_seq in predictions:
        valid_pred_seq = [token_id for token_id in pred_seq if token_id >= 0 and token_id < tokenizer.vocab_size]
        valid_predictions.append(valid_pred_seq)


    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(valid_predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        use_aggregator=True
    )

    # Extract scores
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Create trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer, # Keep tokenizer here for data collator and other uses
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
print("Starting training...")
trainer.train()

# Save the final model
trainer.save_model("./colab_new_model_v1")
print("Training completed and model saved!")

  trainer = Seq2SeqTrainer(


Starting training...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.5035,0.119925,79.4179,77.9843,79.3574,79.3378,119.1944
2,0.1263,0.057805,92.3078,90.7388,92.2528,92.2588,126.3222
3,0.0846,0.054211,92.65,90.8229,92.5521,92.5551,126.5056
4,0.0774,0.053138,92.7381,90.9454,92.6353,92.6395,126.4611
5,0.0773,0.052849,92.7517,90.9424,92.6527,92.6576,126.5056


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training completed and model saved!


In [None]:
# Replace 'your_folder_name' with the name of the folder you want to download
folder_to_download = '/content/new_model_v1'

# Zip the folder
!zip -r {folder_to_download}.zip {folder_to_download}

# Download the zipped folder
from google.colab import files
files.download(f'{folder_to_download}.zip')

  adding: content/new_model_v1/ (stored 0%)
  adding: content/new_model_v1/checkpoint-1800/ (stored 0%)
  adding: content/new_model_v1/checkpoint-1800/spiece.model (deflated 48%)
  adding: content/new_model_v1/checkpoint-1800/model.safetensors (deflated 10%)
  adding: content/new_model_v1/checkpoint-1800/config.json (deflated 63%)
  adding: content/new_model_v1/checkpoint-1800/optimizer.pt (deflated 8%)
  adding: content/new_model_v1/checkpoint-1800/training_args.bin (deflated 54%)
  adding: content/new_model_v1/checkpoint-1800/generation_config.json (deflated 28%)
  adding: content/new_model_v1/checkpoint-1800/special_tokens_map.json (deflated 85%)
  adding: content/new_model_v1/checkpoint-1800/scheduler.pt (deflated 62%)
  adding: content/new_model_v1/checkpoint-1800/rng_state.pth (deflated 26%)
  adding: content/new_model_v1/checkpoint-1800/tokenizer.json (deflated 74%)
  adding: content/new_model_v1/checkpoint-1800/tokenizer_config.json (deflated 95%)
  adding: content/new_model_v1

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# NEW MODEL TRAINING

In [None]:
# Cell 1 — Install libraries (run once)
!pip install -qU transformers datasets accelerate evaluate rouge-score sentencepiece sacrebleu

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
# Cell 3 — Imports & GPU check
import os, json, math, shutil, glob, torch
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer)
import evaluate
import numpy as np
import matplotlib.pyplot as plt

print("Torch:", torch.__version__)
print("Cuda available:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu")

Torch: 2.8.0+cu126
Cuda available: True
Device: Tesla T4


In [None]:
# Cell 4 — Config (edit paths / hyperparams here)
DATA_PATH = "./cleaned_dataset.jsonl"   # <-- set this
OUTPUT_DIR = "./Working-Model"      # <-- where to save weights
MODEL_NAME = "google/flan-t5-small"   # lightweight, good for T4
PREFIX = "Interpret and respond (JSON): "  # optional prefix to help model

# Training hyperparams tuned for T4 (16GB); reduce batch if OOM
NUM_EPOCHS = 4
PER_DEVICE_BATCH = 4        # try 4; lower if OOM
GRAD_ACCUM = 2              # effective batch = PER_DEVICE_BATCH * GRAD_ACCUM
LEARNING_RATE = 3e-5
MAX_INPUT_LEN = 256
MAX_TARGET_LEN = 1000        # JSON outputs can be long: enlarge as needed
SEED = 42

os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# Cell 5 — Load dataset (expects each line JSON with fields: instruction, context, response)
# We'll convert each training example into:
#   input_text = PREFIX + instruction
#   target_text = json.dumps({"instruction": instruction, "context": context, "response": response}, ensure_ascii=False)

def load_jsonl_to_hf(path):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for ln in f:
            ln = ln.strip()
            if not ln:
                continue
            try:
                obj = json.loads(ln)
            except Exception as e:
                # skip malformed
                continue
            instr = obj.get("instruction") or obj.get("input") or ""
            ctx = obj.get("context", {})
            resp = obj.get("response", {})
            # ensure response is JSON-serializable
            target_obj = {
                "instruction": instr,
                "context": ctx,
                "response": resp
            }
            rows.append({"instruction": instr, "target_json": json.dumps(target_obj, ensure_ascii=False)})
    return Dataset.from_list(rows)

ds = load_jsonl_to_hf(DATA_PATH)
print("Total rows loaded:", len(ds))
# quick split
ds = ds.train_test_split(test_size=0.1, seed=SEED)
print("Train:", len(ds['train']), "Val:", len(ds['test']))


Total rows loaded: 900
Train: 810 Val: 90


In [None]:
# Cell 6 — Tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Add special tokens if needed (rare)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token":"<pad>"})
    model.resize_token_embeddings(len(tokenizer))

print("Loaded model & tokenizer:", MODEL_NAME)


Loaded model & tokenizer: google/flan-t5-small


In [None]:
# Cell 7 — Preprocessing / tokenization functions
def preprocess_batch(examples):
    inputs = [PREFIX + ins for ins in examples["instruction"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LEN, truncation=True, padding="max_length")

    # Targets are pre-built JSON strings
    targets = examples["target_json"]
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LEN, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Map tokenization (batched)
tokenized = ds.map(preprocess_batch, batched=True, remove_columns=ds['train'].column_names)
print("Tokenization done. Example input ids shape:", len(tokenized['train'][0]['input_ids']))


Map:   0%|          | 0/810 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Tokenization done. Example input ids shape: 256


In [None]:
# Constants
prefix = "so user query is, respond to it: "
max_input_length = 512
max_target_length = 256

def preprocess_function(examples):
    """Preprocess the dataset for T5 with robust label handling"""
    # 1️⃣ Prepare inputs with prefix
    inputs = [prefix + instruction for instruction in examples["instruction"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding=False
    )

    # 2️⃣ Tokenize responses
    raw_labels = examples["response"]

    # Convert response (dicts/lists/strings) → clean string
    def safe_str(x):
        if isinstance(x, dict):
            return json.dumps(x, ensure_ascii=False)
        if isinstance(x, list):
            return "; ".join(map(str, x))
        return str(x)

    labels_text = [safe_str(r) for r in raw_labels]

    tokenized_labels = tokenizer(
        text_target=labels_text,
        max_length=max_target_length,
        truncation=True,
        padding=False
    )["input_ids"]

    # 3️⃣ Sanitize token IDs (avoid out-of-range / empty labels)
    cleaned_labels = []
    for seq in tokenized_labels:
        seq = [tid if 0 <= tid < tokenizer.vocab_size else tokenizer.pad_token_id for tid in seq]
        if not seq:  # fallback if empty
            seq = [tokenizer.pad_token_id]
        cleaned_labels.append(seq)

    model_inputs["labels"] = cleaned_labels
    return model_inputs


In [None]:
# Cell 8 — Data collator and metrics
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# We'll track ROUGE (optional) and rely on trainer logs for loss
# rouge = evaluate.load("rouge")
from evaluate import load
metric = load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE metrics safely (avoid overflow)"""
    predictions, labels = eval_pred

    # Replace -100s in labels (they are ignored in loss computation)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Sanitize predictions: remove invalid token IDs
    valid_predictions = []
    for pred_seq in predictions:
        valid_seq = [tid for tid in pred_seq if 0 <= tid < tokenizer.vocab_size]
        valid_predictions.append(valid_seq)

    # Decode
    decoded_preds = tokenizer.batch_decode(valid_predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects newlines between sentences
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Add average generation length
    prediction_lens = [len(seq) for seq in valid_predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return result



In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir             = './new_model_v1',
    num_train_epochs       = 5,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size  = 2,
    learning_rate          = 3e-5,
    logging_dir            = './model_logs',
    logging_steps          = 50,
    eval_strategy          = "epoch",
    save_strategy          = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model  = "eval_loss",
    predict_with_generate  = True,
    generation_max_length  = 1000, # Keep this as it was before
    generation_num_beams   = 1,  # Set to 1 for greedy decoding during evaluation
    report_to              = "none",
    fp16                   = torch.cuda.is_available()
)

In [None]:
# Cell 10 — Trainer init & resume detection
from transformers import Seq2SeqTrainer

# find latest checkpoint if exists
def latest_checkpoint(out_dir):
    ckpts = sorted([d for d in glob.glob(os.path.join(out_dir, "checkpoint-*"))])
    return ckpts[-1] if ckpts else None

ckpt = latest_checkpoint(OUTPUT_DIR)
if ckpt:
    print("Resuming from checkpoint:", ckpt)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Cell 11 — Start training (this will resume if checkpoint exists)
trainer.train(resume_from_checkpoint=ckpt)
# Save final model
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Training finished. Model saved to:", OUTPUT_DIR)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0,,7.9388,5.1286,7.3549,7.7992,304.933333
2,0.0,,7.9388,5.1286,7.3549,7.7992,304.933333


KeyboardInterrupt: 

In [None]:
# mymodeltraining.py

!pip install -q datasets evaluate transformers rouge-score nltk torch

import os, json
import numpy as np
import nltk
import torch
from datasets import Dataset, DatasetDict
from evaluate import load
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

nltk.download("punkt")

# ----------------------------
# LOAD DATASET
# ----------------------------
def load_jsonl_dataset(file_path):
    """Load JSONL and flatten response dicts into text"""
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            item = json.loads(line.strip())
            instruction = item["instruction"]
            response = item["response"]

            # Convert response dict → string
            if isinstance(response, dict):
                response_text = ""
                for k, v in response.items():
                    if isinstance(v, list):
                        response_text += f"{k.replace('_',' ').title()}: {'; '.join(map(str,v))}. "
                    else:
                        response_text += f"{k.replace('_',' ').title()}: {v}. "
                response = response_text.strip()
            elif isinstance(response, list):
                response = "; ".join(map(str, response))
            else:
                response = str(response)

            data.append({"instruction": instruction, "response": response})
    return data

dataset_path = "/content/cleaned_dataset.jsonl"  # <<-- update path
raw_data = load_jsonl_dataset(dataset_path)

dataset = Dataset.from_list(raw_data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict({"train": dataset["train"], "validation": dataset["test"]})

print("Sample:", dataset_dict["train"][0])

# ----------------------------
# MODEL + TOKENIZER
# ----------------------------
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

prefix = "so user query is, respond to it: "
max_input_length = 1000
max_target_length = 1000

# ----------------------------
# PREPROCESSING FUNCTION
# ----------------------------
def preprocess_function(examples):
    inputs = [prefix + instr for instr in examples["instruction"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding=False,
    )

    raw_labels = examples["response"]

    def safe_str(x):
        if isinstance(x, dict):
            return json.dumps(x, ensure_ascii=False)
        if isinstance(x, list):
            return "; ".join(map(str, x))
        return str(x)

    labels_text = [safe_str(r) for r in raw_labels]

    labels = tokenizer(
        text_target=labels_text,
        max_length=max_target_length,
        truncation=True,
        padding=False,
    )["input_ids"]

    # Sanitize token IDs
    cleaned_labels = []
    for seq in labels:
        seq = [tid if 0 <= tid < tokenizer.vocab_size else tokenizer.pad_token_id for tid in seq]
        if not seq:
            seq = [tokenizer.pad_token_id]
        cleaned_labels.append(seq)

    model_inputs["labels"] = cleaned_labels
    return model_inputs

tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
)

print("Tokenized sample:", tokenized_datasets["train"][0])

# ----------------------------
# TRAINING CONFIG
# ----------------------------
OUTPUT_DIR = "./disaster_t5_model"

# training_args = Seq2SeqTrainingArguments(
#     output_dir=OUTPUT_DIR,
#     num_train_epochs=5,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     learning_rate=5e-5,
#     logging_dir=f"{OUTPUT_DIR}/logs",
#     logging_steps=50,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     predict_with_generate=True,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     generation_max_length=256,
#     report_to="none",
#     fp16=torch.cuda.is_available(),
# )
training_args = Seq2SeqTrainingArguments(
    output_dir             = './new_model_v1',
    num_train_epochs       = 5,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size  = 2,
    learning_rate          = 3e-5,
    logging_dir            = './model_logs',
    logging_steps          = 50,
    eval_strategy          = "epoch",
    save_strategy          = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model  = "eval_loss",
    predict_with_generate  = True,
    generation_max_length  = 1000, # Keep this as it was before
    generation_num_beams   = 1,  # Set to 1 for greedy decoding during evaluation
    report_to              = "none",
    fp16                   = torch.cuda.is_available()
)

# ----------------------------
# DATA COLLATOR
# ----------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ----------------------------
# METRICS
# ----------------------------
metric = load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    result["gen_len"] = np.mean([len(p.split()) for p in decoded_preds])
    return result

# ----------------------------
# TRAINER
# ----------------------------
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ----------------------------
# TRAINING
# ----------------------------
ckpt = None  # or set to checkpoint path if resuming
trainer.train(resume_from_checkpoint=ckpt)

# Save final model
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("✅ Training completed. Model saved at:", OUTPUT_DIR)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Sample: {'instruction': 'Flood emergency in Imphal, Manipur near Paona Bazar (795001)! People need urgent help!', 'response': 'Situation Assessment: Flood reported in Imphal, Manipur near Paona Bazar (795001). Flooding covers 48.9 km² with anomaly 24.1%.. Immediate Actions: Alert issued by Police Station.; Mobilize local teams via NH15.. Evacuation And Shelter: Guide residents to School Shelter.; Keep Main Road open for rescue operations.. Medical And Critical Infra: Civil Hospital activated for triage.; Police Station coordinating response.. Long Term Strategies: Enhance GIS-based monitoring systems.; Strengthen local infrastructure for resilience..'}


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Tokenized sample: {'input_ids': [78, 1139, 11417, 19, 6, 3531, 12, 34, 10, 28391, 3583, 16, 1318, 21367, 6, 1140, 23, 3791, 1084, 2709, 106, 9, 2659, 7061, 41, 4440, 2560, 6982, 55, 2449, 174, 10839, 199, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [15494, 15186, 10, 28391, 2196, 16, 1318, 21367, 6, 1140, 23, 3791, 1084, 2709, 106, 9, 2659, 7061, 41, 4440, 2560, 13883, 28391, 53, 3792, 4678, 5, 1298, 2280, 357, 28, 23236, 63, 997, 5, 4704, 5, 5, 1318, 5700, 342, 6776, 7, 10, 23951, 4683, 57, 5076, 5939, 5, 117, 1290, 3727, 1737, 415, 2323, 1009, 3, 15743, 1808, 5, 5, 17627, 1071, 257, 275, 30415, 10, 4637, 2797, 12, 1121, 30415, 5, 117, 3521, 5140, 2409, 539, 21, 9635, 2673, 5, 5, 3721, 275, 23208, 86, 89, 52, 9, 10, 7707, 4457, 8195, 26, 21, 6467, 545, 5, 117, 5076, 5939, 3, 24232, 1773, 5, 5, 3230, 3, 11679, 28026, 10, 27190, 15, 350, 4555, 18, 390, 4891, 1002, 5, 117, 25243, 35, 415, 3

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss


OverflowError: out of range integral type conversion attempted

In [None]:
!pip install -q datasets evaluate transformers rouge-score nltk torch

import os, json
import numpy as np
import nltk
import torch
from datasets import Dataset, DatasetDict
from evaluate import load
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

nltk.download("punkt")

# ----------------------------
# LOAD DATASET
# ----------------------------
def load_jsonl_dataset(file_path):
    """Load JSONL and flatten response dicts into text"""
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            item = json.loads(line.strip())
            instruction = item["instruction"]
            response = item["response"]

            # Convert response dict → string
            if isinstance(response, dict):
                response_text = ""
                for k, v in response.items():
                    if isinstance(v, list):
                        response_text += f"{k.replace('_',' ').title()}: {'; '.join(map(str,v))}. "
                    else:
                        response_text += f"{k.replace('_',' ').title()}: {v}. "
                response = response_text.strip()
            elif isinstance(response, list):
                response = "; ".join(map(str, response))
            else:
                response = str(response)

            data.append({"instruction": instruction, "response": response})
    return data

dataset_path = "./cleaned_dataset.jsonl"  # <-- update path
raw_data = load_jsonl_dataset(dataset_path)

dataset = Dataset.from_list(raw_data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict({
    "train": dataset["train"],
    "validation": dataset["test"]
})

print("Sample:", dataset_dict["train"][0])

# ----------------------------
# MODEL + TOKENIZER
# ----------------------------
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

prefix = "so user query is, respond to it: "
max_input_length = 512
max_target_length = 256

# ----------------------------
# PREPROCESSING FUNCTION
# ----------------------------
def preprocess_function(examples):
    inputs = [prefix + instr for instr in examples["instruction"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding=False,
    )

    raw_labels = examples["response"]

    def safe_str(x):
        if isinstance(x, dict):
            return json.dumps(x, ensure_ascii=False)
        if isinstance(x, list):
            return "; ".join(map(str, x))
        return str(x)

    labels_text = [safe_str(r) for r in raw_labels]
    labels = tokenizer(
        text_target=labels_text,
        max_length=max_target_length,
        truncation=True,
        padding=False,
    )["input_ids"]

    # Sanitize token IDs
    cleaned_labels = []
    for seq in labels:
        seq = [tid if 0 <= tid < tokenizer.vocab_size else tokenizer.pad_token_id for tid in seq]
        if not seq:
            seq = [tokenizer.pad_token_id]
        cleaned_labels.append(seq)

    model_inputs["labels"] = cleaned_labels
    return model_inputs

tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
)

print("Tokenized sample:", tokenized_datasets["train"][0])

# ----------------------------
# TRAINING CONFIG
# ----------------------------
OUTPUT_DIR = "./disaster_t5_model"

training_args = Seq2SeqTrainingArguments(
    output_dir             = './OUTPUT_DIR',
    num_train_epochs       = 5,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size  = 2,
    learning_rate          = 3e-5,
    logging_dir            = './model_logs',
    logging_steps          = 50,
    eval_strategy          = "epoch",
    save_strategy          = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model  = "eval_loss",
    predict_with_generate  = True,
    generation_max_length  = 1000, # Keep this as it was before
    generation_num_beams   = 1,  # Set to 1 for greedy decoding during evaluation
    report_to              = "none",
    fp16                   = torch.cuda.is_available()
)

# ----------------------------
# DATA COLLATOR
# ----------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ----------------------------
# METRICS
# ----------------------------
metric = load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Replace -100 with pad_token_id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # 🔹 Sanitize predictions to avoid OverflowError
    cleaned_preds = []
    for seq in predictions:
        cleaned_seq = [tid for tid in seq if 0 <= tid < tokenizer.vocab_size]
        if not cleaned_seq:
            cleaned_seq = [tokenizer.pad_token_id]
        cleaned_preds.append(cleaned_seq)

    # Decode
    decoded_preds = tokenizer.batch_decode(cleaned_preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    result["gen_len"] = np.mean([len(p.split()) for p in decoded_preds])

    return result

# ----------------------------
# TRAINER
# ----------------------------
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ----------------------------
# TRAINING
# ----------------------------
ckpt = None  # or path to checkpoint if resuming
trainer.train(resume_from_checkpoint=ckpt)

# Save final model
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("✅ Training completed. Model saved at:", OUTPUT_DIR)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sample: {'instruction': 'Flood emergency in Imphal, Manipur near Paona Bazar (795001)! People need urgent help!', 'response': 'Situation Assessment: Flood reported in Imphal, Manipur near Paona Bazar (795001). Flooding covers 48.9 km² with anomaly 24.1%.. Immediate Actions: Alert issued by Police Station.; Mobilize local teams via NH15.. Evacuation And Shelter: Guide residents to School Shelter.; Keep Main Road open for rescue operations.. Medical And Critical Infra: Civil Hospital activated for triage.; Police Station coordinating response.. Long Term Strategies: Enhance GIS-based monitoring systems.; Strengthen local infrastructure for resilience..'}


Map:   0%|          | 0/720 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Tokenized sample: {'input_ids': [78, 1139, 11417, 19, 6, 3531, 12, 34, 10, 28391, 3583, 16, 1318, 21367, 6, 1140, 23, 3791, 1084, 2709, 106, 9, 2659, 7061, 41, 4440, 2560, 6982, 55, 2449, 174, 10839, 199, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [15494, 15186, 10, 28391, 2196, 16, 1318, 21367, 6, 1140, 23, 3791, 1084, 2709, 106, 9, 2659, 7061, 41, 4440, 2560, 13883, 28391, 53, 3792, 4678, 5, 1298, 2280, 357, 28, 23236, 63, 997, 5, 4704, 5, 5, 1318, 5700, 342, 6776, 7, 10, 23951, 4683, 57, 5076, 5939, 5, 117, 1290, 3727, 1737, 415, 2323, 1009, 3, 15743, 1808, 5, 5, 17627, 1071, 257, 275, 30415, 10, 4637, 2797, 12, 1121, 30415, 5, 117, 3521, 5140, 2409, 539, 21, 9635, 2673, 5, 5, 3721, 275, 23208, 86, 89, 52, 9, 10, 7707, 4457, 8195, 26, 21, 6467, 545, 5, 117, 5076, 5939, 3, 24232, 1773, 5, 5, 3230, 3, 11679, 28026, 10, 27190, 15, 350, 4555, 18, 390, 4891, 1002, 5, 117, 25243, 35, 415, 3

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.5035,0.119925,83.3706,81.9703,83.4133,83.4128,68.15
2,0.1263,0.057805,97.0098,95.4516,96.9597,96.9723,73.972222
3,0.0846,0.054211,97.3182,95.4664,97.2232,97.2245,74.105556
4,0.0774,0.053138,97.4076,95.5843,97.2994,97.3067,74.105556
5,0.0773,0.052849,97.3852,95.5438,97.2826,97.2855,74.105556


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


✅ Training completed. Model saved at: ./disaster_t5_model


In [None]:
folder_to_download = '/content/disaster_t5_model'

# Zip the folder
!zip -r {folder_to_download}.zip {folder_to_download}

# Download the zipped folder
from google.colab import files
files.download(f'{folder_to_download}.zip')

  adding: content/disaster_t5_model/ (stored 0%)
  adding: content/disaster_t5_model/spiece.model (deflated 48%)
  adding: content/disaster_t5_model/model.safetensors (deflated 10%)
  adding: content/disaster_t5_model/config.json (deflated 63%)
  adding: content/disaster_t5_model/training_args.bin (deflated 54%)
  adding: content/disaster_t5_model/generation_config.json (deflated 28%)
  adding: content/disaster_t5_model/special_tokens_map.json (deflated 85%)
  adding: content/disaster_t5_model/tokenizer.json (deflated 74%)
  adding: content/disaster_t5_model/tokenizer_config.json (deflated 95%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
00-[-'/']i8i88nb nbgyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy