In [2]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast

In [3]:
#****Testing if it works despite the pip warnings ****
import torch
from transformers import DistilBertTokenizerFast, DistilBertModel

print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
print("DistilBERT loaded ✅")


2025-07-01 09:13:26.551912: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751361206.746603      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751361206.805195      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Torch: 2.6.0+cu124
CUDA available: True


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBERT loaded ✅


In [5]:
# Reimport if needed
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Re-create crisis report DataFrame
data = {
    "text": [
        "Elderly man collapsed in Noida clinic.",
        "Need urgent food in flood relief camp in Guwahati.",
        "Rescue needed for family stuck in waterlogged building.",
        "Storm destroyed homes. Need shelter in Cuttack.",
        "Medical emergency: woman fainted during heatwave.",
        "Children stranded without food in Chennai slums.",
        "People stuck on rooftop in Assam.",
        "No medicine available in clinic near Lajpat Nagar.",
        "Request for shelter after cyclone hit Orissa.",
        "Need drinking water and food packets in Bihar village.",
        "Collapsed house in landslide — people injured.",
        "Camp needs volunteers for elderly care.",
        "Overcrowded shelter in Bhopal — need mattresses.",
        "Power and water outage in rain-affected area.",
        "Medical staff needed in rural health center.",
        "Flood victims need cooked meals in Assam.",
        "Doctors required in mobile ambulance unit.",
        "Families cold, without blankets in shelter zone.",
        "Urgent food supply needed in Kolkata outskirts.",
        "Ambulance stuck in traffic, patient critical."
    ],
    "type": [
        "Medical", "Food", "Rescue", "Shelter", "Medical",
        "Food", "Rescue", "Medical", "Shelter", "Food",
        "Rescue", "Medical", "Shelter", "Other", "Medical",
        "Food", "Medical", "Shelter", "Food", "Medical"
    ],
    "urgency": [
        "High", "High", "High", "Medium", "High",
        "High", "High", "Medium", "Medium", "Medium",
        "High", "Medium", "Low", "Low", "Medium",
        "Medium", "High", "Medium", "Medium", "High"
    ]
}

df = pd.DataFrame(data)

# Apply label encoding
type_encoder = LabelEncoder()
urgency_encoder = LabelEncoder()

df["type_encoded"] = type_encoder.fit_transform(df["type"])
df["urgency_encoded"] = urgency_encoder.fit_transform(df["urgency"])

# Confirm it's fixed
print("✅ Encoded columns created!")
print(df.head())


✅ Encoded columns created!
                                                text     type urgency  \
0             Elderly man collapsed in Noida clinic.  Medical    High   
1  Need urgent food in flood relief camp in Guwah...     Food    High   
2  Rescue needed for family stuck in waterlogged ...   Rescue    High   
3    Storm destroyed homes. Need shelter in Cuttack.  Shelter  Medium   
4  Medical emergency: woman fainted during heatwave.  Medical    High   

   type_encoded  urgency_encoded  
0             1                0  
1             0                0  
2             3                0  
3             4                2  
4             1                0  


In [6]:
from torch.utils.data import Dataset
import torch

class CrisisDataset(Dataset):
    def __init__(self, texts, type_labels, urgency_labels, tokenizer):
        self.encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
        self.labels_type = torch.tensor(type_labels)
        self.labels_urgency = torch.tensor(urgency_labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels_type"] = self.labels_type[idx]
        item["labels_urgency"] = self.labels_urgency[idx]
        return item

    def __len__(self):
        return len(self.labels_type)


In [7]:
# Recreate dataset
dataset = CrisisDataset(
    texts=df["text"].tolist(),
    type_labels=df["type_encoded"].tolist(),
    urgency_labels=df["urgency_encoded"].tolist(),
    tokenizer=tokenizer
)

# Recreate dataloader
from torch.utils.data import DataLoader
loader = DataLoader(dataset, batch_size=4, shuffle=True)

In [8]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
import torch.nn as nn
from transformers import DistilBertModel

# Rebuild the classifier model if needed
class CrisisClassifier(nn.Module):
    def __init__(self, num_types, num_urgencies):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.type_head = nn.Linear(self.bert.config.hidden_size, num_types)
        self.urgency_head = nn.Linear(self.bert.config.hidden_size, num_urgencies)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(output.last_hidden_state[:, 0])
        return self.type_head(pooled), self.urgency_head(pooled)

# Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model
model = CrisisClassifier(
    num_types=len(type_encoder.classes_),
    num_urgencies=len(urgency_encoder.classes_)
).to(device)

# Optimizer and loss
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()

# Training loop
for epoch in range(150):
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels_type = batch["labels_type"].to(device)
        labels_urgency = batch["labels_urgency"].to(device)

        out_type, out_urgency = model(input_ids, attention_mask)
        loss_type = loss_fn(out_type, labels_type)
        loss_urgency = loss_fn(out_urgency, labels_urgency)
        loss = loss_type + loss_urgency

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1} — Loss: {total_loss:.4f}")


Epoch 1 — Loss: 13.2671
Epoch 2 — Loss: 12.2461
Epoch 3 — Loss: 10.9518
Epoch 4 — Loss: 9.9822
Epoch 5 — Loss: 8.8285
Epoch 6 — Loss: 7.2396
Epoch 7 — Loss: 5.9548
Epoch 8 — Loss: 4.3966
Epoch 9 — Loss: 3.7855
Epoch 10 — Loss: 2.9365
Epoch 11 — Loss: 2.3064
Epoch 12 — Loss: 1.8674
Epoch 13 — Loss: 1.5039
Epoch 14 — Loss: 1.2751
Epoch 15 — Loss: 1.0021
Epoch 16 — Loss: 0.9229
Epoch 17 — Loss: 0.7505
Epoch 18 — Loss: 0.7112
Epoch 19 — Loss: 0.5918
Epoch 20 — Loss: 0.5447
Epoch 21 — Loss: 0.4532
Epoch 22 — Loss: 0.5115
Epoch 23 — Loss: 0.3894
Epoch 24 — Loss: 0.3648
Epoch 25 — Loss: 0.3231
Epoch 26 — Loss: 0.3548
Epoch 27 — Loss: 0.3577
Epoch 28 — Loss: 0.3120
Epoch 29 — Loss: 0.2672
Epoch 30 — Loss: 0.2457
Epoch 31 — Loss: 0.2616
Epoch 32 — Loss: 0.2569
Epoch 33 — Loss: 0.2145
Epoch 34 — Loss: 0.2034
Epoch 35 — Loss: 0.2147
Epoch 36 — Loss: 0.2035
Epoch 37 — Loss: 0.1992
Epoch 38 — Loss: 0.1732
Epoch 39 — Loss: 0.1824
Epoch 40 — Loss: 0.1827
Epoch 41 — Loss: 0.1807
Epoch 42 — Loss: 0.156

In [9]:
# 🔍 Sample unseen crisis reports
def classify_crisis(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    with torch.no_grad():
        out_type, out_urgency = model(**inputs)
        type_pred = torch.argmax(out_type, dim=1).item()
        urgency_pred = torch.argmax(out_urgency, dim=1).item()

    return {
        "text": text,
        "type": type_encoder.inverse_transform([type_pred])[0],
        "urgency": urgency_encoder.inverse_transform([urgency_pred])[0]
    }


new_reports = [
    "Flood waters rising in Siliguri, 3 families trapped on rooftop.",
    "Urgent requirement of medical aid for collapsed patient in Kanpur.",
    "Children hungry, no food for 2 days in relief shelter, Bihar.",
    "Need shelter urgently after heavy rains in Meghalaya.",
    "Water pipes broken, entire slum has no clean drinking water.",
    "Food packets needed in Malda. People haven’t eaten since 2 days.",
    "Flood rescue team needed in Aligarh. Locals stranded.",
    "Doctor unavailable in community clinic. Multiple patients fainted.",
    "Request for blankets and warm clothes in hilly areas of Shimla.",
    "Family of 4 needs urgent evacuation from waterlogged home in Cuttack."
]

# Batch classify and collect results
results = []

for report in new_reports:
    res = classify_crisis(report)
    results.append(res)

# Convert to DataFrame
df_preds = pd.DataFrame(results)
df_preds


Unnamed: 0,text,type,urgency
0,"Flood waters rising in Siliguri, 3 families tr...",Rescue,High
1,Urgent requirement of medical aid for collapse...,Medical,High
2,"Children hungry, no food for 2 days in relief ...",Food,Medium
3,Need shelter urgently after heavy rains in Meg...,Shelter,Medium
4,"Water pipes broken, entire slum has no clean d...",Medical,Medium
5,Food packets needed in Malda. People haven’t e...,Food,Medium
6,Flood rescue team needed in Aligarh. Locals st...,Food,High
7,Doctor unavailable in community clinic. Multip...,Medical,High
8,Request for blankets and warm clothes in hilly...,Shelter,Medium
9,Family of 4 needs urgent evacuation from water...,Medical,High


**Geolocation**

In [10]:
!pip install -q geopy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
from geopy.geocoders import Nominatim
import time

geolocator = Nominatim(user_agent="crisiscompass")

def get_coordinates(location_text):
    try:
        location = geolocator.geocode(location_text, timeout=10)
        if location:
            return location.latitude, location.longitude
        else:
            return None, None
    except:
        return None, None

In [12]:
# Simple heuristic: extract last place-like word (manually or via LLM later)
df_preds["place"] = df_preds["text"].str.extract(r"in ([A-Z][a-zA-Z]+)")

# Apply geolocation
df_preds[["latitude", "longitude"]] = df_preds["place"].apply(lambda x: pd.Series(get_coordinates(x)))

df_preds


Unnamed: 0,text,type,urgency,place,latitude,longitude
0,"Flood waters rising in Siliguri, 3 families tr...",Rescue,High,Siliguri,26.716413,88.430992
1,Urgent requirement of medical aid for collapse...,Medical,High,Kanpur,26.460914,80.321759
2,"Children hungry, no food for 2 days in relief ...",Food,Medium,,34.220389,70.380031
3,Need shelter urgently after heavy rains in Meg...,Shelter,Medium,Meghalaya,25.537943,91.29991
4,"Water pipes broken, entire slum has no clean d...",Medical,Medium,,34.220389,70.380031
5,Food packets needed in Malda. People haven’t e...,Food,Medium,Malda,25.005745,88.139848
6,Flood rescue team needed in Aligarh. Locals st...,Food,High,Aligarh,27.876107,78.135815
7,Doctor unavailable in community clinic. Multip...,Medical,High,,34.220389,70.380031
8,Request for blankets and warm clothes in hilly...,Shelter,Medium,,34.220389,70.380031
9,Family of 4 needs urgent evacuation from water...,Medical,High,Cuttack,20.4686,85.8792


****RETRAINING ON NEW DATASET****

In [13]:
import pandas as pd

df_synthetic = pd.read_csv("/kaggle/input/1000-entries-synthetic-dataset/synthetic_crisis_1000.csv")

df_synthetic.head()


Unnamed: 0,text,type,urgency,location
0,Lack of basic medical services reported in Jha...,Medical,Medium,Jharkhand
1,"Local authorities in Goa report food issues, r...",Food,Medium,Goa
2,Urgent Medical assistance required in Gujarat.,Medical,Medium,Gujarat
3,Crisis alert: Shelter required in Madhya Prade...,Shelter,High,Madhya Pradesh
4,"Food emergency reported in Odisha, situation i...",Food,Medium,Odisha


In [14]:
from sklearn.preprocessing import LabelEncoder

type_encoder = LabelEncoder()
urgency_encoder = LabelEncoder()

df_synthetic["type_encoded"] = type_encoder.fit_transform(df_synthetic["type"])
df_synthetic["urgency_encoded"] = urgency_encoder.fit_transform(df_synthetic["urgency"])


In [15]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


In [16]:
import torch
from torch.utils.data import Dataset, DataLoader

class CrisisDataset(Dataset):
    def __init__(self, texts, type_labels, urgency_labels, tokenizer):
        self.encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
        self.labels_type = torch.tensor(type_labels)
        self.labels_urgency = torch.tensor(urgency_labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels_type"] = self.labels_type[idx]
        item["labels_urgency"] = self.labels_urgency[idx]
        return item

    def __len__(self):
        return len(self.labels_type)

dataset = CrisisDataset(
    texts=df_synthetic["text"].tolist(),
    type_labels=df_synthetic["type_encoded"].tolist(),
    urgency_labels=df_synthetic["urgency_encoded"].tolist(),
    tokenizer=tokenizer
)

loader = DataLoader(dataset, batch_size=8, shuffle=True)


In [17]:
import torch.nn as nn
from transformers import DistilBertModel
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

class CrisisClassifier(nn.Module):
    def __init__(self, num_types, num_urgencies):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.type_head = nn.Linear(self.bert.config.hidden_size, num_types)
        self.urgency_head = nn.Linear(self.bert.config.hidden_size, num_urgencies)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(output.last_hidden_state[:, 0])
        return self.type_head(pooled), self.urgency_head(pooled)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CrisisClassifier(
    num_types=len(type_encoder.classes_),
    num_urgencies=len(urgency_encoder.classes_)
).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()

for epoch in range(30):
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels_type = batch["labels_type"].to(device)
        labels_urgency = batch["labels_urgency"].to(device)

        out_type, out_urgency = model(input_ids, attention_mask)
        loss_type = loss_fn(out_type, labels_type)
        loss_urgency = loss_fn(out_urgency, labels_urgency)
        loss = loss_type + loss_urgency

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1} — Loss: {total_loss:.4f}")


Epoch 1 — Loss: 218.6211
Epoch 2 — Loss: 108.3425
Epoch 3 — Loss: 95.1508
Epoch 4 — Loss: 90.3366
Epoch 5 — Loss: 88.6875
Epoch 6 — Loss: 86.9046
Epoch 7 — Loss: 86.5311
Epoch 8 — Loss: 85.8152
Epoch 9 — Loss: 83.3219
Epoch 10 — Loss: 81.0290
Epoch 11 — Loss: 78.0140
Epoch 12 — Loss: 75.5215
Epoch 13 — Loss: 74.9160
Epoch 14 — Loss: 71.4621
Epoch 15 — Loss: 67.5391
Epoch 16 — Loss: 65.1883
Epoch 17 — Loss: 61.6045
Epoch 18 — Loss: 59.4062
Epoch 19 — Loss: 59.4905
Epoch 20 — Loss: 56.6347
Epoch 21 — Loss: 53.1783
Epoch 22 — Loss: 48.7062
Epoch 23 — Loss: 50.8290
Epoch 24 — Loss: 48.7917
Epoch 25 — Loss: 47.3531
Epoch 26 — Loss: 43.3067
Epoch 27 — Loss: 44.3194
Epoch 28 — Loss: 43.8195
Epoch 29 — Loss: 41.9629
Epoch 30 — Loss: 41.3309


# **FAIS similarity search**

In [18]:
!pip install -q sentence-transformers faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━

In [19]:
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
texts = df_synthetic["text"].tolist()

embeddings = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [21]:
import faiss
import numpy as np

dim = embeddings.shape[1]

# Create FAISS index
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

# Store original texts for easy retrieval
crisis_texts = df_synthetic["text"].tolist()


In [22]:
def search_similar_crises(query, top_k=5):
    query_vec = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(query_vec, top_k)

    print(f"\n🔎 Top {top_k} similar crisis reports for:\n\"{query}\"\n")
    for i in I[0]:
        print("•", crisis_texts[i])


In [25]:
search_similar_crises("Need urgent rescue in flood-hit delhi")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🔎 Top 5 similar crisis reports for:
"Need urgent rescue in flood-hit delhi"

• Urgent Rescue assistance required in Delhi.
• Rescue emergency reported in Delhi, situation is High.
• Delhi facing rescue shortage, urgency level: Medium.
• Volunteers needed for rescue support in Delhi.
• Urgent Rescue assistance required in Uttar Pradesh.


In [26]:
!pip install -q spacy geopy
!python -m spacy download en_core_web_sm


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [28]:
import spacy
from geopy.geocoders import Nominatim
import time

# Load SpaCy NER model
nlp = spacy.load("en_core_web_sm")
geolocator = Nominatim(user_agent="crisis-location-ner")

# Extract first GPE from text
def extract_location(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "GPE":
            return ent.text
    return None

# Geocode using Nominatim
def geocode_place(place):
    try:
        loc = geolocator.geocode(place + ", India", timeout=10)
        if loc:
            return loc.latitude, loc.longitude
        else:
            return None, None
    except:
        return None, None

# Full pipeline
def enrich_crisis_report(text):
    location = extract_location(text)
    lat, lng = geocode_place(location) if location else (None, None)
    time.sleep(1)  # Respect Nominatim's 1 sec limit

    return {
        "text": text,
        "location_extracted": location,
        "latitude": lat,
        "longitude": lng
    }


In [29]:
sample_texts = [
    "Flood in Siliguri has displaced many families.",
    "Urgent medical need in Aligarh.",
    "Children hungry in a village near Bhuj after cyclone.",
    "Rescue required in Dharavi, Mumbai.",
    "Snake bite in rural Nagaland, help needed."
]

enriched = [enrich_crisis_report(text) for text in sample_texts]

import pandas as pd
df_enriched = pd.DataFrame(enriched)
df_enriched


Unnamed: 0,text,location_extracted,latitude,longitude
0,Flood in Siliguri has displaced many families.,Siliguri,26.716413,88.430992
1,Urgent medical need in Aligarh.,Aligarh,27.876107,78.135815
2,Children hungry in a village near Bhuj after c...,Bhuj,23.247245,69.668339
3,"Rescue required in Dharavi, Mumbai.",Dharavi,19.044463,72.858618
4,"Snake bite in rural Nagaland, help needed.",Nagaland,26.163056,94.588491


In [32]:
# Save model weights (after training)
torch.save(model.state_dict(), "crisis_model.pt")

**INSTRUCTIONS FOR FINE TUNING OF LLM**

In [5]:
import json
import pandas as pd

df = pd.read_json("/kaggle/input/emergency-llm-dataset/Click here to download emergency_instructions_dataset.json")
with open("instruction_dataset.jsonl", "w") as f:
    for _, row in df.iterrows():
        json.dump({"input": row["input"], "output": row["output"]}, f)
        f.write("\n")

In [6]:
def format_prompt(example):
    return {
        "text": f"<s>[INST] {example['input']} [/INST] {example['output']}</s>"
    }


In [16]:
# Install dependencies
!pip install -q transformers datasets peft accelerate bitsandbytes

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

# Config
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
EPOCHS = 3
BATCH_SIZE = 4
LEARNING_RATE = 2e-4

# Load dataset
dataset = load_dataset("json", data_files="/kaggle/input/emergency-llm-dataset/Click here to download emergency_instructions_dataset.json")["train"]

# Format prompt for instruction-style models
def format_prompt(example):
    return {
        "text": f"<s>[INST] {example['input']} [/INST] {example['output']}</s>"
    }

dataset = dataset.map(format_prompt)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token  # safe padding

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    load_in_4bit=True,
    torch_dtype=torch.float16
)

# Prep for LoRA
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# Tokenize with labels (fixes missing loss issue)
def tokenize(example):
    encoding = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# Training setup
training_args = TrainingArguments(
    output_dir="./crisis_instruction_llama",
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Train!
trainer.train()

# Save final model
trainer.save_model("crisis-instruction-tinyllama-lora")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss
10,7.2953
20,0.6739
30,0.3249
40,0.2
50,0.1352
60,0.1065
70,0.0892
80,0.0871
90,0.0791
100,0.0751


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [22]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("crisis-instruction-tinyllama-lora", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "Crisis Type: Medical, Urgency: Low, Location: Goa, Contact: +91-832-1234567"

result = pipe(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)[0]["generated_text"]
print(result)


Device set to use cuda:0


Crisis Type: Medical, Urgency: Low, Location: Goa, Contact: +91-832-1234567 [/contact] Medical situation in Goa is currently under control. Contact: +91-832-1234567.


In [25]:
# Save the final fine-tuned model + tokenizer
save_path = "CrisisCompass/llm_model"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Model and tokenizer saved to: {save_path}")
`

('CrisisCompass/llm_model/tokenizer_config.json',
 'CrisisCompass/llm_model/special_tokens_map.json',
 'CrisisCompass/llm_model/tokenizer.model',
 'CrisisCompass/llm_model/added_tokens.json',
 'CrisisCompass/llm_model/tokenizer.json')

In [26]:
import shutil
shutil.make_archive("crisis_llm_model", 'zip', "CrisisCompass/llm_model")

'/kaggle/working/crisis_llm_model.zip'

# Final Pipeline

In [9]:
import json
import torch
import torch.nn as nn
import spacy
from geopy.geocoders import Nominatim
from transformers import AutoTokenizer, AutoModelForCausalLM, DistilBertTokenizer, pipeline
from transformers import DistilBertModel
import os

# ========== 0. Load Custom Crisis Classifier ==========
class CrisisClassifier(nn.Module):
    def __init__(self, num_types=5, num_urgencies=3):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.type_head = nn.Linear(self.bert.config.hidden_size, num_types)
        self.urgency_head = nn.Linear(self.bert.config.hidden_size, num_urgencies)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(output.last_hidden_state[:, 0])
        return self.type_head(pooled), self.urgency_head(pooled)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classifier_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

classifier_model_path = "/kaggle/input/triage-basic-gen/transformers/default/1/crisis_model.pt"
classifier_model = CrisisClassifier()
classifier_model.load_state_dict(torch.load(classifier_model_path, map_location=device))
classifier_model.to(device)
classifier_model.eval()

def classify_crisis(text):
    inputs = classifier_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    out_type, out_urgency = classifier_model(**inputs)
    type_idx = torch.argmax(out_type, dim=1).item()
    urgency_idx = torch.argmax(out_urgency, dim=1).item()
    type_labels = ["Medical", "Food", "Shelter", "Search & Rescue", "Infrastructure Damage"]
    urgency_labels = ["Low", "Medium", "High"]
    return type_labels[type_idx], urgency_labels[urgency_idx]

# ========== 1. Load Fine-tuned LLM ==========
LLM_PATH = "/kaggle/input/crisis-compass-instruction-llm"
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)
llm_model = AutoModelForCausalLM.from_pretrained(LLM_PATH)
llm_pipe = pipeline("text-generation", model=llm_model, tokenizer=llm_tokenizer, device=0 if torch.cuda.is_available() else -1)

# ========== 2. Named Entity Recognition ==========
nlp = spacy.load("en_core_web_sm")
def extract_location(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ["GPE", "LOC"]:
            return ent.text
    return "Unknown"

# ========== 3. Geolocation ==========
def geocode_location(location):
    try:
        loc = geolocator.geocode(location, addressdetails=True)
        if loc:
            lat = loc.latitude
            lon = loc.longitude
            full_address = loc.address
            address_dict = loc.raw.get("address", {})
            state = address_dict.get("state") or address_dict.get("region") or "Unknown"
            return lat, lon, full_address, state
    except:
        pass
    return None, None, "", "Unknown"

# ========== 4. Emergency Contact Mapping ==========
with open("/kaggle/input/emergency-contacts-per-state/emergency_contacts.json", "r") as f:
    contact_dict = json.load(f)

def get_contact(state):
    return contact_dict.get(state, "Not Available")

# ========== 5. Instruction Generation ==========
def generate_instruction(crisis_type, urgency, location, contact):
    prompt = f"<s>[INST] Crisis Type: {crisis_type}, Urgency: {urgency}, Location: {location}, Contact: {contact} [/INST]"
    result = llm_pipe(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)
    return result[0]["generated_text"].split("[/INST]")[-1].strip()

# ========== 6. Full Pipeline ==========
def crisis_pipeline(report_text):
    crisis_type, urgency = classify_crisis(report_text)
    location = extract_location(report_text)
    lat, lon, full_address, state = geocode_location(location)
    contact = get_contact(state)
    instruction = generate_instruction(crisis_type, urgency, location, contact)

    return {
        "crisis_type": crisis_type,
        "urgency": urgency,
        "location": location,
        "state": state,
        "latitude": lat,
        "longitude": lon,
        "contact": contact,
        "instruction": instruction
    }

# ========== 7. Example ==========
if __name__ == "__main__":
    report = "Massive flood near Bhagalpur. People are stranded on rooftops. Immediate help needed."
    result = crisis_pipeline(report)
    print(json.dumps(result, indent=2))


Device set to use cuda:0


{
  "crisis_type": "Search & Rescue",
  "urgency": "Medium",
  "location": "Bhagalpur",
  "state": "Bihar",
  "latitude": 25.2861354,
  "longitude": 87.1304229,
  "contact": "+91-612-11223344",
  "instruction": "Search & Rescue issue in Bhagalpur. Contact: +91-612-1122334."
}
