In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
os.chdir("/content/drive/MyDrive/dsma-crisismmd/")
!pwd

/content/drive/MyDrive/dsma-crisismmd


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import random
from PIL import Image


In [None]:
data = pd.read_csv("./data_image/california_wildfires_final_data.tsv", sep="\t")


In [None]:
import re

def preprocess_tweet_for_transformer(text):
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove mentions
    text = re.sub(r'@\w+', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove RT tag
    text = re.sub('rt : ', '', text)

    return text

# Apply minimal preprocessing to the tweet_text column
data['processed_tweet_text'] = data['tweet_text'].apply(lambda x: preprocess_tweet_for_transformer(str(x)))

# Display the original and processed tweets for comparison
print(data[['tweet_text', 'processed_tweet_text']].head())


                                          tweet_text  \
0  RT @Gizmodo: Wildfires raging through Northern...   
1  PHOTOS: Deadly wildfires rage in California ht...   
2  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
3  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
4  RT @TIME: California's raging wildfires as you...   

                                processed_tweet_text  
0  wildfires raging through northern california a...  
1        photos: deadly wildfires rage in california  
2  pls share: weâ€™re capturing wildfire response...  
3  pls share: weâ€™re capturing wildfire response...  
4  california's raging wildfires as you've never ...  


In [None]:
def assign_label(row):
    if row['text_info'] == 'informative' and row['image_info'] == 'informative':
        return 'Informative'
    elif (row['text_info'] == 'informative' and row['image_info'] != 'informative') or \
         (row['text_info'] != 'informative' and row['image_info'] == 'informative'):
        return 'Ambiguous'
    else:
        return 'Uninformative'


In [None]:
data['label'] = data.apply(assign_label, axis=1)

In [None]:
# load manually annotated data
df = pd.read_csv("./data_image/manually_annotated_data")
df['is_ambiguous'] = (df['label'] == "Ambiguous").astype(int)


In [None]:
df.to_csv("cleaned_data.csv")

# Load model for zero shot test

In [None]:
!pip install transformers torchvision pillow

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchvision)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86

In [None]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
def classify_tweet(tweet_text, image_path):
    # Candidate labels (can be tuned for prompt engineering)
    candidate_labels = [
        "This tweet provides useful crisis information",
        "This tweet is not informative"
    ]

    # Load and preprocess image
    image = Image.open(image_path).convert("RGB")

    # Prepare inputs for CLIP
    inputs = processor(
        text=candidate_labels,
        images=image,
        return_tensors="pt",
        padding=True
    ).to(device)

    # Run model
    with torch.no_grad():
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # shape (1, num_labels)
        probs = logits_per_image.softmax(dim=1).squeeze()

    # Print results
    for label, prob in zip(candidate_labels, probs):
        print(f"{label}: {prob:.4f}")

    predicted = candidate_labels[probs.argmax()]
    print(f"\nPredicted: {predicted}")
    return predicted


In [None]:
df = pd.read_csv("cleaned_data.csv", index_col=0)

In [None]:
df

Unnamed: 0,tweet_id,image_id,text_info,image_info,tweet_text,image_path,processed_tweet_text,label,manual_label,is_ambiguous
0,917791044158185473,917791044158185473_0,informative,informative,RT @Gizmodo: Wildfires raging through Northern...,data_image/california_wildfires/10_10_2017/917...,wildfires raging through northern california a...,Informative,informative,0
1,917791130590183424,917791130590183424_0,informative,informative,PHOTOS: Deadly wildfires rage in California ht...,data_image/california_wildfires/10_10_2017/917...,photos: deadly wildfires rage in california,Informative,informative,0
2,917791291823591425,917791291823591425_0,informative,informative,RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...,data_image/california_wildfires/10_10_2017/917...,pls share: weâ€™re capturing wildfire response...,Informative,informative,0
3,917791291823591425,917791291823591425_1,informative,not_informative,RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...,data_image/california_wildfires/10_10_2017/917...,pls share: weâ€™re capturing wildfire response...,Ambiguous,informative,1
4,917792092100988929,917792092100988929_0,informative,informative,RT @TIME: California's raging wildfires as you...,data_image/california_wildfires/10_10_2017/917...,california's raging wildfires as you've never ...,Informative,informative,0
...,...,...,...,...,...,...,...,...,...,...
1584,923761170967797761,923761170967797761_0,informative,informative,GOP targets environmental rules after Californ...,data_image/california_wildfires/27_10_2017/923...,gop targets environmental rules after californ...,Informative,informative,0
1585,923796193670336512,923796193670336512_0,informative,not_informative,Motorcycle crash sparked 350-acre fire in Clev...,data_image/california_wildfires/27_10_2017/923...,motorcycle crash sparked 350-acre fire in clev...,Ambiguous,informative,1
1586,923821955568013313,923821955568013313_0,not_informative,informative,Carlâ€™s Jr. #SantaRosa catches #Fire while ma...,data_image/california_wildfires/27_10_2017/923...,carlâ€™s jr. #santarosa catches #fire while ma...,Ambiguous,informative,1
1587,923844053426348032,923844053426348032_0,informative,not_informative,Inside the List: The Romance Writer Who Almost...,data_image/california_wildfires/27_10_2017/923...,inside the list: the romance writer who almost...,Ambiguous,informative,1


In [None]:
ambi_df = df[df['is_ambiguous'] == 1]

In [None]:
# tweet_text and
# image_path
# manual_label

first_row = df.iloc[0]
tweet_text = first_row['processed_tweet_text']
image_path = first_row['image_path']
manual_label = first_row['label'] # Assuming 'label' column contains the manual label
print(f"Tweet Text: {tweet_text}")
print(f"Image Path: {image_path}")
print(f"Manual Label: {manual_label}")

# Run classification
classify_tweet(tweet_text, image_path)


Tweet Text: wildfires raging through northern california are terrifying
Image Path: data_image/california_wildfires/10_10_2017/917791044158185473_0.jpg
Manual Label: Informative
This tweet provides useful crisis information: 0.9060
This tweet is not informative: 0.0940

Predicted: This tweet provides useful crisis information


'This tweet provides useful crisis information'

In [None]:
# tweet_text and
# image_path
# manual_label

first_row = ambi_df.iloc[0]
tweet_text = first_row['processed_tweet_text']
image_path = first_row['image_path']
first_label = first_row['label']
manual_label = first_row['manual_label'] # Assuming 'label' column contains the manual label
print(f"Tweet Text: {tweet_text}")
print(f"Image Path: {image_path}")
print(f"First Label: {first_label}")
print(f"Manual Label: {manual_label}")

# Run classification
classify_tweet(tweet_text, image_path)

Tweet Text: pls share: weâ€™re capturing wildfire response, recovery info here:
Image Path: data_image/california_wildfires/10_10_2017/917791291823591425_1.jpg
First Label: Ambiguous
Manual Label: informative
This tweet provides useful crisis information: 0.9650
This tweet is not informative: 0.0350

Predicted: This tweet provides useful crisis information


'This tweet provides useful crisis information'

# Batch inference

In [None]:
# Candidate labels for zero-shot classification
candidate_labels = [
        "This tweet provides useful crisis information",
        "This tweet is not informative"
    ]

In [None]:
def classify_row(tweet_text, image_path, model, processor, candidate_labels):
    try:
        image = Image.open(image_path).convert("RGB")
    except Exception as e:
        print(f"Failed to open image at {image_path}: {e}")
        return None, [0.0, 0.0]

    inputs = processor(
        text=candidate_labels,
        images=image,
        return_tensors="pt",
        padding=True
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits_per_image
        probs = logits.softmax(dim=1).squeeze().cpu().tolist()

    predicted_idx = int(torch.tensor(probs).argmax())
    predicted_label = candidate_labels[predicted_idx]
    return predicted_label, probs


# All data

In [None]:
# Add new columns to store predictions
df['clip_predicted_label'] = ''
df['clip_confidence_informative'] = 0.0
df['clip_confidence_not_informative'] = 0.0

for idx, row in df.iterrows():
    tweet_text = row['processed_tweet_text']
    image_path = row['image_path']

    predicted_label, probs = classify_row(tweet_text, image_path, model, processor, candidate_labels)

    df.at[idx, 'clip_predicted_label'] = predicted_label
    df.at[idx, 'clip_confidence_informative'] = probs[0]
    df.at[idx, 'clip_confidence_not_informative'] = probs[1]

    if idx % 50 == 0:
        print(f"Processed {idx} rows")


Processed 0 rows
Processed 50 rows
Processed 100 rows
Processed 150 rows
Processed 200 rows
Processed 250 rows
Processed 300 rows
Processed 350 rows
Processed 400 rows
Processed 450 rows
Processed 500 rows
Processed 550 rows
Processed 600 rows
Processed 650 rows
Processed 700 rows
Processed 750 rows
Processed 800 rows
Processed 850 rows
Processed 900 rows
Processed 950 rows
Processed 1000 rows
Processed 1050 rows
Processed 1100 rows
Processed 1150 rows
Processed 1200 rows
Processed 1250 rows
Processed 1300 rows
Processed 1350 rows
Processed 1400 rows
Processed 1450 rows
Processed 1500 rows
Processed 1550 rows


In [None]:
def simplify(label):
    return "informative" if "useful" in label.lower() else "not_informative"

df['clip_predicted_clean'] = df['clip_predicted_label'].apply(simplify)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Convert ground truth and predicted labels to match
y_true = df['manual_label'].apply(lambda x: x.lower().strip())
y_pred = df['clip_predicted_clean'].apply(lambda x: x.lower().strip())

# Print classification report
print("📊 Classification Report:\n")
print(classification_report(y_true, y_pred, target_names=["informative", "not informative"]))

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred, labels=["informative", "not informative"])
print("🧮 Confusion Matrix:\n")
print(cm)


📊 Classification Report:

                 precision    recall  f1-score   support

    informative       0.89      0.85      0.87      1303
not informative       0.43      0.50      0.46       286

       accuracy                           0.79      1589
      macro avg       0.66      0.68      0.67      1589
   weighted avg       0.80      0.79      0.80      1589

🧮 Confusion Matrix:

[[1112    0]
 [   0    0]]


In [None]:
y_true.value_counts()

Unnamed: 0_level_0,count
manual_label,Unnamed: 1_level_1
informative,1303
not_informative,286


In [None]:
y_pred.value_counts()

Unnamed: 0_level_0,count
clip_predicted_clean,Unnamed: 1_level_1
informative,1254
not informative,335


# Batch inference for ambiguous only

In [None]:
# Add new columns to store predictions
ambi_df['clip_predicted_label'] = ''
ambi_df['clip_confidence_informative'] = 0.0
ambi_df['clip_confidence_not_informative'] = 0.0

for idx, row in ambi_df.iterrows():
    tweet_text = row['processed_tweet_text']
    image_path = row['image_path']

    predicted_label, probs = classify_row(tweet_text, image_path, model, processor, candidate_labels)

    ambi_df.at[idx, 'clip_predicted_label'] = predicted_label
    ambi_df.at[idx, 'clip_confidence_informative'] = probs[0]
    ambi_df.at[idx, 'clip_confidence_not_informative'] = probs[1]

    if idx % 50 == 0:
        print(f"Processed {idx} rows")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ambi_df['clip_predicted_label'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ambi_df['clip_confidence_informative'] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ambi_df['clip_confidence_not_informative'] = 0.0


Processed 100 rows
Processed 500 rows
Processed 650 rows
Processed 850 rows
Processed 1000 rows
Processed 1200 rows
Processed 1250 rows
Processed 1350 rows
Processed 1550 rows


In [None]:
ambi_df

Unnamed: 0,tweet_id,image_id,text_info,image_info,tweet_text,image_path,processed_tweet_text,label,manual_label,is_ambiguous,clip_predicted_label,clip_confidence_informative,clip_confidence_not_informative
3,917791291823591425,917791291823591425_1,informative,not_informative,RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...,data_image/california_wildfires/10_10_2017/917...,pls share: weâ€™re capturing wildfire response...,Ambiguous,informative,1,This tweet provides useful crisis information,0.964986,0.035014
10,917793158251077632,917793158251077632_0,informative,not_informative,RT @FoxNews: Southern California fire shrouds ...,data_image/california_wildfires/10_10_2017/917...,southern california fire shrouds disneyland an...,Ambiguous,informative,1,This tweet provides useful crisis information,0.661033,0.338967
31,917804481970102272,917804481970102272_0,informative,not_informative,RT @GrantJKidney: Divine judgement? Deadly Cal...,data_image/california_wildfires/10_10_2017/917...,divine judgement? deadly california wildfires ...,Ambiguous,informative,1,This tweet provides useful crisis information,0.663265,0.336735
32,917804966823129093,917804966823129093_0,informative,not_informative,NASA satellites capture breadth of Northern Ca...,data_image/california_wildfires/10_10_2017/917...,nasa satellites capture breadth of northern ca...,Ambiguous,informative,1,This tweet provides useful crisis information,0.793848,0.206152
34,917805494504443904,917805494504443904_0,informative,not_informative,Major disaster declaration approved for the st...,data_image/california_wildfires/10_10_2017/917...,major disaster declaration approved for the st...,Ambiguous,informative,1,This tweet provides useful crisis information,0.920794,0.079206
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1578,923725791086526466,923725791086526466_0,informative,not_informative,Go to https://t.co/MDFk59X16v and grab a shirt...,data_image/california_wildfires/27_10_2017/923...,go to and grab a shirt to support people affec...,Ambiguous,informative,1,This tweet is not informative,0.255476,0.744524
1579,923725791086526466,923725791086526466_1,informative,not_informative,Go to https://t.co/MDFk59X16v and grab a shirt...,data_image/california_wildfires/27_10_2017/923...,go to and grab a shirt to support people affec...,Ambiguous,informative,1,This tweet provides useful crisis information,0.518882,0.481118
1585,923796193670336512,923796193670336512_0,informative,not_informative,Motorcycle crash sparked 350-acre fire in Clev...,data_image/california_wildfires/27_10_2017/923...,motorcycle crash sparked 350-acre fire in clev...,Ambiguous,informative,1,This tweet provides useful crisis information,0.823565,0.176435
1586,923821955568013313,923821955568013313_0,not_informative,informative,Carlâ€™s Jr. #SantaRosa catches #Fire while ma...,data_image/california_wildfires/27_10_2017/923...,carlâ€™s jr. #santarosa catches #fire while ma...,Ambiguous,informative,1,This tweet provides useful crisis information,0.710866,0.289134


In [None]:
ambi_df['clip_predicted_label'].value_counts()

Unnamed: 0_level_0,count
clip_predicted_label,Unnamed: 1_level_1
This tweet provides useful crisis information,229
This tweet is not informative,155


In [None]:
def simplify(label):
    return "informative" if "useful" in label.lower() else "not_informative"

ambi_df['clip_predicted_clean'] = ambi_df['clip_predicted_label'].apply(simplify)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ambi_df['clip_predicted_clean'] = ambi_df['clip_predicted_label'].apply(simplify)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Convert ground truth and predicted labels to match
y_true = ambi_df['manual_label'].apply(lambda x: x.lower().strip())
y_pred = ambi_df['clip_predicted_clean'].apply(lambda x: x.lower().strip())

# Print classification report
print("📊 Classification Report:\n")
print(classification_report(y_true, y_pred, target_names=["informative", "not informative"]))


📊 Classification Report:

                 precision    recall  f1-score   support

    informative       0.99      0.59      0.74       380
not informative       0.01      0.25      0.01         4

       accuracy                           0.59       384
      macro avg       0.50      0.42      0.38       384
   weighted avg       0.98      0.59      0.73       384



In [None]:
cm = confusion_matrix(y_true, y_pred, labels=["informative", "not_informative"])
print("🧮 Confusion Matrix:\n")
print(cm)


🧮 Confusion Matrix:

[[226 154]
 [  3   1]]
