# Load Model

### Dependencies

In [56]:
import zipfile
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import json
import numpy as np
import pandas as pd

### Paths

In [40]:
# Location of the zipped model artefact.
zip_path = "yuenning_model/qwen_gemma_multilabel_final_3.zip"
# The archive is unpacked into a sibling directory named after the zip file
# (same path with the ".zip" extension removed).
extract_path = os.path.splitext(zip_path)[0]
# JSON file holding one decision threshold per label.
thresholds_path = "yuenning_model/multilabel_thresholds.json"

### 1. Unzip Model

In [3]:
# Unpack the archive only once: if the target directory is already present,
# the extraction step is skipped so re-running this cell is cheap.
if os.path.exists(extract_path):
    print("Model already extracted at:", extract_path)
else:
    with zipfile.ZipFile(zip_path, mode="r") as archive:
        archive.extractall(extract_path)
    print("Model extracted to:", extract_path)

Model already extracted at: yuenning_model/qwen_gemma_multilabel_final_3


### 2. Load Model and Tokeniser

In [50]:
# Tokenizer: loaded from the extracted directory; the EOS token doubles as the
# padding token so that batched inputs can be padded.
tokenizer = AutoTokenizer.from_pretrained(extract_path)
tokenizer.pad_token = tokenizer.eos_token

# Classifier: sequence-classification head with 4 outputs (one per label).
# NOTE(review): the recorded output of this cell warns that `score.weight` was
# "newly initialized" rather than restored from the checkpoint. If that is
# still the case the classification head is random -- confirm the saved
# artefact actually contains the trained head before trusting predictions.
model = AutoModelForSequenceClassification.from_pretrained(extract_path, num_labels=4)
# The model must know the padding id to handle padded batches.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen1.5-0.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 3. Load Thresholds

In [51]:
# Order of the labels. `thresholds_array` is built in this order and is later
# compared column-by-column against the model's probabilities, so position i
# here pairs with output column i.
label_cols = ["is_ad", "is_relevant", "is_rant", "is_legit"]

# Read the thresholds file exactly once, from the configured `thresholds_path`
# (previously the file was parsed twice, the second time via a hardcoded copy
# of the path that would silently ignore a change to `thresholds_path`).
with open(thresholds_path, "r") as f:
    thresholds_dict = json.load(f)
# Both names are kept bound because both existed before; they refer to the
# same label -> threshold mapping.
thresholds = thresholds_dict

# Fail with an explicit message when the file lacks a label, instead of a bare
# KeyError raised from inside the list comprehension.
missing_labels = [label for label in label_cols if label not in thresholds_dict]
if missing_labels:
    raise KeyError(
        f"Thresholds file {thresholds_path!r} has no entry for: {missing_labels}"
    )

# Per-label thresholds as a 1-D array aligned with `label_cols`.
thresholds_array = np.array([thresholds_dict[label] for label in label_cols])

### 4. Set Device

In [52]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the model's weights to the chosen device and report where it landed.
model.to(device)
print("Model loaded on", device)

Model loaded on cpu


### 5. Make Predictions

In [53]:
def predict_multilabel(texts, model, tokenizer, thresholds_array, batch_size=8, device=None):
    """Run batched multi-label inference and threshold the sigmoid probabilities.

    Args:
        texts: A single string or any iterable of strings (list, tuple,
            pandas Series, ...). It is normalized to a list before batching.
        model: Callable invoked as ``model(**inputs)`` whose result exposes a
            ``.logits`` tensor of shape ``(batch, num_labels)``.
        tokenizer: Callable invoked with a list of strings plus
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``;
            must return a mapping of tensors.
        thresholds_array: Per-label cut-offs, broadcast against the
            ``(batch, num_labels)`` probabilities; a label is predicted when
            its probability is ``>=`` its threshold.
        batch_size: Number of texts per forward pass; must be >= 1.
        device: Device the tokenized inputs are moved to. ``None`` (default)
            uses the device of the model's first parameter, falling back to
            ``"cpu"`` when the model exposes no parameters, so inputs always
            land where the model lives.

    Returns:
        Tuple ``(preds, probs)`` of NumPy arrays, each of shape
        ``(len(texts), num_labels)``: ``preds`` holds 0/1 integers, ``probs``
        the sigmoid probabilities. For empty input both arrays have shape
        ``(0, num_labels)`` with ``num_labels`` taken from ``thresholds_array``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A bare string would otherwise be sliced character-wise; other iterables
    # (e.g. a pandas Series) would be handed to the tokenizer un-listed.
    texts = [texts] if isinstance(texts, str) else list(texts)
    thresholds_array = np.asarray(thresholds_array)

    if not texts:
        # Keep the result 2-D so callers can still build DataFrames from it.
        num_labels = thresholds_array.shape[-1] if thresholds_array.ndim else 0
        return (
            np.zeros((0, num_labels), dtype=int),
            np.zeros((0, num_labels), dtype=np.float32),
        )

    if device is None:
        # Follow the model instead of assuming CUDA is available.
        parameters = getattr(model, "parameters", None)
        first_param = next(parameters(), None) if callable(parameters) else None
        device = first_param.device if first_param is not None else "cpu"

    # Inference must not run with dropout active; remember the caller's mode
    # so it can be restored afterwards.
    was_training = bool(getattr(model, "training", False))
    if was_training:
        model.eval()

    prob_batches = []
    pred_batches = []
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch_texts = texts[start:start + batch_size]
                inputs = tokenizer(
                    batch_texts,
                    truncation=True,
                    padding=True,
                    max_length=512,
                    return_tensors="pt"
                )
                inputs = {k: v.to(device) for k, v in inputs.items()}

                logits = model(**inputs).logits  # shape: (batch, num_labels)
                probs = torch.sigmoid(logits).cpu().numpy()
                prob_batches.append(probs)
                pred_batches.append((probs >= thresholds_array).astype(int))
    finally:
        if was_training:
            model.train()

    return np.concatenate(pred_batches), np.concatenate(prob_batches)


In [59]:
# Smoke-test the prediction helper on two hand-written examples.
sample_texts = ["This is an ad.", "The product is fantastic!"]
y_pred, y_probs = predict_multilabel(
    sample_texts, model, tokenizer, thresholds_array, device=device
)

# Raw arrays: thresholded 0/1 labels, then the underlying probabilities.
print("Predicted labels:\n", y_pred)
print("Probabilities:\n", y_probs)

# One row per input text: the binary decisions for each label, followed by the
# matching probabilities in columns named "<label>_prob".
pred_df = pd.DataFrame(y_pred, columns=label_cols)
probs_df = pd.DataFrame(y_probs, columns=label_cols).add_suffix("_prob")
results_df = pd.concat([pred_df, probs_df], axis=1)
print(results_df)



Predicted labels:
 [[0 1 1 1]
 [0 1 1 1]]
Probabilities:
 [[0.01301506 0.92230546 0.98700804 0.2625444 ]
 [0.01769127 0.7007936  0.94883215 0.97165   ]]
   is_ad  is_relevant  is_rant  is_legit  is_ad_prob  is_relevant_prob  \
0      0            1        1         1    0.013015          0.922305   
1      0            1        1         1    0.017691          0.700794   

   is_rant_prob  is_legit_prob  
0      0.987008       0.262544  
1      0.948832       0.971650  
