In [None]:
# Download the AIMSDistill model weights with gdown. The Google Drive file ID below is the
# one linked by the organisers on their GitHub page:
# https://github.com/mila-ai4h/ai4h_aims-au/tree/main/AIMSDistill
# The recorded output of this cell shows the file being saved as /content/AIMSDistill.pth.

!gdown 12DXXgi5rNRf8r8EKnjmzwR6GnNVTc1WI

Downloading...
From (original): https://drive.google.com/uc?id=12DXXgi5rNRf8r8EKnjmzwR6GnNVTc1WI
From (redirected): https://drive.google.com/uc?id=12DXXgi5rNRf8r8EKnjmzwR6GnNVTc1WI&confirm=t&uuid=8dcdc481-6ca4-4674-a967-0b465a043418
To: /content/AIMSDistill.pth
100% 1.58G/1.58G [00:24<00:00, 64.6MB/s]


#### **1. Getting the model ready**

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

class AimsDistillModel(nn.Module):
    """Pretrained transformer encoder followed by a linear multi-output head.

    The attribute names ``bert``, ``classifier`` and ``m`` are kept as they are
    because they form the key prefixes of the module's ``state_dict``; renaming
    them would stop existing checkpoints from loading.

    Args:
        tokenizer: Accepted for interface compatibility; not used by the model.
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        dropout: Dropout probability. Note that dropout is applied to the
            classifier *outputs* (see ``forward``), not to the pooled features.
        num_labels: Number of output units of the linear head. Defaults to 11,
            the size that was previously hard-coded.
    """

    def __init__(self, tokenizer, model_name, dropout, num_labels=11):
        super().__init__()
        encoder = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        # Make sure every encoder weight is trainable (full fine-tuning).
        for param in encoder.parameters():
            param.requires_grad = True
        self.m = nn.Dropout(p=dropout)
        self.bert = encoder

        # One output unit per label.
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return the head's raw outputs for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Optional attention mask forwarded to the encoder.

        Returns:
            Tensor with one row per input sequence and ``num_labels`` columns.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state of the first token as the sequence representation.
        pooled_output = outputs.last_hidden_state[:, 0, :]
        # Dropout acts on the head's outputs; it is a no-op in eval mode or with p=0.
        logits = self.m(self.classifier(pooled_output))
        return logits

model_name = "answerdotai/ModernBERT-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AimsDistillModel(tokenizer, model_name, dropout=0.0).to(device)
# The checkpoint is a file downloaded from the internet. weights_only=True limits
# unpickling to tensors and plain containers, so a tampered file cannot execute
# arbitrary code while being loaded.
state_dict = torch.load("AIMSDistill.pth", map_location=device, weights_only=True)
# Left as the last expression so the cell displays the key-matching report.
model.load_state_dict(state_dict)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

<All keys matched successfully>

#### **2. Dataset and DataLoader**

In [None]:
# !unzip -q /content/Kenya+sex.zip
# !unzip -q /content/Australia+cleaning.zip
# !unzip -q /content/Canada+narcotics.zip
# !unzip -q /content/Colombia+mines.zip
# !unzip -q /content/India+IT.zip
# !unzip -q /content/Nigeria+cocoa.zip

## If not done already, download the zip files listed above from our Drive folder, then uncomment and run the matching `!unzip` line(s): https://drive.google.com/drive/folders/1kcbeI4mR__aP_zqNxyWhVYxUNhzGttl-?usp=drive_link

In [None]:
import os
import pandas as pd
# Folder holding the extracted CSV files for this region/sector.
STORIES_DIR = "/content/Kenya+Sex"

# Sort the file names: os.listdir returns them in arbitrary order, which would make
# the row order (and every row-aligned result downstream) vary between runs.
csv_paths = sorted(
    os.path.join(STORIES_DIR, file_name)
    for file_name in os.listdir(STORIES_DIR)
    if file_name.endswith(".csv")
)
if not csv_paths:
    raise FileNotFoundError(f"No .csv files found in {STORIES_DIR}")

# Read everything first and concatenate once (concatenating inside the loop copies the
# accumulated frame on every iteration). ignore_index=True gives the combined frame a
# unique 0..N-1 index instead of repeating each file's own row numbers.
stories_df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

texts_stories = stories_df.Sentence.to_list()

from torch.utils.data import DataLoader, Dataset
import torch
import numpy as np

# Example dataset class
class StoryDataset(Dataset):
    """Dataset that tokenizes one text per item, optionally paired with teacher logits.

    Args:
        texts: Indexable collection of strings.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", padding="max_length",
            truncation=True, max_length=max_length)``; its result must provide
            ``"input_ids"`` and ``"attention_mask"`` entries.
        max_length: Length every sequence is padded or truncated to.
        teacher_logits: Optional per-item targets, indexable by item position.
            A NumPy array is converted to a torch tensor once, at construction.
    """

    def __init__(self, texts, tokenizer, max_length, teacher_logits=None):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.teacher_logits = teacher_logits
        if isinstance(self.teacher_logits, np.ndarray):
            self.teacher_logits = torch.from_numpy(self.teacher_logits)

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx``.

        Returns:
            ``(input_ids, attention_mask)`` when no teacher logits were given,
            otherwise ``((input_ids, attention_mask), teacher_logits[idx])``.
        """
        text = self.texts[idx]
        inputs = self.tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=self.max_length)
        # The tokenizer returns a batch of one; drop that leading dimension.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        if self.teacher_logits is None:
            return (input_ids, attention_mask)
        return (input_ids, attention_mask), self.teacher_logits[idx]

# Every sentence is padded or truncated to this many tokens.
max_length = 60

# Inference only: keep the original sentence order (no shuffling) so that predictions
# can be matched back to their rows by position.
test_dataset_stories = StoryDataset(
    texts=texts_stories,
    tokenizer=tokenizer,
    max_length=max_length,
)
test_loader_stories = DataLoader(
    dataset=test_dataset_stories,
    batch_size=32,
    shuffle=False,
)

#### **3. Predict**

In [None]:
from tqdm import tqdm

def get_logits(batch, model):
    """Run ``model`` on one batch and return its output.

    Args:
        batch: Iterable of exactly two tensors, ``(input_ids, attention_mask)``.
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword arguments.

    Returns:
        Whatever ``model`` returns for the batch.
    """
    try:
        # Send the inputs to wherever the model's weights live, so the call also works
        # when the model sits on the CPU of a machine that has a GPU.
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        # Models without parameters: fall back to the original "GPU if available" rule.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_ids, attention_mask = [tensor.to(device) for tensor in batch]
    return model(input_ids=input_ids, attention_mask=attention_mask)

# Collect one 0/1 row per sentence, in loader order.
# NOTE(review): the 0.9 cut-off is applied to the model's raw outputs, with no sigmoid.
# If these are unnormalised logits, 0.9 corresponds to a probability of about 0.71,
# not 0.9. Confirm which scale the threshold was chosen for.
STORIES_pred = []
model.eval()  # disable dropout for inference
with torch.no_grad():
    for batch in tqdm(test_loader_stories):
        logits = get_logits(batch, model)
        pred = logits.ge(0.9).float()
        for row in pred.cpu().numpy():
            STORIES_pred.append(row)

100%|██████████| 86/86 [00:32<00:00,  2.67it/s]


In [None]:
# Column names for the model's 11 outputs, in output order.
list_name = [
    "approval",
    "signature",
    "c1 (reporting entity)",
    "c2 (structure)",
    "c2 (operations)",
    "c2 (supply chains)",
    "c3 (risk description)",
    "c4 (risk mitigation)",
    "c4 (remediation)",
    "c5 (effectiveness)",
    "c6 (consultation)"
]

# Predictions are matched to sentences purely by position, so the counts must agree.
assert len(STORIES_pred) == len(stories_df), (
    f"{len(STORIES_pred)} predictions for {len(stories_df)} sentences"
)

# Build the frame in one step, reusing the index of stories_df, rather than filling an
# empty frame column by column.
df_result = pd.DataFrame(STORIES_pred, columns=list_name, index=stories_df.index)
# .to_numpy() makes the assignment positional (no index alignment involved).
df_result["Sentence"] = stories_df["Sentence"].to_numpy()
df_result["ss_region"] = "Kenya+Sex"
df_result

Unnamed: 0,approval,signature,c1 (reporting entity),c2 (structure),c2 (operations),c2 (supply chains),c3 (risk description),c4 (risk mitigation),c4 (remediation),c5 (effectiveness),c6 (consultation),Sentence,ss_region
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Skip to main content\n\nEnable accessibility f...,Kenya+Sex
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Edited by Oluwatoyin Olatundun Olatundun Ilesa...,Kenya+Sex
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Yet, it never occurs to our minds that this is...",Kenya+Sex
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,"Globally, children are estimated to account fo...",Kenya+Sex
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Child trafficking involves the “recruitment, s...",Kenya+Sex
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,As HAART we are dedicated to fighting against ...,Kenya+Sex
39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Thank you for supporting our mission!,Kenya+Sex
40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Survivors of human trafficking need your help ...,Kenya+Sex
41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Donate now on haartkenya.org/donate Paybill Nu...,Kenya+Sex


In [None]:
# Sentences flagged as risk descriptions (criterion c3).
df_result.loc[df_result['c3 (risk description)'] == 1, "Sentence"]

In [None]:
import os
from getpass import getpass

import google.generativeai as genai

# Never paste the key into the notebook: take it from the GOOGLE_API_KEY environment
# variable, or ask for it with a hidden prompt, so it cannot be saved or committed.
api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")
genai.configure(api_key=api_key)

# NOTE(review): this rebinds `model`, which earlier in the notebook holds the AIMSDistill
# classifier. Re-running the prediction cell after this one will fail until the model
# loading cell is run again. The name is kept because the summarisation cell below
# calls `model.generate_content`.
model = genai.GenerativeModel("gemini-2.5-flash")

# Smoke test: confirm the key and the model name work.
response = model.generate_content("Explain how AI works in 50 words")
print(response.text)

In [None]:
def chunk_sentences(sentences, chunk_size=80):
    """Yield consecutive slices of `sentences`, each holding at most `chunk_size` items."""
    total = len(sentences)
    yield from (
        sentences[start:start + chunk_size]
        for start in range(0, total, chunk_size)
    )

# Sentences flagged as risk descriptions, grouped into batches of at most 80 per prompt.
risk_mask = df_result['c3 (risk description)'] == 1
risk_sentences = df_result[risk_mask]["Sentence"].to_list()
chunks = [*chunk_sentences(risk_sentences, chunk_size=80)]
# print(f"Number of chunks: {len(chunks)}")

def make_chunk_prompt(sentences):
    """Build the summarisation prompt for one batch of sentences.

    The sentences (an iterable of strings) are placed one per line after the
    fixed instruction text.
    """
    sentence_block = "\n".join(sentences)
    return f"""
You are given a collection of sentences that were classified by a predictive model as "risk descriptions".

Your task:
- Read all the provided sentences carefully.
- Identify recurring themes, risks, and factual issues being described.
- Summarise them in clear, objective language (in Canadian English) with information that is supported by the sentences.
- Do not use creative, poetic, or figurative wording.

Here are the sentences:
{sentence_block}
"""

# Stage 1: summarise each batch of sentences on its own.
# (`model` is the Gemini client configured in the previous cell.)
chunk_summaries = [
    model.generate_content(make_chunk_prompt(chunk)).text
    for chunk in chunks
]

# Stage 2: merge the per-batch summaries into a single summary.
final_prompt = f"""
You are given a set of summaries, each created from a different batch of sentences.
Your task is to merge them into one coherent, factual summary.
- Do not be poetic or speculative; stick to the evidence in the summaries.
- If certain risks appear in multiple summaries, emphasise them.

Here are the summaries:
{chr(10).join(chunk_summaries)}
"""

if chunk_summaries:
    final_summary = model.generate_content(final_prompt).text
    print(final_summary)
else:
    # With nothing to merge, the model would be asked to summarise an empty list and
    # could only invent content, so skip the call.
    final_summary = ""
    print("No sentences were classified as risk descriptions; nothing to summarise.")

