In [14]:
import os

os.listdir("dataset/data/CADEC.v2/cadec/meddra")


['LIPITOR.95.ann',
 'LIPITOR.969.ann',
 'LIPITOR.81.ann',
 'LIPITOR.941.ann',
 'LIPITOR.799.ann',
 'LIPITOR.955.ann',
 'LIPITOR.1000.ann',
 'LIPITOR.56.ann',
 'LIPITOR.766.ann',
 'LIPITOR.772.ann',
 'LIPITOR.42.ann',
 'LIPITOR.982.ann',
 'LIPITOR.996.ann',
 'LIPITOR.564.ann',
 'LIPITOR.202.ann',
 'LIPITOR.216.ann',
 'LIPITOR.570.ann',
 'VOLTAREN.19.ann',
 'VOLTAREN.31.ann',
 'LIPITOR.558.ann',
 'ARTHROTEC.5.ann',
 'VOLTAREN.25.ann',
 'LIPITOR.389.ann',
 'LIPITOR.410.ann',
 'ARTHROTEC.38.ann',
 'LIPITOR.376.ann',
 'LIPITOR.362.ann',
 'LIPITOR.404.ann',
 'LIPITOR.438.ann',
 'ARTHROTEC.10.ann',
 'LIPITOR.809.ann',
 'DICLOFENAC-POTASSIUM.1.ann',
 'LIPITOR.835.ann',
 'LIPITOR.821.ann',
 'LIPITOR.174.ann',
 'LIPITOR.612.ann',
 'LIPITOR.606.ann',
 'LIPITOR.160.ann',
 'LIPITOR.148.ann',
 'CATAFLAM.1.ann',
 'LIPITOR.149.ann',
 'LIPITOR.607.ann',
 'LIPITOR.161.ann',
 'LIPITOR.175.ann',
 'LIPITOR.613.ann',
 'LIPITOR.820.ann',
 'LIPITOR.834.ann',
 'LIPITOR.808.ann',
 'ARTHROTEC.11.ann',
 'LIPITOR.

In [15]:
pt_dict = extract_pt_labels("dataset/data/CADEC.v2/cadec/meddra")

len(pt_dict)


3398

In [16]:
for k, v in list(pt_dict.items())[:5]:
    print(k, "→", v)


leg cramps → 10011301
cramps → 10028294
severe pain in my calf muscles → 10033371
pain was too severe → 10033371
could not even walk in the morning → 10047810


MedDRA PT Dictionary Extraction

Purpose: Hierarchy-aware evaluation

In [9]:
def extract_pt_labels(path):
    pt_map = {}
    for file in os.listdir(path):
        if file.endswith(".ann"):
            with open(os.path.join(path, file), encoding="utf-8") as f:
                for line in f:
                    if line.startswith("T"):
                        parts = line.strip().split("\t")
                        if len(parts) == 3:
                            _, meta, text = parts
                            pt = meta.split()[0]
                            pt_map[text.lower()] = pt
    return pt_map

pt_dict = extract_pt_labels("dataset/data/CADEC.v2/cadec/meddra")


In [11]:
pt_dict = extract_pt_labels("dataset/data/CADEC.v2/cadec/meddra")

len(pt_dict)

3398

In [20]:
rows = []

for file in os.listdir("dataset/data/CADEC.v2/cadec/meddra"):
    if file.endswith(".ann"):
        doc_id = file.replace(".ann", "")
        with open(os.path.join("dataset/data/CADEC.v2/cadec/meddra", file), encoding="utf-8") as f:
            for line in f:
                if line.startswith("T"):
                    parts = line.strip().split("\t")
                    if len(parts) == 3:
                        _, meta, text = parts
                        label = meta.split()[0]
                        rows.append([doc_id, label, text])


In [21]:
len(rows)



6315

In [24]:
import pandas as pd

df = pd.DataFrame(rows, columns=["doc_id", "label", "adr_text"])

df.head()


Unnamed: 0,doc_id,label,adr_text
0,LIPITOR.95,10011301,leg cramps
1,LIPITOR.95,10028294,cramps
2,LIPITOR.969,10033371,Severe pain in my calf muscles
3,LIPITOR.969,10033371,pain was too severe
4,LIPITOR.969,10047810,could not even walk in the morning


In [25]:
df.to_csv("cadec_adr.csv", index=False)
print("Saved successfully")


Saved successfully


In [27]:
df.to_csv("cadec_adr.csv", index=False)


In [28]:
import pandas as pd

df = pd.read_csv("cadec_adr.csv")

print(df.shape)
df.head()


(6315, 3)


Unnamed: 0,doc_id,label,adr_text
0,LIPITOR.95,10011301,leg cramps
1,LIPITOR.95,10028294,cramps
2,LIPITOR.969,10033371,Severe pain in my calf muscles
3,LIPITOR.969,10033371,pain was too severe
4,LIPITOR.969,10047810,could not even walk in the morning


In [29]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)

print("Train:", train_df.shape)
print("Test:", test_df.shape)


Train: (5052, 3)
Test: (1263, 3)


In [30]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


model = SentenceTransformer("all-MiniLM-L6-v2")

train_emb = model.encode(train_df["adr_text"].tolist(), show_progress_bar=True)
test_emb  = model.encode(test_df["adr_text"].tolist(), show_progress_bar=True)

sim_bert = cosine_similarity(test_emb, train_emb)


Batches:   0%|          | 0/158 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

  ret = a @ b
  ret = a @ b
  ret = a @ b


Accuracy@5 & Hierarchy-Aware Accuracy

Purpose: Research-grade evaluation

In [5]:
import numpy as np
def hierarchy_accuracy_at_k(sim, train_df, test_df, k=5):
    topk_idx = np.argsort(sim, axis=1)[:, -k:]
    
    exact, partial = 0, 0
    for i in range(len(test_df)):
        true = test_df.iloc[i]["adr_text"].lower()
        preds = train_df.iloc[topk_idx[i]]["adr_text"].str.lower().values
        
        if true in preds:
            exact += 1
        elif pt_dict.get(true) in [pt_dict.get(p) for p in preds]:
            partial += 1
    
    return exact/len(test_df), (exact+partial)/len(test_df)
    

Final Results

Purpose: Numbers for paper

In [40]:
import numpy as np
exact_acc, hierarchy_acc = hierarchy_accuracy_at_k(
    sim_bert, train_df, test_df, k=5
)

print("Exact A@5:", exact_acc)
print("Hierarchy-aware A@5:", hierarchy_acc)


Exact A@5: 0.5415676959619953
Hierarchy-aware A@5: 0.8701504354711006
