# Import libraries

In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score, normalized_mutual_info_score, fowlkes_mallows_score

# load csv

In [None]:
# Read the extraction results into a DataFrame.
# Later cells read two columns from it: "text_json" (the structured,
# JSON-formatted extraction output) and "case" (the label used for the
# train/test split and for scoring the clusters).
csv_path = "../data/extracted_info.csv"
df = pd.read_csv(csv_path)

# functions to convert info to numeric values

In [3]:
import re

def extract_braced_content(s):
    """Return the text between the first '{' and the next '}' in ``s``.

    The match is non-greedy and may span several lines. Surrounding
    whitespace is stripped from the result. Returns ``None`` when ``s``
    contains no ``{...}`` span.
    """
    found = re.search(r'\{(.*?)\}', s, re.DOTALL)
    if found is None:
        return None
    inner = found.group(1)
    return inner.strip()

def remove_characters(s):
    """Strip layout and punctuation characters from ``s``.

    Removes newlines, colons, spaces, ASCII commas, the Japanese comma
    and curly braces, leaving double quotes as the only separators.
    """
    cleaned = s
    for unwanted in ('\n', ':', ' ', ',', "、", "{", "}"):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned


def convert_to_mm(value):
    """Turn a free-text tumour size into an integer number.

    Lookup order:
      1. a number immediately followed by ``mm``;
      2. otherwise the first number anywhere in the string;
      3. otherwise the marker ``"huge"`` if the text contains "巨大" or "大き";
      4. otherwise ``value`` is returned unchanged.

    Centimetre values are not scaled here; a caller that passes a "cm"
    string receives the bare number and must convert it itself.

    Args:
        value: size description as a string.

    Returns:
        ``int`` when a number was found, the string ``"huge"`` for a
        qualitative "large" description, or the original string.
    """
    # Explicit millimetre notation wins over a bare number.
    for pattern in (r'(\d+(\.\d+)?)mm', r'(\d+(\.\d+)?)'):
        match = re.search(pattern, value)
        if match:
            number = match.group(1)
            if match.group(2) is None:
                return int(number)
            # int("18.5") raises ValueError, so decimal matches go through
            # float and are rounded to the nearest whole number.
            return int(round(float(number)))

    if ("巨大" in value) or ("大き" in value):
        return "huge"

    # Nothing recognisable: hand the text back unchanged.
    return value

# execute the conversion

In [4]:
# Tokenise every "text_json" entry into a flat list of key/value strings.
# `texts` ends up with exactly one entry per row of `df`; rows that cannot
# be parsed are stored as None so that the next stage can recognise them.
errors = 0
texts = []
for raw_text in df["text_json"]:
    try:
        # Add a closing brace when the entry has none, so that the
        # {...} extraction below can still find a span.
        if "}" not in raw_text:
            raw_text += "}"
        body = extract_braced_content(raw_text) if "{" in raw_text else raw_text
        # After cleaning, double quotes are the only separators left;
        # splitting on them yields alternating keys and values.
        tokens = [token for token in remove_characters(body).split('"') if token]
    except (TypeError, AttributeError):
        # TypeError: the cell is not a string (e.g. NaN from read_csv).
        # AttributeError: no {...} span was found, so there is nothing to clean.
        print(raw_text)
        errors += 1
        print(errors)
        tokens = None
    texts.append(tokens)

In [5]:
# Convert each tokenised report into (a) a dict of extracted fields and
# (b) a numeric feature vector [size_num, lob, lat] used for clustering.
# One entry is appended per element of `texts`, so both lists stay aligned
# with the rows of `df`.

# Size assigned to tumours that are only described as "huge".
# NOTE(review): the original code had the no-op comparison `size == 71`,
# so these rows used to get size_num = -1. The rationale for 71 is not
# documented - confirm the value. Metrics may shift slightly if any row
# is affected.
HUGE_SIZE = 71

infos = []
num_data = []
for text in texts:
    if not isinstance(text, list):
        # Unparseable row (None): every field gets the placeholder -1.
        # The feature vector is built from explicit -1 values rather than
        # from whatever the previous iteration left in size_num/lob/lat.
        dic_info = {"size": -1,
                    "size_num": -1,
                    "location": -1,
                    "lob": -1,
                    "lat": -1,
                    "ly": -1,
                    "meta": -1}
        infos.append(dic_info)
        num_data.append(np.array([-1, -1, -1]))
        continue

    # Reports without a metastasis field are treated as "none".
    # A new list is built so the entries of `texts` are not modified.
    tokens = text if "転移" in text else text + ['転移', 'なし']

    # Each value is the token right after its key.
    # A missing key raises ValueError here, as it did before.
    size, location, ly, meta = (
        tokens[tokens.index(key) + 1]
        for key in ("大きさ", "腫瘍の場所", "リンパ節腫大", "転移")
    )

    # Whole-number centimetre values ("2cm") must be scaled by 10.
    # Decimal ones ("1.8cm") are handled by dropping the point ("18").
    # NOTE(review): dropping the point also turns "18.5mm" into 185 and
    # "1.85cm" into 185 - confirm such values do not occur in the data.
    is_whole_cm = "cm" in size and "." not in size
    size = convert_to_mm(size.replace(".", ""))
    if size == "huge":
        size = HUGE_SIZE
    elif is_whole_cm and isinstance(size, int):
        # Only scale real numbers; `*= 10` on an unparsed string would
        # repeat the text ten times.
        size *= 10

    # Laterality code: right -> 0, otherwise -1.
    # Lobe code: middle adds 1 and lower adds 2 to a base of -1.
    # NOTE(review): "左" (left) and "上" (upper) add nothing, so they are
    # indistinguishable from an unknown location (-1). Kept as is because
    # changing the encoding would change every feature vector.
    lat = -1
    if "右" in location:
        lat += 1
    lob = -1
    if "中" in location:
        lob += 1
    if "下" in location:
        lob += 2

    # Sizes that could not be turned into a number are encoded as -1.
    size_num = size if isinstance(size, int) else -1

    dic_info = {"size": size,
                "size_num": size_num,
                "location": location,
                "lob": lob,
                "lat": lat,
                "ly": ly,
                "meta": meta}
    infos.append(dic_info)
    num_data.append(np.array([size_num, lob, lat]))

In [6]:
# Store the parsed record of each report next to its source row.
df["extracted"] = infos
# Preview the first few records only; the full list has one dict per row.
infos[:10]

[{'size': 18,
  'size_num': 18,
  'location': '左上葉',
  'lob': -1,
  'lat': -1,
  'ly': 'なし',
  'meta': 'なし'},
 {'size': 18,
  'size_num': 18,
  'location': '肺上葉',
  'lob': -1,
  'lat': -1,
  'ly': 'なし',
  'meta': 'なし'},
 {'size': 18,
  'size_num': 18,
  'location': 'S1+2',
  'lob': -1,
  'lat': -1,
  'ly': 'なし',
  'meta': 'なし'},
 {'size': 18,
  'size_num': 18,
  'location': '左上葉S1+2',
  'lob': -1,
  'lat': -1,
  'ly': 'なし',
  'meta': 'なし'},
 {'size': 15,
  'size_num': 15,
  'location': '左肺上葉S1+2',
  'lob': -1,
  'lat': -1,
  'ly': 'なし',
  'meta': 'なし'},
 {'size': 18,
  'size_num': 18,
  'location': '左肺上葉の外側',
  'lob': -1,
  'lat': -1,
  'ly': 'null',
  'meta': 'null'},
 {'size': 10,
  'size_num': 10,
  'location': '左肺上葉',
  'lob': -1,
  'lat': -1,
  'ly': 'なし',
  'meta': 'なし'},
 {'size': 18,
  'size_num': 18,
  'location': '左肺上葉S1+2',
  'lob': -1,
  'lat': -1,
  'ly': 'なし',
  'meta': 'なし'},
 {'size': 18,
  'size_num': 18,
  'location': '左肺上葉S1+2',
  'lob': -1,
  'lat': -1,
  'ly': 'はない

# execute clustering & performance evaluation

In [8]:
# train test split, according to NTCIR-16
# Rows whose "case" is in this list form the test split; all others train.
held_out_cases = [4, 5, 7, 8, 10, 14, 15]
case_column = df["case"]
test_idx = case_column.isin(held_out_cases).values
train_idx = (~case_column.isin(held_out_cases)).values
train_labels = case_column[train_idx].values
test_labels = case_column[test_idx].values
# Stack the per-row feature vectors into a single array so the boolean
# masks above can index it.
num_data = np.array(num_data)

In [9]:
# train
targets = train_labels
train_data = num_data[train_idx]

# clustering: fit KMeans on the training split and read the cluster ids
kmeans = KMeans(n_clusters=8, init='k-means++', random_state=0, n_init=30, max_iter=300)
kmeans.fit(train_data)
labels = kmeans.labels_

# evaluation: compare cluster ids against the case labels
nmi, ami, fm = (
    metric(targets, labels)
    for metric in (normalized_mutual_info_score,
                   adjusted_mutual_info_score,
                   fowlkes_mallows_score)
)

for tag, value in (("NMI:", nmi), ("AMI:", ami), ("FM:", fm)):
    print(tag, value)

NMI: 0.7152308022894814
AMI: 0.651563954939907
FM: 0.5958636987797016


In [10]:
# test
# KMeans is fitted again, this time on the test split, and scored against
# the test case labels. The data is kept under its own name so the
# training matrix `train_data` is not overwritten with test rows.
targets = test_labels
test_data = num_data[test_idx]

# clustering
kmeans = KMeans(n_clusters=8, init='k-means++', random_state=0, n_init=30, max_iter=300).fit(test_data)
labels = kmeans.labels_

# evaluation
nmi = normalized_mutual_info_score(targets, labels)
ami = adjusted_mutual_info_score(targets, labels)
fm = fowlkes_mallows_score(targets, labels)

print("NMI:", nmi)
print("AMI:", ami)
print("FM:", fm)

NMI: 0.6413658311220682
AMI: 0.5597516941130994
FM: 0.5354496700964052


# robustness test

In [11]:
def robust_test(targets, noisy_data):
    """Cluster a randomly perturbed copy of the features and score it.

    Every entry of ``noisy_data`` is shifted by uniform noise whose
    magnitude is at most 5% of that entry's absolute value (entries equal
    to 0 therefore stay unchanged). The noise is drawn from NumPy's global
    random state; KMeans itself uses a fixed ``random_state``.

    Args:
        targets: reference labels, one per row of ``noisy_data``.
        noisy_data: 2-D array of features to perturb (not modified).

    Returns:
        Tuple ``(nmi, ami, fm)`` comparing the cluster ids with ``targets``.
    """
    max_shift = 0.05 * np.abs(noisy_data)
    n_rows, n_cols = noisy_data.shape[0], noisy_data.shape[1]
    # Uniform factors in [-1, 1), one per entry.
    factors = 2 * np.random.rand(n_rows, n_cols) - 1
    perturbed = noisy_data + factors * max_shift

    model = KMeans(n_clusters=8, init='k-means++', random_state=0, n_init=30, max_iter=300)
    model.fit(perturbed)
    predicted = model.labels_

    return (
        normalized_mutual_info_score(targets, predicted),
        adjusted_mutual_info_score(targets, predicted),
        fowlkes_mallows_score(targets, predicted),
    )

def repeat_test(targets, noisy_data, n=1000):
    """Run ``robust_test`` ``n`` times and collect the scores per metric.

    Returns:
        Three lists ``(nmis, amis, fms)``, each of length ``n``.
    """
    runs = [robust_test(targets, noisy_data) for _ in range(n)]
    nmis = [scores[0] for scores in runs]
    amis = [scores[1] for scores in runs]
    fms = [scores[2] for scores in runs]
    return nmis, amis, fms

# Both calls name their labels explicitly, so the result does not depend
# on what the global `targets` was last set to in an earlier cell.
train_result = repeat_test(targets=train_labels, noisy_data=num_data[train_idx], n=1000)
test_result = repeat_test(targets=test_labels, noisy_data=num_data[test_idx], n=1000)

In [12]:
def calc_95ci(data):
    """Return the (2.5th, 97.5th) percentiles of ``data`` as a tuple."""
    return tuple(np.percentile(data, q) for q in (2.5, 97.5))

def calc_each_95ci(result):
    """Apply ``calc_95ci`` to the first three series in ``result``.

    ``result`` is indexed as (nmi scores, ami scores, fm scores).

    Returns:
        Tuple ``(nmi_ci, ami_ci, fm_ci)`` of (lower, upper) pairs.
    """
    nmi_ci, ami_ci, fm_ci = (calc_95ci(result[position]) for position in range(3))
    return nmi_ci, ami_ci, fm_ci

# Summarise the robustness runs as percentile intervals and print them,
# training split first.
train_metrics = calc_each_95ci(train_result)
test_metrics = calc_each_95ci(test_result)
for header, intervals in (("train (nmi, ami, fm):", train_metrics),
                          ("test (nmi, ami, fm):", test_metrics)):
    print(header, intervals)

train (nmi, ami, fm): ((0.6758776550012524, 0.7309308755769508), (0.6006926495237799, 0.6676091287745058), (0.5412658773652742, 0.6014065304058602))
test (nmi, ami, fm): ((0.5310163601046471, 0.645449546876431), (0.4244588336018941, 0.5634156761737154), (0.4477095516335758, 0.5379459524869229))
