Import libraries

In [1]:
import pandas as pd
import pathlib
from collections import OrderedDict
from debater_python_api.api.debater_api import DebaterApi

Load dataset as dataframe

In [2]:
# Location of the cleaned descriptions CSV, relative to the notebook
dataset_csv = '../data/mtsamples_descriptions_clean.csv'
df = pd.read_csv(dataset_csv)

Remove rows with null values

In [3]:
# Keep only the rows in which every column has a value
df = df.dropna(axis=0, how="any")

Convert identifiers to string type

In [4]:
# Cast both identifier columns to strings, one column at a time
for id_col in ('id', 'id_description'):
    df[id_col] = df[id_col].astype('str')

Convert the dataframe to a list of ordered dictionaries, one per row (for subsequent use with IBM Debater API)

In [5]:
# orient="records" yields a list with one mapping per dataframe row;
# `into` makes each of those mappings an OrderedDict keyed by column name.
sentences = df.to_dict(
    into=OrderedDict,
    orient="records",
)

Load IBM Debater API key as a variable

In [6]:
# The key is kept in a text file outside the notebook
apikey_path = pathlib.Path('../APIkey.txt')
with apikey_path.open() as key_file:
    # Strip surrounding whitespace (e.g. a trailing newline) from the key
    api_key = key_file.read().strip()

Initialize Debater API services

In [7]:
# Entry point to the Debater services, authenticated with the key loaded above
debater_api = DebaterApi(apikey=api_key)
# Clients for the two services used below: Argument Quality and Key Point Analysis
arg_quality_client, keypoints_client = (
    debater_api.get_argument_quality_client(),
    debater_api.get_keypoints_client(),
)

Set a topic, then use the Argument Quality service to select the top 1000 sentences from the dataset that are most closely related to the topic

In [8]:
# Topic against which every sentence is scored
topic = "The patient is a 30-year-old who was admitted with symptoms including obstructions, failures, and pain that started four days ago."
# Argument Quality payload: one {"sentence", "topic"} pair per record
sentences_topic = [{"sentence": record["text"], "topic": topic} for record in sentences]
# Scores returned by the service; paired positionally with `sentences` below
scores = arg_quality_client.run(sentences_topic)
# Pair each record with its score and order the pairs from highest to lowest score
scored_records = list(zip(sentences, scores))
scored_records.sort(key=lambda pair: pair[1], reverse=True)
sentences_sorted = [record for record, _score in scored_records]
# Keep only the best-scoring records
top_k = 1000
sentences_top_1000_aq = sentences_sorted[:top_k]

ArgumentQualityClient: 100%|██████████| 3245/3245 [00:56<00:00, 56.97it/s]

ArgumentQualityClient: 100%|██████████| 3245/3245 [01:10<00:00, 56.97it/s]

Configure parameters for Key Point Analysis service and prepare data in list form

In [9]:
# Name of the KPA domain that the comments are uploaded to
domain = "medical_demo"
# Parameters passed to the KPA job
run_params = dict(mapping_threshold=0.95, n_top_kps=20)
# Parallel lists of text and id, in the same order as the selected records
sentences_texts = [record["text"] for record in sentences_top_1000_aq]
sentences_ids = [record["id"] for record in sentences_top_1000_aq]

Clear the domain before uploading data to the Key Point Analysis service

In [10]:
# Clear domain on KPA service (irreversible, as the method name says).
# This stays best-effort so the upload still runs when the domain cannot be
# cleared (the recorded run logged a 422 because the domain did not exist yet),
# but the failure is now reported instead of being silently discarded.
try:
    keypoints_client.delete_domain_cannot_be_undone(domain)
except Exception as exc:
    print(f"Could not clear domain '{domain}' before upload (continuing): {exc}")
# Upload data to KPA service; ids and texts are parallel lists.
# NOTE(review): dont_split=True presumably keeps each comment as one sentence - confirm.
keypoints_client.upload_comments(domain=domain, comments_ids=sentences_ids, comments_texts=sentences_texts, dont_split=True)
# Block until the service has finished processing the uploaded comments
keypoints_client.wait_till_all_comments_are_processed(domain=domain)

ERROR:root:There is a problem with the request (422): user: 8c251e doesn't have domain: medical_demo


Run the KPA job and get results

In [11]:
# Start the analysis over the uploaded comment ids with the configured parameters
future = keypoints_client.start_kp_analysis_job(
    run_params=run_params,
    comments_ids=sentences_ids,
    domain=domain,
)
# NOTE(review): `polling_timout_secs` is spelled exactly as in the original call;
# presumably that is the library's parameter name - confirm before "correcting" it.
kpa_result = future.get_result(polling_timout_secs=5, high_verbosity=False)
# Last expression of the cell: displays the job id
future.get_job_id()

'63a2a582d2ae1427f1466607'

Inspect one of the KPA results

In [22]:
# Take the second key point entry and show its first matched sentence
second_keypoint = kpa_result["keypoint_matchings"][1]
first_match = second_keypoint["matching"][0]
print(first_match)

{'domain': 'medical_demo', 'comment_id': '2487', 'sentence_id': 0, 'sents_in_comment': 1, 'span_start': 0, 'span_end': 21, 'num_tokens': 3, 'argument_quality': 0.43169254064559937, 'sentence_text': 'Worrisome skin lesion', 'score': 1.0, 'kp_quality': 0.994644045829773}


Store the KPA results in a dataframe

In [23]:
# Column headers for the dataframe; the first column holds the key point itself
cols = [
    "kp",
    "sentence_text",
    "match_score",
    "comment_id",
    "sentence_id",
    "sents_in_comment",
    "span_start",
    "span_end",
    "num_tokens",
    "argument_quality",
    "keypoint_quality"
]
# Keys read from each match, listed in the same order as cols[1:]
match_fields = [
    "sentence_text",
    "score",
    "comment_id",
    "sentence_id",
    "sents_in_comment",
    "span_start",
    "span_end",
    "num_tokens",
    "argument_quality",
    "kp_quality"
]
# One row per (key point, matched sentence) pair
matchings_rows = []
for keypoint_matching in kpa_result["keypoint_matchings"]:
    kp = keypoint_matching["keypoint"]
    matchings_rows.extend(
        [kp] + [matched[field] for field in match_fields]
        for matched in keypoint_matching["matching"]
    )
# Build the dataframe from the collected rows
df_match = pd.DataFrame(matchings_rows, columns=cols)

View sample of match dataframe

In [24]:
# Display the last five rows of the match dataframe
df_match.tail(n=5)

Unnamed: 0,kp,sentence_text,match_score,comment_id,sentence_id,sents_in_comment,span_start,span_end,num_tokens,argument_quality,keypoint_quality
936,Spontaneous vaginal delivery,Delivery was via spontaneous vaginal delivery,0.999719,323,0,1,0,45,6,0.433695,0.001563
937,Spontaneous vaginal delivery,The patient progressed to a normal spontaneous...,0.99971,322,0,1,0,88,13,0.623576,0.0
938,Spontaneous vaginal delivery,She was admitted here and labor was confirmed ...,0.96967,336,0,1,0,72,12,0.488468,0.000391
939,Obsessive compulsive disorder.,Obsessive compulsive disorder.,1.0,3002,0,1,0,30,3,0.547027,0.998403
940,Obsessive compulsive disorder.,Acute episode of agitation,0.954987,627,0,1,0,26,4,0.428283,0.940409


Merge match dataframe with original dataframe on identifiers and save resulting dataframe as CSV

In [25]:
# Columns of the original dataframe to attach to each match
source_columns = df[["id", "id_description", "medical_specialty_new"]]
# Join matches to their source rows; validate raises if either key has duplicates
df_merge = df_match.merge(
    source_columns,
    left_on="comment_id",
    right_on="id",
    validate="one_to_one",
)
# Save the merged result without the index column
df_merge.to_csv("../data/df_merge.csv", index=False)