In [1]:
import json
import os

import numpy as np
import pandas as pd
import spacy

"df" is a DataFrame that we had created during the classification to have multi-label columns. This means that for each class (abdominal, gastro, etc.), there is a column with 0s and 1s to indicate the absence or presence of that class. The file is named "merged" because I had merged implicit and explicit
Note: we used the Kinbiotics classification because it is the only one in which the infection sources are annotated, and predicting them is the task.

In [2]:
# Load the multi-label dataset: a 'text' column with the medical notes plus one 0/1 column per infection-source class.
df = pd.read_csv('multilabel_merged.csv')

In [3]:
# load ner model to annotate symptoms
# The model location defaults to the original local folder, but it can be
# overridden through the NER_MODEL_PATH environment variable so the notebook
# also runs on machines where the model is stored elsewhere.
NER_MODEL_PATH = os.environ.get(
    'NER_MODEL_PATH',
    'C:\\Users\\valif\\OneDrive\\Desktop\\Medical_notes_expl\\model-best',
)
nlp_ner = spacy.load(NER_MODEL_PATH)

In [6]:
# preview the first rows of the loaded dataset
df.head()

Unnamed: 0,text,Abdominal,Gastrointestinal,Bile duct,Urosepsis,Catheter-associated,Endocarditis,Skin and soft tissue,Bone and joints,Neurological,Respiratory,Non-identifiable,None of the above
0,Chief Complaint:\n s/p falls\n\nHistory of Pre...,0,0,0,1,0,0,0,0,0,0,0,0
1,Chief Complaint:\n febrile and neutropenia\n\n...,0,0,0,0,0,0,0,0,0,0,0,1
2,Chief Complaint:\n septic shock\n\nHistory of ...,0,0,1,0,0,0,0,0,0,0,0,0
3,Chief Complaint:\n hepatic failure\n\nHistory ...,1,0,0,0,0,0,0,0,0,0,0,0
4,Chief Complaint:\n ruq pain\n\nHistory of Pres...,0,0,1,0,0,0,0,0,0,0,0,0


In [4]:
# apply the model to get the symptoms for each text
# nlp_ner.pipe streams the notes through the pipeline in batches, which is the
# approach spaCy recommends over calling the model once per text when many
# documents have to be processed.
symptoms = [
    # a set keeps each distinct 'Symptom' entity text only once per note
    {ent.text for ent in doc.ents if ent.label_ == 'Symptom'}
    for doc in nlp_ner.pipe(df['text'])
]


In [5]:
# Add the extracted symptom sets as a new 'Symptoms' column at position 1 (right after 'text').
# NOTE: this mutates df in place, so re-running the cell fails because the column already exists.
df.insert(1, 'Symptoms', symptoms)

In [6]:
# now symptoms are in the dataset: preview the first rows to check the new 'Symptoms' column
df.head()

Unnamed: 0,text,Symptoms,Abdominal,Gastrointestinal,Bile duct,Urosepsis,Catheter-associated,Endocarditis,Skin and soft tissue,Bone and joints,Neurological,Respiratory,Non-identifiable,None of the above
0,Chief Complaint:\n s/p falls\n\nHistory of Pre...,"{rhabdomyolysis, significant weakness, falling...",0,0,0,1,0,0,0,0,0,0,0,0
1,Chief Complaint:\n febrile and neutropenia\n\n...,"{nausea, hypertension, tachypneic, headache, n...",0,0,0,0,0,0,0,0,0,0,0,1
2,Chief Complaint:\n septic shock\n\nHistory of ...,"{ileus, hypotension, abdominal discomfort, mil...",0,0,1,0,0,0,0,0,0,0,0,0
3,Chief Complaint:\n hepatic failure\n\nHistory ...,"{nausea, ascites, hypertension, abdominal dist...",1,0,0,0,0,0,0,0,0,0,0,0
4,Chief Complaint:\n ruq pain\n\nHistory of Pres...,"{confusion, nausea, hypotensive, vomiting, ruq...",0,0,1,0,0,0,0,0,0,0,0,0


In [9]:
# loading the dictionary containing cluster contents
# Forward slashes keep the relative path valid on Windows, Linux and macOS alike,
# and the encoding is stated explicitly so the read does not depend on the
# platform's default text encoding.
with open('../Clustering/cluster_content.txt', 'r', encoding='utf-8') as fp:
    cluster_content = json.load(fp)


In [26]:
# create the new feature, i.e. the symptom counter for each cluster and text
# Turn every cluster's symptom collection into a set once, so each membership
# test inside the loop is a hash lookup instead of a scan of the cluster.
# NOTE(review): the keys are converted with int() and used as positions in a
# vector of length len(cluster_content), so they are presumably the strings
# '0' .. 'n-1' - confirm against the clustering notebook.
cluster_symptoms = {int(key): set(members) for key, members in cluster_content.items()}

cluster_counter = []

# Only the 'Symptoms' column is needed, so iterate over it directly rather than
# materialising every full row with iterrows().
for note_symptoms in df['Symptoms']:
    f_vector = np.zeros(len(cluster_content))
    for cluster_id, members in cluster_symptoms.items():
        # number of this note's symptoms that belong to the current cluster
        f_vector[cluster_id] += sum(symptom in members for symptom in note_symptoms)
    cluster_counter.append(f_vector)


In [31]:
# drop unnecessary columns
# The raw note text and the intermediate symptom sets are removed, and the
# per-cluster count vectors are placed as the first column.
df.drop(['text', 'Symptoms'], axis=1, inplace=True)
df.insert(loc=0, column='cluster_counter', value=cluster_counter)

Here is the final dataset. Here's how to interpret it:
- Let's take the first row --> [1., 0., 2., 0., 0., 1., 0., 0., 0., 3., 0.]. This means that in the first text, there were:
    - 1 symptom from the first cluster
    - 0 symptoms from the second cluster
    - 2 symptoms from the third cluster, and so on
- The infection source associated with this medical note is only "Urosepsis."

The goal is to build a model that, using only the cluster_counter feature, can classify each note into one or more of the possible classes. It will look like this (example with fake values):

(input)[1., 0., 2., 0., 0., 1., 0., 0., 0., 3., 0.] --> model --> (output)[0,1,0,0,0,1,1,1,0,0,0,0]

The interesting question is whether, by relying only on this feature, we can achieve better results than I obtained by feeding the model the entire text. In other words, we want to determine whether all that extra text was actually useful or not.
The modelling can be found in alternative_classification.ipynb

In [35]:
# preview the final dataset: the cluster_counter vectors followed by the class columns
df.head()

Unnamed: 0,cluster_counter,Abdominal,Gastrointestinal,Bile duct,Urosepsis,Catheter-associated,Endocarditis,Skin and soft tissue,Bone and joints,Neurological,Respiratory,Non-identifiable,None of the above
0,"[1.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",0,0,0,1,0,0,0,0,0,0,0,0
1,"[0.0, 0.0, 0.0, 2.0, 2.0, 1.0, 2.0, 0.0, 1.0, ...",0,0,0,0,0,0,0,0,0,0,0,1
2,"[2.0, 1.0, 2.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",0,0,1,0,0,0,0,0,0,0,0,0
3,"[0.0, 1.0, 2.0, 0.0, 3.0, 0.0, 2.0, 0.0, 0.0, ...",1,0,0,0,0,0,0,0,0,0,0,0
4,"[0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, ...",0,0,1,0,0,0,0,0,0,0,0,0


In [38]:
# Uncomment to write the final dataset to disk.
# NOTE(review): the cluster_counter column holds NumPy arrays, which a CSV file stores
# as their text representation - confirm the downstream notebook parses them back.
#df.to_csv('df_last_task.csv', index=False)