In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

Pneumonia, Consolidation, Cardiomegaly, Pneumothorax, Atelectasis, Edema, Pleural Effusion

# Class names, one per line; the order is kept exactly as originally written.
CLASS_NAMES = [
    "Atelectasis",
    "Cardiomegaly",
    "Effusion",
    "Pneumonia",
    "Pneumothorax",
    "Consolidation",
    "Edema",
]



In [2]:
# CSV holding the generated reports; read by load_generated_reports().
generated_report_path = './mimic_data/mimic_cxr_reports_bert_dense1.csv'

# NOTE(review): absolute, machine-specific path that no visible cell reads —
# confirm whether it is still needed.
path = "/media/zaheer/Data/Image_Text_Datasets/IU_Xray/latest/One_Image_norm_abnorm_split/r2gen_annotations/"

# Label table; merged onto the report frames via the 'report_id' column below.
labels = pd.read_csv("./mimic_data/chexbert_labels.csv")  # .drop(columns = ['Unnamed: 0'])

In [3]:
def load_data(path='./mimic_data/annotation_50k.json'):
    """Load the annotation JSON file and return its parsed content.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the JSON file. Defaults to the annotation file this
        notebook has always read, so existing ``load_data()`` calls are
        unaffected.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file. The notebook indexes
        the result with ``'train'`` and ``'test'``.
    """
    # Binary mode lets json.load detect the UTF-8/16/32 encoding itself.
    with open(path, 'rb') as f:
        return json.load(f)

def load_generated_reports(path=None):
    """Read the generated-reports CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. When omitted, the module-level
        ``generated_report_path`` is used, which is what the original
        zero-argument call did.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV. The notebook reads its ``'gts'`` and ``'res'``
        columns.
    """
    if path is None:
        # Fall back to the notebook-wide configuration value.
        path = generated_report_path

    return pd.read_csv(path)

def clean_data(text):
    """Normalise one document to lower-cased alphabetic tokens.

    The text is tokenised with ``nltk.word_tokenize``; tokens for which
    ``str.isalpha()`` is false (numbers, punctuation, mixed tokens) are
    dropped, and the survivors are lower-cased and joined with single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token.isalpha():
            kept_tokens.append(token.lower())
    return ' '.join(kept_tokens)

def docTo_Mat(train, test, ngram = (1,1), dense = True):
    """Turn train/test documents into bag-of-n-grams count matrices.

    Every document is first passed through ``clean_data``. A single
    ``CountVectorizer`` (English stop words removed) is fitted on the training
    documents only and then applied to the test documents, so both matrices
    share the same vocabulary columns.

    Parameters
    ----------
    train, test : iterable of str
        Raw documents.
    ngram : tuple of (int, int) or None, optional
        ``ngram_range`` given to ``CountVectorizer``. ``None`` is treated as
        ``(1, 1)``, because callers in this notebook default to ``None`` and
        the vectorizer rejects it.
    dense : bool, optional
        ``True`` (default, the original behaviour) returns NumPy arrays via
        ``.toarray()``. ``False`` returns the matrices as produced by the
        vectorizer, which avoids materialising the full dense matrix.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)``.
    """
    if ngram is None:
        ngram = (1, 1)

    train_docs = [clean_data(text) for text in train]
    test_docs = [clean_data(text) for text in test]

    vectorizer = CountVectorizer(stop_words='english', ngram_range=tuple(ngram),
                                 min_df=1, lowercase=True)

    # Fit on train only; test is projected onto the training vocabulary.
    train_counts = vectorizer.fit_transform(train_docs)
    test_counts = vectorizer.transform(test_docs)
    print(train_counts.shape, test_counts.shape)

    if dense:
        return train_counts.toarray(), test_counts.toarray()
    return train_counts, test_counts

def DCS(ground_truth, mlb_predicted):
    """Print and return the mean per-row F1 between true and predicted labels.

    For each row of ``mlb_predicted`` the binary F1 score is computed against
    the row of ``ground_truth`` at the same *position*, with
    ``zero_division=0``. The scores are then averaged.

    Parameters
    ----------
    ground_truth : array-like, 2-D
        True labels, one row per sample. Converted with ``np.asarray``.
    mlb_predicted : pandas.DataFrame
        Predicted labels, one row per sample and one column per label.

    Returns
    -------
    float
        The mean score, which is also printed. ``nan`` when
        ``mlb_predicted`` has no rows.
    """
    ground_truth = np.asarray(ground_truth)

    # Pair rows by position, not by DataFrame index label, so a filtered or
    # re-indexed prediction frame is still matched to the right ground truth.
    scores = [
        f1_score(ground_truth[position, :], predicted_row, zero_division=0)
        for position, (_, predicted_row) in enumerate(mlb_predicted.iterrows())
    ]

    mean_score = float(np.mean(scores)) if scores else float('nan')
    print(mean_score)
    return mean_score
    
def multi_label_prediction(train_reports, test_reports, test = 'report', ngram = None):
    """Train one Naive Bayes classifier per label and score the test text.

    Text from ``train_reports['report']`` and ``test_reports[test]`` is
    vectorised with ``docTo_Mat``. For every label column a separate
    ``MultinomialNB`` is fitted on the training matrix and used to predict
    the test matrix. The predictions are then handed to ``DCS`` together
    with the test labels.

    Parameters
    ----------
    train_reports : pandas.DataFrame
        Training frame with a ``'report'`` text column. Every column from
        position 2 onward is treated as a label column.
    test_reports : pandas.DataFrame
        Test frame containing the text column named by ``test`` and the same
        label columns (matched by name) as ``train_reports``.
    test : str, optional
        Name of the text column in ``test_reports`` to classify.
    ngram : tuple of (int, int) or None, optional
        N-gram range for the vectorizer. ``None`` means unigrams ``(1, 1)``.

    Returns
    -------
    None
        The score is reported by ``DCS``.
    """
    if ngram is None:
        # CountVectorizer does not accept ngram_range=None.
        ngram = (1, 1)

    label_columns = train_reports.columns[2:]
    y_train = train_reports[label_columns].to_numpy()
    # Select test labels by name so they line up with the training labels
    # regardless of how many text columns precede them in the test frame.
    y_test = test_reports[label_columns].to_numpy()

    X_train, X_test = docTo_Mat(train_reports['report'], test_reports[test], ngram)

    predictions = {}
    for idx, category in enumerate(label_columns):
        classifier = MultinomialNB(fit_prior=True, class_prior=None)
        classifier.fit(X_train, y_train[:, idx])
        predictions[category] = classifier.predict(X_test)

    mlb_predicted = pd.DataFrame(predictions, columns=label_columns)

    DCS(y_test, mlb_predicted)
    

In [4]:
def _reports_for_split(annotations, split_name):
    """Build the report frame for one split of the annotation data.

    Adds a 'report_id' column holding the first element of each
    'image_path' entry, then drops the bookkeeping columns.
    """
    frame = pd.DataFrame(annotations[split_name])
    frame['report_id'] = frame['image_path'].apply(lambda paths: paths[0])
    return frame.drop(columns=['id', 'image_path', 'split', 'study_id', 'subject_id'])


data = load_data()

generated_reports = load_generated_reports()

test_reports = _reports_for_split(data, 'test')

train_reports = _reports_for_split(data, 'train')

In [5]:
# Replace the test 'report' text with the 'gts' column and add the 'res'
# column as 'generated_report'.
# NOTE(review): values are aligned on the DataFrame index, so this presumes
# generated_reports rows are in the same order as test_reports — confirm.
test_reports = test_reports.assign(
    report=generated_reports['gts'],
    generated_report=generated_reports['res'],
)

In [6]:
# Inner joins on 'report_id': only reports that have a row in `labels` are kept.
train_reports = train_reports.merge(labels, how='inner', on='report_id')
test_reports = test_reports.merge(labels, how='inner', on='report_id')

In [7]:
#for i in range(1,5):
#multi_label_prediction(train_reports, test_reports, test = 'report', ngram = (1,1))

In [8]:
#for i in range(1,5):
# Score the 'generated_report' column using unigram features.
multi_label_prediction(
    train_reports,
    test_reports,
    test='generated_report',
    ngram=(1, 1),
)

(270790, 9122) (3858, 9122)
0.20481446725614688
