# Screening from the raw data to get the valid data of neurosurgeon35.csv

> Since the study was conducted anonymously as an online survey, two stages of eligibility screening were conducted: 
one was conducted at the beginning of the online survey, where participants were filtered by their answers to the questions about their roles in medical practice and their medical specialty; 
The other was conducted using a post-survey screening process to filter out responses that did not meet the inclusion criteria due to random guess or lack of required expertise in neuro-oncological MRI interpretation. We did so by only including participants whose task accuracy when performing the grading task alone was above 0.55. The accuracy threshold was set to be slightly higher than the random guess accuracy of 0.5. 

The code runs for one time to process and save the data. 

In [24]:
import pandas as pd
import numpy as np
from sklearn import metrics


def load_data():
    # load data
    # gt label
    xai25 = pd.read_csv('../computational_data/xai25.csv')
    # sort the dataID according to its appearance sequence in the survey question
    xai25 = xai25.sort_values(by ='survey_number')
    xai25 = xai25.set_index(keys='dataID').reset_index(drop = False)

    # collctor id:
    collector_dict = {270227783.0: 'CNSF', 270232345.0: 'email', 270238910.0:'resident', 270239183.0:'staff'}
    label_dict = {'Grade 4 glioblastoma': 1, 'Grade 2/3 glioma': 0, 'Grade 2 or Grade 3 glioma':0}
    # read data
    raw_data = pd.read_csv('../neurosurgeon35_data/raw_data.csv',dtype={'Respondent ID': object})
    data = raw_data.set_index('Respondent ID')
    data = data.replace(label_dict)
    data = data[data.index.notnull()]
    for c_id, v in data['Collector ID'].value_counts().iteritems():
        if c_id not in collector_dict:
            collector_dict[c_id] = 'unname'
    outlier_id = ['110008925317', '110008921874'] # from CNSF, random guess for AI and XAI acc, remove

    # filter out responses that has lower accuracy <0.55
    gt_df = xai25.copy()
    df = data.copy()
    gt = gt_df['gt']
    drAlone = df.filter(regex='What grade of glioma would you predict')
    dr_acc = dict()
    dr_valid_answer = dict()
    valid_id = []
    for row in drAlone.itertuples():
        answer = pd.DataFrame(row).values.flatten().tolist()[1:]
        pred_valid = []
        gt_valid = []
        for idx, a in enumerate(answer):
            if not np.isnan(a):
                pred_valid.append(a)
                gt_valid.append(gt[idx])
        acc = metrics.accuracy_score(gt_valid, pred_valid)
        if acc>0.55 and row.Index not in outlier_id: #two CNSF are outliers
            valid_id.append(row.Index)
            dr_valid_answer[row.Index] = len(pred_valid)
            print('{:.2f} | {} | {}|{}  '.format( acc,len(pred_valid), df.loc[row.Index,'Which medical specialty do you work in?'],df.loc[row.Index,'Are you a']))
        dr_acc[row.Index] = acc
    print("number of valide response", len(valid_id))
    valid_df =df.loc[ valid_id , : ]

    return valid_df, xai25

In [25]:
valid_df,xai25 = load_data()

0.80 | 25 | Neurosurgery (Adult or Pediatrics)|Resident Physician  
0.80 | 25 | Neurosurgery (Adult or Pediatrics)|Resident Physician  
0.84 | 25 | Neurosurgery (Adult or Pediatrics)|Resident Physician  
0.82 | 17 | Neurosurgery (Adult or Pediatrics)|Resident Physician  
0.88 | 25 | Neurosurgery (Adult or Pediatrics)|Attending Physician  
0.76 | 25 | Neurosurgery (Adult or Pediatrics)|Attending Physician  
1.00 | 2 | Neurosurgery (Adult or Pediatrics)|Attending Physician  
0.68 | 25 | Neurosurgery (Adult or Pediatrics)|Attending Physician  
0.80 | 25 | Neurosurgery (Adult or Pediatrics)|Resident Physician  
0.84 | 25 | Neurosurgery (Adult or Pediatrics)|Resident Physician  
0.88 | 25 | Neurosurgery (Adult or Pediatrics)|Resident Physician  
1.00 | 3 | Neurosurgery (Adult or Pediatrics)|Fellow Physician  
0.60 | 25 | Neurosurgery (Adult or Pediatrics)|Resident Physician  
0.88 | 25 | Neurosurgery (Adult or Pediatrics)|Attending Physician  
0.67 | 3 | Neurosurgery (Adult or Pediatrics)|A

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dty

In [23]:
valid_df.to_csv('../neurosurgeon35_data/neurosurgeon35.csv')

In [26]:
xai25.to_csv('../computational_data/xai25.csv')