<a href="https://colab.research.google.com/github/xiaoyufan/nbme/blob/main/data_eda_and_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NBME Data EDA & Preprocessing

## Configurations

In [65]:
# Folder holding the raw CSV files read in the "Data Loading" section.
DATASET_DIR = '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset'
# Preprocessed splits are written to a sub-folder of the dataset folder.
OUTPUT_DIR = f'{DATASET_DIR}/Preprocessed'

## Packages

In [66]:
import pandas as pd

from sklearn.model_selection import train_test_split
from termcolor import colored

## Data Loading

In [67]:
# The three tables described under "Training Data".
train, features, patient_notes = (
    pd.read_csv(f'{DATASET_DIR}/{stem}.csv')
    for stem in ('train', 'features', 'patient_notes')
)

# Test rows and the sample submission file.
test, sample_submission = (
    pd.read_csv(f'{DATASET_DIR}/{stem}.csv')
    for stem in ('test', 'sample_submission')
)

## Data Overview

### Training Data

Training data consists of three files:

- train.csv
- patient_notes.csv
- features.csv

#### train.csv

In [68]:
# Summarise train.csv: shape, number of non-null cells, number of null cells.
n_rows, n_cols = train.shape
n_present = train.count().sum()
n_missing = train.isna().to_numpy().sum()

print('In train.csv:')
print(colored(f'Number of (rows, columns): ({n_rows}, {n_cols})', 'green'))
print(colored(f'Number of values: {n_present}', 'green'))
print(colored(f'Number missing values: {n_missing}', 'green'))

# Bare last expression so the notebook renders the first rows as a table.
train.head()

In train.csv:
[32mNumber of (rows, columns): (14300, 6)[0m
[32mNumber of values: 85800[0m
[32mNumber missing values: 0[0m


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']


#### patient_notes.csv

In [69]:
# Summarise patient_notes.csv: shape, number of non-null cells, number of null cells.
n_rows, n_cols = patient_notes.shape
n_present = patient_notes.count().sum()
n_missing = patient_notes.isna().to_numpy().sum()

print('In patient_notes.csv:')
print(colored(f'Number of (rows, columns): ({n_rows}, {n_cols})', 'green'))
print(colored(f'Number of values: {n_present}', 'green'))
print(colored(f'Number missing values: {n_missing}', 'green'))

# Bare last expression so the notebook renders the first rows as a table.
patient_notes.head()

In patient_notes.csv:
[32mNumber of (rows, columns): (42146, 3)[0m
[32mNumber of values: 126438[0m
[32mNumber missing values: 0[0m


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


#### features.csv

In [70]:
# Summarise features.csv: shape, number of non-null cells, number of null cells.
n_rows, n_cols = features.shape
n_present = features.count().sum()
n_missing = features.isna().to_numpy().sum()

print('In features.csv')
print(colored(f'Number of (rows, columns): ({n_rows}, {n_cols})', 'green'))
print(colored(f'Number of values: {n_present}', 'green'))
print(colored(f'Number missing values: {n_missing}', 'green'))

# Bare last expression so the notebook renders the first rows as a table.
features.head()

In features.csv
[32mNumber of (rows, columns): (143, 3)[0m
[32mNumber of values: 429[0m
[32mNumber missing values: 0[0m


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


### Test Data

#### test.csv

In [71]:
# Summarise test.csv: shape, number of non-null cells, number of null cells.
n_rows, n_cols = test.shape
n_present = test.count().sum()
n_missing = test.isna().to_numpy().sum()

print('In test.csv:')
print(colored(f'Number of (rows, columns): ({n_rows}, {n_cols})', 'green'))
print(colored(f'Number of values: {n_present}', 'green'))
print(colored(f'Number missing values: {n_missing}', 'green'))

# Bare last expression so the notebook renders the first rows as a table.
test.head()

In test.csv:
[32mNumber of (rows, columns): (5, 4)[0m
[32mNumber of values: 20[0m
[32mNumber missing values: 0[0m


Unnamed: 0,id,case_num,pn_num,feature_num
0,00016_000,0,16,0
1,00016_001,0,16,1
2,00016_002,0,16,2
3,00016_003,0,16,3
4,00016_004,0,16,4


### Sample Submission

#### sample_submission.csv

In [72]:
# Summarise sample_submission.csv: shape, number of non-null cells, number of null cells.
n_rows, n_cols = sample_submission.shape
n_present = sample_submission.count().sum()
n_missing = sample_submission.isna().to_numpy().sum()

print('In sample_submission.csv:')
print(colored(f'Number of (rows, columns): ({n_rows}, {n_cols})', 'green'))
print(colored(f'Number of values: {n_present}', 'green'))
print(colored(f'Number missing values: {n_missing}', 'green'))

# Bare last expression so the notebook renders the first rows as a table.
sample_submission.head()

In sample_submission.csv:
[32mNumber of (rows, columns): (5, 2)[0m
[32mNumber of values: 8[0m
[32mNumber missing values: 2[0m


Unnamed: 0,id,location
0,00016_000,0 100
1,00016_001,
2,00016_002,200 250;300 400
3,00016_003,
4,00016_004,75 110


## Data Preprocessing

In [73]:
def merge_data(data, patient_notes, features):
  """Attach the patient-note and feature columns to every row of `data`.

  Args:
    data: DataFrame with `case_num`, `pn_num` and `feature_num` columns.
    patient_notes: DataFrame left-joined on (`case_num`, `pn_num`).
    features: DataFrame left-joined on (`case_num`, `feature_num`).

  Returns:
    A new DataFrame with exactly one row per row of `data`, carrying the
    additional columns of `patient_notes` and `features`. Rows of `data`
    without a match keep NaN in the added columns (left join).

  Raises:
    pandas.errors.MergeError: if a join key occurs more than once in
      `patient_notes` or `features`.
  """
  # validate='many_to_one' makes pandas reject duplicated lookup keys, which
  # would otherwise silently multiply the rows of `data`.
  merged = data.merge(
      patient_notes, on=['case_num', 'pn_num'], how='left', validate='many_to_one')
  merged = merged.merge(
      features, on=['case_num', 'feature_num'], how='left', validate='many_to_one')
  return merged

In [74]:
# NOTE: `train` is rebound to the merged frame, so running this cell a second
# time would merge the note/feature columns into an already-merged frame.
# Re-run from "Data Loading" instead.
train = merge_data(train, patient_notes, features)

n_rows, n_cols = train.shape
n_present = train.count().sum()
n_missing = train.isna().to_numpy().sum()

print(colored(f'Number of (rows, columns) in merged train data: ({n_rows}, {n_cols})', 'green'))
print(colored(f'Number of values in merged train data: {n_present}', 'green'))
print(colored(f'Number missing values in merged train data: {n_missing}', 'green'))

# Bare last expression so the notebook renders the first rows as a table.
train.head()

[32mNumber of (rows, columns) in merged train data: (14300, 8)[0m
[32mNumber of values in merged train data: 114400[0m
[32mNumber missing values in merged train data: 0[0m


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,['chest pressure'],['203 217'],HPI: 17yo M presents with palpitations. Patien...,Chest-pressure
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",HPI: 17yo M presents with palpitations. Patien...,Intermittent-symptoms
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],HPI: 17yo M presents with palpitations. Patien...,Lightheaded


In [75]:
# NOTE: `test` is rebound to the merged frame, so running this cell a second
# time would merge the note/feature columns into an already-merged frame.
# Re-run from "Data Loading" instead.
test = merge_data(test, patient_notes, features)

n_rows, n_cols = test.shape
n_present = test.count().sum()
n_missing = test.isna().to_numpy().sum()

print(colored(f'Number of (rows, columns) in merged test data: ({n_rows}, {n_cols})', 'green'))
print(colored(f'Number of values in merged test data: {n_present}', 'green'))
print(colored(f'Number missing values in merged test data: {n_missing}', 'green'))

# Bare last expression so the notebook renders the first rows as a table.
test.head()

[32mNumber of (rows, columns) in merged test data: (5, 6)[0m
[32mNumber of values in merged test data: 30[0m
[32mNumber missing values in merged test data: 0[0m


Unnamed: 0,id,case_num,pn_num,feature_num,pn_history,feature_text
0,00016_000,0,16,0,HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,HPI: 17yo M presents with palpitations. Patien...,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,HPI: 17yo M presents with palpitations. Patien...,Chest-pressure
3,00016_003,0,16,3,HPI: 17yo M presents with palpitations. Patien...,Intermittent-symptoms
4,00016_004,0,16,4,HPI: 17yo M presents with palpitations. Patien...,Lightheaded


## Dataset Split

In [76]:
from sklearn.model_selection import GroupShuffleSplit

# Train-Validation split: 80-20
# Rows are grouped by `pn_num`, so all rows that share a patient note end up on
# the same side of the split.
# NOTE: this cell rebinds `train` to its own subset. Re-running it without
# first re-running the merge cell would split the already-reduced frame again.
groups = train['pn_num'].values
gss = GroupShuffleSplit(n_splits=1, train_size=.8, random_state=42)
split_gen = gss.split(train, groups=groups)
train_idx, validate_idx = next(split_gen)

# .copy() gives each split its own frame, so later column assignments on
# `train` / `validate` do not trigger SettingWithCopyWarning.
train, validate = train.iloc[train_idx].copy(), train.iloc[validate_idx].copy()

# Sanity checks: the two splits partition the merged rows, and no patient note
# appears on both sides.
assert len(train) + len(validate) == len(groups), 'split lost or duplicated rows'
assert set(train['pn_num'].unique()).isdisjoint(validate['pn_num'].unique()), \
    'a pn_num appears in both train and validate'

In [78]:
# First rows of the training split (5 is the pandas default for head()).
train.head(n=5)

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,['chest pressure'],['203 217'],HPI: 17yo M presents with palpitations. Patien...,Chest-pressure
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",HPI: 17yo M presents with palpitations. Patien...,Intermittent-symptoms
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],HPI: 17yo M presents with palpitations. Patien...,Lightheaded


In [79]:
# First rows of the validation split (5 is the pandas default for head()).
validate.head(n=5)

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text
130,00234_000,0,234,0,['Father heart problem'],['452 458;463 476'],17 yo M complains of heart pounding since 2-3 ...,Family-history-of-MI-OR-Family-history-of-myoc...
131,00234_001,0,234,1,['Mother thyroid issues'],['425 431;436 450'],17 yo M complains of heart pounding since 2-3 ...,Family-history-of-thyroid-disorder
132,00234_002,0,234,2,[],[],17 yo M complains of heart pounding since 2-3 ...,Chest-pressure
133,00234_003,0,234,3,"['5-6 episodes', 'episode']","['61 73', '219 226']",17 yo M complains of heart pounding since 2-3 ...,Intermittent-symptoms
134,00234_004,0,234,4,['thought he would passout'],['246 270'],17 yo M complains of heart pounding since 2-3 ...,Lightheaded


### Save Preprocessed Data

In [80]:
import os

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# `index` is left at its default, so each file keeps the DataFrame row index as
# its first column.
train.to_csv(f'{OUTPUT_DIR}/train.csv')
validate.to_csv(f'{OUTPUT_DIR}/validate.csv')
test.to_csv(f'{OUTPUT_DIR}/test.csv')