In [1]:
import pandas as pd
import numpy as np
import re

In [23]:
df = pd.read_csv('data_nsclc/Lab.csv')

In [25]:
LOINC_MAPPINGS = {
    'hemoglobin': ['718-7', '20509-6'],
    'wbc': ['26464-8', '6690-2'],
    'platelet': ['26515-7', '777-3', '778-1'],
    'creatinine': ['2160-0', '38483-4'],
    'bun': ['3094-0'],
    'sodium': ['2947-0', '2951-2'],
    'bicarbonate': ['1963-8', '1959-6', '14627-4', '1960-4', '2028-9'],
    'chloride': ['2075-0'],
    'potassium': ['6298-4', '2823-3'],
    'albumin': ['1751-7', '35706-1', '13980-8'],
    'calcium': ['17861-6', '49765-1'],
    'total_bilirubin': ['42719-5', '1975-2'],
    'ast': ['1920-8', '30239-8'],
    'alt': ['1742-6', '1743-4', '1744-2'],
    'alp': ['6768-6']
}

In [26]:
df['ResultDate'] = pd.to_datetime(df['ResultDate'])
df['TestDate'] = pd.to_datetime(df['TestDate'])

# Impute TestDate for missing ResultDate. 
df['ResultDate'] = np.where(df['ResultDate'].isna(), df['TestDate'], df['ResultDate'])

# Flatten LOINC codes 
all_loinc_codes = sum(LOINC_MAPPINGS.values(), [])

# Filter for LOINC codes 
df = df[df['LOINC'].isin(all_loinc_codes)]

# Map LOINC codes to lab names
for lab_name, loinc_codes in LOINC_MAPPINGS.items():
    mask = df['LOINC'].isin(loinc_codes)
    df.loc[mask, 'lab_name'] = lab_name

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[mask, 'lab_name'] = lab_name


In [29]:
df.query('lab_name == "wbc"').TestUnits.value_counts()

TestUnits
10*3/uL     1951043
10*3/mm3      63488
10*3          32181
10            30527
10*9/L        16579
U/L            4083
/mm3           1185
10*3/L          861
mm3             620
%               502
10*3/mL         283
/HPF            252
/uL             120
kU/L             59
cell/uL          34
uL               12
Name: count, dtype: int64

In [33]:
df.query('lab_name == "wbc"').query('TestUnits == "/uL"').TestResultCleaned.describe()

count    18.000000
mean      0.100139
std       0.396445
min       0.000200
25%       0.000775
50%       0.002350
75%       0.009250
max       1.688000
Name: TestResultCleaned, dtype: float64