In [1]:
import numpy as np
import pandas as pd
import random

# Create dummy data

### Lab results

In [2]:
results = pd.DataFrame(columns=['stay_id', 'value', 'itemid', 'charttime'])
results

Unnamed: 0,stay_id,value,itemid,charttime


In [3]:
# Choose unique IDs 500 times with replacement
results.stay_id = random.choices(range(100, 200), k=500)

# Generate random test results
results.value = np.random.uniform(low=0.0, high=100.0, size=(500,))

# Seven tests => itemid ranges from 0 to 6 
results.itemid = random.choices(range(0,6), k=500)

In [4]:
# Generate timestamps
def random_dates(start, end, n):

    start_u = start.value//10**9
    end_u = end.value//10**9

    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

start = pd.to_datetime('2015-01-05')
end = pd.to_datetime('2015-01-10')
results.charttime = random_dates(start, end, results.shape[0])

In [5]:
results.dtypes

stay_id               int64
value               float64
itemid                int64
charttime    datetime64[ns]
dtype: object

In [6]:
print(results.shape)
results.head()

(500, 4)


Unnamed: 0,stay_id,value,itemid,charttime
0,172,16.743031,5,2015-01-07 09:48:54
1,139,89.922258,0,2015-01-05 12:58:20
2,119,59.566609,5,2015-01-09 10:32:51
3,115,50.516263,3,2015-01-07 10:32:02
4,188,87.03553,0,2015-01-06 09:38:23


### Admission time and static vars

In [7]:
adm = pd.DataFrame(columns=['stay_id', 'intime', 'age', 'sex'])
adm

Unnamed: 0,stay_id,intime,age,sex


In [8]:
# Copy unique stay IDs
adm.stay_id = results.stay_id.unique()

# Genarate timestamps
start = pd.to_datetime('2015-01-01')
end = pd.to_datetime('2015-01-05')
adm.intime = random_dates(start, end, adm.shape[0])

# Populate the age column
adm.age = random.choices(range(18, 90), k=adm.shape[0])

# Populate the sex column
adm.sex = random.choices(['F', 'M'], k=adm.shape[0])

In [9]:
print(adm.shape)
adm.head()

(100, 4)


Unnamed: 0,stay_id,intime,age,sex
0,172,2015-01-02 18:40:17,68,M
1,139,2015-01-02 21:41:57,69,M
2,119,2015-01-03 09:39:59,86,F
3,115,2015-01-01 12:16:03,70,F
4,188,2015-01-01 02:10:06,55,F


### Summary stats

In [10]:
results.stay_id.nunique()

100

In [11]:
results.groupby('itemid').size()

itemid
0    83
1    79
2    82
3    77
4    82
5    97
dtype: int64

In [12]:
results.groupby('stay_id').size()

stay_id
100     6
101     6
102     5
103     8
104     3
       ..
195     1
196     9
197     2
198     3
199    10
Length: 100, dtype: int64

In [13]:
results.groupby('stay_id').itemid.nunique()

stay_id
100    3
101    4
102    4
103    5
104    2
      ..
195    1
196    5
197    2
198    2
199    5
Name: itemid, Length: 100, dtype: int64

In [14]:
assert (results.groupby('stay_id').size() >= results.groupby('stay_id').itemid.nunique()).all()
(results.groupby('stay_id').size() > results.groupby('stay_id').itemid.nunique()).sum()

76

### Merge dataframes

In [15]:
final_df = results.merge(adm)

### Save data

In [16]:
results.to_csv('../data/dummy_results.csv', index=False)
adm.to_csv('../data/dummy_admissions.csv', index=False)
final_df.to_csv('../data/dummy_data_combined.csv', index=False)