Creates a dummy dataset of patient data, including randomly generated location info.

In [0]:
import math, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
# Path of the source CSV that the patient records are read from.
FILENAME="data.csv"

Generate dummy clinics

In [0]:
# size of the map as (x extent, y extent); used below as the default
# max_x / max_y when sampling locations
MAP_DIM = (1000, 1000)

In [121]:
# Number of clinics to place: the integer part of the square root of the
# longer map side.
NUM_CLINICS = int(math.sqrt(max(MAP_DIM[0], MAP_DIM[1])))
NUM_CLINICS

31

In [0]:
def generate_clinics(num=NUM_CLINICS, max_x=MAP_DIM[0], max_y=MAP_DIM[1], 
                     saveto="clinics.csv"):
    ''' 
    generates clinic location data by sampling distinct integer grid points 
    from a 2d uniform distribution
    
    Args:
    num: number of clinics
    max_x, max_y: dimensions of map; x is drawn from [0, max_x - 1] and 
        y from [0, max_y - 1]
    saveto: output csv file name; nothing is written if empty or None

    Returns:
    list of num distinct (x, y) tuples, in the order they were drawn

    Raises:
    ValueError: if num is negative or larger than the number of distinct 
        points on the map (the sampling loop could never finish)
    '''
    # number of distinct grid points; 0 if either dimension is non-positive
    capacity = max(max_x, 0) * max(max_y, 0)
    if num < 0 or num > capacity:
        raise ValueError(
            "cannot place %d distinct clinics on a %d x %d map" 
            % (num, max_x, max_y))

    # randomly generate clinics; the list keeps draw order while the set 
    # makes the duplicate check constant time
    clinics = []
    taken = set()
    while len(clinics) < num:
        # x is drawn before y (tuple elements evaluate left to right)
        clinic = (random.randint(0, max_x - 1), random.randint(0, max_y - 1))
        if clinic in taken:
            continue  # location already used, draw again
        taken.add(clinic)
        clinics.append(clinic)

    if saveto:
        pd.DataFrame(clinics).to_csv(saveto)
    return clinics

In [0]:
# Sample the clinic locations with the default settings; with the default
# saveto this also writes them to "clinics.csv".
clinics = generate_clinics()

Generate dummy patients

---



In [0]:
# Load the raw CSV for a quick inspection.
data = pd.read_csv(FILENAME)

In [125]:
# (rows, columns) of the raw CSV
data.shape

(294, 14)

In [0]:
def generate_patients(clinics = clinics, file=FILENAME, max_x=MAP_DIM[0], 
                    max_y=MAP_DIM[1], compute_dists=True, concact=True, 
                    saveto="patients.csv"):
    ''' 
    generates patient data by reading dummy data set and randomly sampling from 
    uniform for location
    
    Args:
    clinics: list of clinics as (x,y) coordinates
    file: filename of dataset
    max_x, max_y: dimensions of map; x is drawn from [0, max_x - 1] and 
        y from [0, max_y - 1]
    compute_dists: if True compute distances between client and all clinics as 
        features (one column per clinic); otherwise use (x,y) as two columns
    concact: whether to return data as concacted data frame 
        or separately (data, sampled dists)
    saveto: output csv filename; nothing is written if empty or None

    Returns:
    the concacted data frame if concact is True, otherwise the tuple 
    (data, dist); the added columns are labelled 0, 1, 2, ...
    '''
    data = pd.read_csv(file)

    # '?' marks a missing value in the file; turn it into NaN so every 
    # column can be cast to float
    data = data.replace("?", np.nan).astype(float)

    num_patients = data.shape[0]

    # compute dist for each clinic or just use (x,y)
    added_cols = len(clinics) if compute_dists else 2

    # dict with key representing columns in pd.df; each value collects one 
    # entry per patient
    dist = {i: [] for i in range(added_cols)}

    for _ in range(num_patients):
        x = random.randint(0, max_x - 1)
        y = random.randint(0, max_y - 1)

        if compute_dists:
            # compute distance to all the available clinics as features
            for i, (c_x, c_y) in enumerate(clinics):
                dist[i].append(np.linalg.norm([x - c_x, y - c_y]))
        else:
            # use location coordinates as features; append (rather than 
            # assign) so every patient keeps its own row
            dist[0].append(x)
            dist[1].append(y)

    dist = pd.DataFrame(dist)
    concacted = pd.concat([data, dist], axis=1)

    # the combined frame is saved regardless of the concact flag
    if saveto:
        concacted.to_csv(saveto)

    return concacted if concact else (data, dist)
    

In [0]:
# Build the patient table with the default settings (one distance column per
# clinic); with the default saveto this also writes "patients.csv".
patients=generate_patients()

In [128]:
# Display the generated patient table.
patients

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
0,28.0,1.0,2.0,130.0,132.0,0.0,2.0,185.0,0.0,0.0,,,,0.0,226.296266,392.928747,101.242284,435.600735,506.211418,465.188134,382.582017,591.145498,569.106317,576.101554,409.441082,712.774859,453.027593,772.582682,624.679118,499.961999,656.555405,610.312215,576.933272,621.485318,547.153543,587.241007,367.662889,206.344372,340.812265,326.833291,802.257440,511.754824,531.672832,319.363429,694.398301
1,29.0,1.0,2.0,120.0,243.0,0.0,0.0,160.0,0.0,0.0,,,,0.0,809.143374,671.752931,737.634733,420.186863,314.428052,903.329951,463.838334,355.879193,778.614796,209.990476,454.300561,651.586525,719.653389,1067.601517,625.719586,184.567061,65.192024,781.282279,275.697298,686.997089,189.509894,229.200785,448.206426,844.855609,806.549441,412.704495,391.031968,136.132289,216.316897,351.573605,262.480475
2,29.0,1.0,2.0,140.0,,0.0,0.0,170.0,0.0,0.0,,,,0.0,658.502088,335.073126,727.882545,319.513693,856.308940,515.686921,336.719468,319.249119,244.460631,818.306788,318.001572,46.097722,307.546744,472.839296,69.720872,726.944977,625.645267,205.922315,398.854610,93.813645,489.121662,841.280572,359.356369,817.692485,518.713794,425.018823,425.248163,585.451962,790.977244,502.674845,457.482240
3,30.0,0.0,1.0,170.0,237.0,0.0,1.0,170.0,0.0,0.0,,,6.0,0.0,861.246771,782.051789,762.895799,550.436191,210.983412,998.098192,581.835888,516.225726,909.356916,96.462428,578.523984,811.285400,836.622974,1196.035117,777.158928,163.713164,234.036322,918.131254,435.821064,833.616219,341.825979,99.859902,562.337977,861.632172,889.811778,512.675336,559.432748,250.569352,135.484316,433.761455,432.611835
4,31.0,0.0,2.0,100.0,219.0,0.0,1.0,150.0,0.0,0.0,,,,0.0,613.142724,447.835907,568.833895,198.600101,380.326176,681.822558,239.726928,194.064422,558.605406,333.138109,230.755282,471.832597,495.665210,846.991145,427.235298,243.129595,206.903359,565.053095,130.299655,480.676606,68.942005,356.103917,224.539529,679.776434,590.234699,196.127510,350.720687,103.232747,306.806128,159.765453,224.207493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,52.0,1.0,4.0,160.0,331.0,0.0,0.0,94.0,1.0,2.5,,,,1.0,232.441390,136.014705,279.064509,268.001866,623.105930,287.083612,203.160035,436.405774,313.312943,649.559851,228.650388,475.378796,198.012626,543.382922,384.480169,554.116414,618.037216,353.318553,456.400044,370.276923,471.663015,667.683308,203.855341,371.258670,180.360750,215.390343,653.918955,490.652627,607.220718,279.692688,579.054402
290,54.0,0.0,3.0,130.0,294.0,0.0,1.0,100.0,1.0,0.0,2.0,,,1.0,167.074834,225.248751,311.708197,464.620275,815.437306,109.416635,402.031093,626.355330,324.607455,853.874113,423.624834,582.439696,236.630514,433.847900,495.358456,760.711509,830.752069,374.012032,658.246155,451.220567,682.358410,870.723837,408.396866,345.638540,34.014703,428.317639,837.785772,704.014915,810.456661,491.757054,778.607090
291,56.0,1.0,4.0,155.0,342.0,1.0,0.0,150.0,1.0,3.0,2.0,,,1.0,463.001080,484.385177,353.872859,371.381206,263.427409,647.808614,352.279718,473.887117,652.357264,323.396042,369.109740,691.045585,548.731264,909.982417,618.262080,249.098374,435.197656,680.227168,428.453031,644.040371,364.522976,335.219331,328.876877,453.001104,526.715293,262.274665,653.241915,294.294071,278.890660,188.904738,525.205674
292,58.0,0.0,2.0,180.0,393.0,0.0,0.0,110.0,1.0,1.0,2.0,,7.0,1.0,918.044117,608.172673,951.458354,455.619359,877.385320,819.873771,509.136524,322.800248,556.400036,801.140437,481.814280,268.151077,599.770790,775.085157,341.410310,731.921444,542.542164,519.849978,380.663894,402.369233,471.338520,823.293994,523.026768,1054.381809,802.286732,569.404074,226.176922,579.615390,789.496675,613.157402,343.700451


In [129]:
# Spot-check a single cell: missing entries show up as NaN.
patients.iloc[0]['slope']

nan

Generate Hypothetical Clinic Matching for Supervised Learning

In [0]:
def matching_func(client, num_features=14):
    '''
    match client to closest but also factors in chol and trestbps

    The closest clinic is the position of the smallest distance value. 
    If chol is above 300 the match is shifted by a random 0-10 positions, 
    otherwise if trestbps is above 155 it is shifted by a random 0-5 
    positions. The shift is upwards when sex == 1 and downwards otherwise, 
    wrapping around the number of clinics. Missing (NaN) chol / trestbps 
    never trigger a shift.

    Args:
    client: pd series (one patient row) whose first num_features values 
        are the original features and whose remaining values are the 
        distances to each clinic; must have 'chol', 'trestbps', 'sex' labels
    num_features: number of leading non-distance values to skip

    Returns:
    position of the matched clinic among the distance values

    Raises:
    ValueError: if the row holds no distance values

    TODO: make func more realistic for better dummy data
    '''
    dists = list(client)[num_features:]
    if not dists:
        raise ValueError(
            "client row has no distance values after the first %d entries" 
            % num_features)
    closest = np.argmin(dists)

    # pick how far the match may drift from the closest clinic
    if pd.notnull(client['chol']) and client['chol'] > 300:
        max_shift = 10
    elif pd.notnull(client['trestbps']) and client['trestbps'] > 155:
        max_shift = 5
    else:
        return closest

    shift = random.randint(0, max_shift)
    if client['sex'] != 1:
        # anything other than sex == 1 (including NaN) drifts downwards
        shift = -shift
    return (closest + shift) % len(dists)
    

def match(patients, saveto="matchings.csv", matcher=None):
    ''' 
    generates dummy patient data for where the patient went 
    for treatment. output used for supervised learning tasks.
    
    Args:
    patients: pd dataframe with one row of client data per patient
    saveto: output csv filename; nothing is written if empty or None
    matcher: function mapping one patient row (pd series) to a clinic; 
        defaults to matching_func

    Returns:
    pd dataframe with a single "MatchedClinic" column, one row per 
    patient in the order of patients, indexed 0..n-1
    '''
    if matcher is None:
        matcher = matching_func

    # collect results by row position, not by index label, so frames whose 
    # index is not 0..n-1 (e.g. filtered or re-indexed) work as well
    matched_clinics = [matcher(row) for _, row in patients.iterrows()]

    dataframe = pd.DataFrame({"MatchedClinic" : matched_clinics})
    
    if saveto:
        dataframe.to_csv(saveto)
    return dataframe

In [131]:
# Assign a clinic to every patient and display the result; with the default
# saveto this also writes "matchings.csv".
match(patients)

Unnamed: 0,MatchedClinic
0,2
1,16
2,11
3,9
4,20
...,...
289,9
290,24
291,30
292,20
