In [7]:
import pandas as pd
import numpy as np 
import random as rd

In [18]:
# The JSON file has to sit in the same directory as this notebook.
D = pd.read_json('diseases_db.json')

In [19]:
D
# D is the pandas DataFrame loaded from diseases_db.json; the bare expression above displays it.

Unnamed: 0,name,id,symptoms
0,Cold,80,"[{'id': 28, 'probability': 0.7058823390000001}..."
1,Flu,11,"[{'id': 54, 'probability': 0.54}, {'id': 29, '..."
2,Hay fever,367,"[{'id': 28, 'probability': 0.8823529000000001}..."
3,Flu-related rhinitis,368,"[{'id': 11, 'probability': -1.0}, {'id': 15, '..."
4,State of stress,606,"[{'id': 54, 'probability': 0.9}, {'id': 131, '..."
...,...,...,...
95,Inflammation of gingiva,474,"[{'id': 97, 'probability': 0.54}, {'id': 1008,..."
96,Herpes,82,"[{'id': 97, 'probability': 0.36}, {'id': 62, '..."
97,Allergy to bee or wasp venom,202,"[{'id': 170, 'probability': 0.303}, {'id': 207..."
98,Excessive overweight,97,"[{'id': 157, 'probability': 0.9}, {'id': 154, ..."


In [20]:
# PART ONE
# Write Diseases/Diseases.csv, the lookup table (id, name) used to identify diseases later on.

# Drop the symptom column, then flip the remaining column order from name,id to id,name.
DD = D.drop(columns=['symptoms'])
DD = DD.iloc[:, ::-1]
DD.to_csv("Diseases/Diseases.csv", index=False, encoding='utf8')

In [21]:
# Width of one parsed symptom row. get_att_for_line stores every symptom as two
# consecutive slots (id, probability), so at most symcap // 2 symptoms per disease fit.
# When in doubt just pick a very large number: it only costs memory and run time.
symcap = 1000
# Number of synthetic patients generated per disease.
patientnumber = 500

In [22]:
# PART TWO
def get_att_for_line(d):
    """Parse the string form of one disease's symptom list into a flat row.

    Args:
        d (str): ``str()`` of one ``D['symptoms']`` entry, which looks like
            ``[{'id': 29, 'probability': 0.08999999}, {'id': ...``

    Returns:
        np.ndarray: float array of shape ``(1, symcap)`` laid out as
            ``[id1, probability1, id2, probability2, ...]``; unused trailing
            slots stay 0. An empty symptom list (``'[]'``) gives an all-zero row.

    Raises:
        ValueError: if the text after a ``:`` is not a number.
        IndexError: if the line holds more than ``symcap`` values.
    """
    matTemp = np.zeros((1, symcap))
    slot = 0
    for token in d.split(','):
        # Keep only what sits between the ':' and an optional closing '}'.
        value = token.partition(':')[2].partition('}')[0].strip()
        if not value:
            # No "key: value" pair in this token (e.g. '[]' for a disease without
            # symptoms); assigning '' into the float row would raise ValueError.
            continue
        matTemp[0][slot] = float(value)
        slot += 1

    return matTemp

In [23]:
def get_att_matrix():
    """Parse the symptom column of the global DataFrame ``D`` into one matrix.

    Returns:
        np.ndarray: array of shape ``(D.shape[0], symcap)`` with one row per
            disease, laid out as
            ``[[id1(disease1), acc1(disease1), id2(disease1), ...],
               [id1(disease2), acc1(disease2), id2(disease2), ...],
               ...]``
    """
    mat = np.zeros((D.shape[0], symcap))
    for index, row in D.iterrows():
        parsed = get_att_for_line(str(row['symptoms']))
        # The index label of D doubles as the row position in the matrix.
        mat[index, :parsed.shape[1]] = parsed[0]
    return mat

In [24]:
# Build the interleaved symptom matrix with the functions defined above.
mat = get_att_matrix()

# Even columns hold the symptom ids: one row per disease, one column per symptom slot.
#   IDS = [[id1(disease1), id2(disease1), id3(disease1), ...],
#          [id1(disease2), id2(disease2), id3(disease2), ...],
#          ...]
IDS = mat[:, ::2]

# Odd columns hold the matching probabilities, in the same layout.
#   SYM = [[acc1(disease1), acc2(disease1), acc3(disease1), ...],
#          [acc1(disease2), acc2(disease2), acc3(disease2), ...],
#          ...]
SYM = mat[:, 1::2]

In [25]:
def get_symptoms_and_acc_cut(ids,sym,row):
    """Return the used symptom ids and probabilities of one disease, padding removed.

    Args:
        ids (np.ndarray): 2-D matrix of symptom ids, one row per disease:
            ``[[id1(disease1), id2(disease1), ...], [id1(disease2), ...], ...]``
        sym (np.ndarray): 2-D matrix of the matching probabilities, same layout.
        row (int): index of the disease row to extract.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(ids, probabilities)`` as two 1-D arrays
            of equal length.

    Note:
        The number of used slots is the count of non-zero ids in the row, and that
        many *leading* slots are kept. Entries whose id is negative are then
        dropped from both arrays.
    """
    # How many slots of this row carry a non-zero id.
    used_slots = int((ids[row] != 0).sum())
    leading_ids = ids[row, :used_slots]
    leading_probs = sym[row, :used_slots]
    # Discard entries with a negative id (boolean indexing returns fresh arrays).
    keep = ~(leading_ids < 0)
    return leading_ids[keep], leading_probs[keep]

In [26]:
# SYMPTOMS APPLICATION

def get_symptoms_used(IDS,SYM):
    """Collect every symptom id that is used by at least one disease.

    Args:
        IDS (np.ndarray): 2-D matrix of symptom ids, one row per disease.
        SYM (np.ndarray): 2-D matrix of the matching probabilities, same layout.

    Returns:
        np.ndarray: sorted 1-D array of the distinct symptom ids over all rows
            of ``IDS`` (empty if ``IDS`` has no rows).
    """
    # Iterate over the rows of the matrix that was passed in, not over the global
    # `mat`, so the function works for any IDS/SYM pair.
    per_disease = [
        get_symptoms_and_acc_cut(IDS, SYM, row)[0] for row in range(IDS.shape[0])
    ]
    if not per_disease:
        return np.array([])
    # One concatenation instead of re-allocating the growing array per disease.
    return np.unique(np.concatenate(per_disease))

In [27]:
# Every symptom id referenced by at least one disease, as plain Python ints.
symliste = get_symptoms_used(IDS, SYM)
flist = [int(symptom_id) for symptom_id in symliste]

# Convert the symptom catalogue from JSON to CSV and load it back from that CSV.
df = pd.read_json('Symptoms/Symptoms.json')
df.to_csv('Symptoms/Symptoms.csv')
# NOTE(review): patientData_reviews is not used in the visible cells - confirm it is still needed.
patientData_reviews = pd.read_csv('Symptoms/Symptoms.csv')
df = pd.read_csv('Symptoms/Symptoms.csv', index_col=[0])

# Keep only the catalogue rows whose ID is actually used and overwrite the CSV with them.
boolean_series = df['ID'].isin(flist)
filtered_df = df.loc[boolean_series]
filtered_df.to_csv('Symptoms/Symptoms.csv', index=None)

# Used symptom ids as an array; this fixes the column order of the patient matrix.
symidvec = np.asarray(flist)

In [28]:
def generate_patients(Label,IDST,SYMT,count,symptom_ids=None):
    """Generate ``count`` synthetic patients for a single disease.

    Args:
        Label: disease id; stored as ``int(Label)`` in the first column.
        IDST (np.ndarray): 1-D array of the symptom ids of this disease.
        SYMT (np.ndarray): 1-D array of the matching probabilities.
        count (int): number of patients to generate.
        symptom_ids (iterable, optional): symptom ids that define the columns,
            in order. Defaults to the module-level ``symidvec``.

    Returns:
        np.ndarray: array of shape ``(count, 1 + len(symptom_ids))``. Column 0
            holds the label; every further column holds 0/1 flags for one symptom:
            * symptom not listed in ``IDST``  -> all 0
            * listed with probability ``p >= 0`` -> 1 with weight ``p``, 0 with ``1 - p``
            * listed with a negative probability -> a prevalence is drawn once
              from ``uniform(0, 0.5)`` and used as ``p`` for all patients
    """
    if symptom_ids is None:
        symptom_ids = symidvec

    # Collect the rows and stack once; stacking inside the loop copies the whole
    # intermediate array for every symptom.
    rows = [np.full(count, int(Label))]
    for symptom_id in symptom_ids:
        matches = np.argwhere(IDST == symptom_id).flatten()
        if len(matches) == 0:
            rows.append(np.zeros(count))
            continue
        probability = SYMT[matches[0]]
        if probability < 0:
            # Negative probabilities (e.g. -1.0 in the data) are replaced by a
            # random prevalence - presumably they mark "unknown"; TODO confirm.
            probability = rd.uniform(0, 0.5)
        rows.append(rd.choices([1, 0], weights=[probability, 1 - probability], k=count))
    return np.vstack(rows).transpose()

In [29]:
def generate_database_mat(nr):
    """Generate ``nr`` synthetic patients for every disease and stack them.

    Reads the module-level ``IDS``, ``SYM`` and ``D``.

    Args:
        nr (int): number of patients per disease.

    Returns:
        np.ndarray: the per-disease blocks returned by ``generate_patients``,
            stacked vertically in disease order.

    Side effects:
        Prints the shape of the first disease's block.
    """
    blocks = []
    for x in range(IDS.shape[0]):
        IDST, SYMT = get_symptoms_and_acc_cut(IDS, SYM, x)
        blocks.append(generate_patients(D['id'][x], IDST, SYMT, nr))
        if x == 0:
            print(blocks[0].shape)
    # Stack once at the end; re-stacking inside the loop copies the whole growing
    # matrix for every disease.
    return np.vstack(blocks)

In [30]:
def get_frames(nr):
    """Build the patient DataFrame with ``nr`` patients per disease.

    Args:
        nr (int): number of patients per disease.

    Returns:
        pd.DataFrame: one row per patient. The first column is labelled 0 and
            holds the disease id; the remaining columns are labelled with the
            symptom ids from ``symidvec``.

    Note:
        The header row is stacked on top of the data and split off again, so the
        column labels take the dtype of the stacked matrix (floats such as
        ``9.0`` in the recorded output). Prints the shape of the stacked matrix.
    """
    header = np.append([0], symidvec).astype(int)
    stacked = np.vstack((header, generate_database_mat(nr)))
    print(stacked.shape)
    return pd.DataFrame(data=stacked[1:, :], columns=stacked[0, :])

In [31]:
# Generate the synthetic patient table (patientnumber patients per disease).
kk = get_frames(patientnumber)
# The formatted path resolves to ./Patients/patientsPatients.csv
kk.to_csv("./Patients/patients{}.csv".format("Patients"), index=False, encoding='utf8')

    

(500, 215)
(50001, 215)


In [32]:
# Mix the patient rows with random pairwise row swaps and save the mixed table.
n_swaps = 10000
n_rows = len(kk)

# Work on a private copy of the values: `npKK = kk` would only alias the frame,
# so the swaps would also reorder `kk` itself.
shuffled = kk.to_numpy(copy=True)
if n_rows > 0:
    for _ in range(n_swaps):
        # Bounds follow the actual row count instead of a hard-coded 49999.
        x = rd.randint(0, n_rows - 1)
        y = rd.randint(0, n_rows - 1)
        # Fancy indexing materialises the right-hand side before assigning, so
        # this is a true swap. Holding `df.iloc[x]` in a temporary is not safe:
        # it can be a view, in which case row x is overwritten before it is
        # copied to row y and row y ends up duplicated.
        shuffled[[x, y]] = shuffled[[y, x]]

npKK = pd.DataFrame(shuffled, columns=kk.columns)

npKK.to_csv("./Patients/patients{}.csv".format("PatientsMIXED"), index=False, encoding='utf8')


In [33]:
# Display the mixed patient table: the first column holds the disease id, the rest are 0/1 symptom flags.
npKK

Unnamed: 0,0.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,...,1001.0,1002.0,1003.0,1004.0,1005.0,1008.0,1009.0,1010.0,1011.0,1014.0
0,80.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,374.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,80.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,80.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,80.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,147.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,437.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,351.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,437.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
