In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three hospital datasets from the local test/ directory.
general, prenatal, sports = map(
    pd.read_csv,
    ('test/general.csv', 'test/prenatal.csv', 'test/sports.csv'),
)

# Show at most 8 columns when a frame is printed.
pd.set_option('display.max_columns', 8)

In [3]:
# Relabel prenatal and sports with general's column names. The assignment is
# positional, so it relies on all three files listing their columns in the
# same order -- NOTE(review): confirm against the CSV headers.
prenatal.columns = sports.columns = general.columns


In [4]:
# Stack the three hospitals into one frame with a fresh 0..n-1 index, then
# discard the first column by position.
# NOTE(review): presumably that column is a saved row index -- confirm.
combined_data = pd.concat([general, prenatal, sports], ignore_index=True).iloc[:, 1:]

# Remove rows in which every remaining field is missing.
combined_data = combined_data.dropna(how='all')

# Collapse the spelled-out gender labels to 'm' / 'f'; any other value
# (including missing ones) passes through unchanged.
gender_codes = {'male': 'm', 'man': 'm', 'female': 'f', 'woman': 'f'}
combined_data['gender'] = combined_data['gender'].map(
    lambda label: gender_codes.get(label, label)
)

# Every prenatal row is recorded as 'f', overwriting whatever was there.
is_prenatal = combined_data['hospital'] == 'prenatal'
combined_data.loc[is_prenatal, 'gender'] = 'f'

# Missing entries in these columns become 0.
zero_fill_columns = ['bmi', 'diagnosis', 'blood_test', 'ecg', 'ultrasound', 'mri', 'xray', 'children', 'months']
missing = combined_data[zero_fill_columns].isna()
for column in zero_fill_columns:
    combined_data.loc[missing[column], column] = 0

In [5]:
# Preview 20 randomly drawn rows; the fixed random_state makes the draw repeatable.
preview_rows = combined_data.sample(n=20, random_state=30)
print(preview_rows)


     hospital gender   age  height  ...  mri  xray children months
929    sports      f  23.0   6.809  ...    t     f      0.0    0.0
927    sports      m  21.0   6.052  ...    t     f      0.0    0.0
516  prenatal      f  20.0   1.650  ...    0     f      1.0    4.0
87    general      m  54.0   1.720  ...    0     0      0.0    0.0
885    sports      f  16.0   5.915  ...    t     f      0.0    0.0
463  prenatal      f  34.0   1.650  ...    0     f      1.0    5.0
112   general      m  77.0   1.690  ...    0     0      0.0    0.0
297   general      m  56.0   1.480  ...    0     0      0.0    0.0
417   general      f  26.0   1.650  ...    0     0      0.0    0.0
660  prenatal      f  38.0   1.590  ...    0     f      1.0    4.0
344   general      f  60.0   1.410  ...    0     0      0.0    0.0
834    sports      f  21.0   5.585  ...    f     t      0.0    0.0
10    general      m  27.0   1.850  ...    0     0      0.0    0.0
56    general      m  23.0   1.650  ...    0     0      0.0   

    1. Which hospital has the highest number of patients?
    2. What share of the patients in the general hospital suffers from stomach-related issues? Round the result to the third decimal place.
    3. What share of the patients in the sports hospital suffers from dislocation-related issues? Round the result to the third decimal place.
    4. What is the difference in the median ages of the patients in the general and sports hospitals?
    5. After data processing at the previous stages, the blood_test column has three values: t = a blood test was taken, f = a blood test wasn't taken, and 0 = there is no information. In which hospital was the blood test taken most often (i.e., which hospital has the largest number of t values in the blood_test column)? How many blood tests were taken there?

In [6]:
# Question 1: count rows per hospital and keep only the largest group.
patients_per_hospital = combined_data.groupby('hospital').size()
patients_per_hospital.nlargest(1)


hospital
general    461
dtype: int64

In [7]:
# Question 2: restrict to the general hospital, then report the fraction of
# its rows whose diagnosis is 'stomach', rounded to three decimals.
general_patients = combined_data[combined_data['hospital'] == 'general']

general_diagnosis_counts = general_patients.groupby('diagnosis').size()
(general_diagnosis_counts['stomach'] / len(general_patients)).round(3)

0.325

In [8]:
# Question 3: restrict to the sports hospital, then report the fraction of
# its rows whose diagnosis is 'dislocation', rounded to three decimals.
sports_patients = combined_data[combined_data['hospital'] == 'sports']

sports_diagnosis_counts = sports_patients.groupby('diagnosis').size()
(sports_diagnosis_counts['dislocation'] / len(sports_patients)).round(3)

0.285

In [9]:
# Question 4: median age of general patients minus median age of sports patients.
median_age_general = general_patients['age'].median()
median_age_sports = sports_patients['age'].median()
median_age_general - median_age_sports

19.0

In [14]:
# Question 5: tabulate the number of rows for every (hospital, blood_test) pair;
# the 't' rows give the number of blood tests taken per hospital.
blood_test_counts = combined_data.groupby(['hospital', 'blood_test']).size()
blood_test_counts

hospital  blood_test
general   0              72
          f             206
          t             183
prenatal  t             325
sports    0             214
dtype: int64

0      sports
1      sports
2      sports
3      sports
4      sports
        ...  
212    sports
213    sports
214    sports
215    sports
216    sports
Name: hospital, Length: 217, dtype: object