In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', 500)
pd.set_option('display.show_dimensions', True)
pd.set_option('display.max_columns', None)
%matplotlib inline
plt.style.use('fivethirtyeight')

# AFTYGH Respiratory Therapy

The goal of this project aims at assisting clinical evaluation and analysis with artificial intelligence in respiratory therapy using the dataset from AFTYGH.

## Physical Examination Dataset

In [None]:
# Load in the dataset
examination = pd.read_excel('/home/yungshun/workspace/py/sklearn-aftygh-respiratory-therapy/datasets/檢驗報告2.xlsx')

In [5]:
# Take at look on first 500 rows
examination.head(500)

Unnamed: 0,病歷號,性別,年齡,醫囑號,來源號,病患來源,採檢位置,收件日期,收件時間,項目,項目名稱,檢體別,檢驗數值,抗生素名,藥敏結果,檢驗註記,危險值註記,開單日期,開單時間,報告日期,報告時間,8349235
0,8349235.0,F,92.0,4082245.0,126012025.0,I,J08F0802C,20120418.0,2315.0,72-721,Gram Stain,SP,Saliva contamination(唾液污染),,,NONE,NONE,20120418.0,1647.0,20120418.0,2319.0,
1,8349235.0,F,92.0,4088359.0,126012025.0,I,J08F0802C,20120423.0,1419.0,72-701,Color,U,Dark Yellow,,,NONE,NONE,20120423.0,1314.0,20120423.0,1433.0,97506778.0
2,8349235.0,F,92.0,4088359.0,126012025.0,I,J08F0802C,20120423.0,1419.0,72-701,Turbidity,U,Turbid,,,NONE,NONE,20120423.0,1314.0,20120423.0,1433.0,92399013.0
3,8349235.0,F,92.0,4088359.0,126012025.0,I,J08F0802C,20120423.0,1419.0,72-701,SP.Gravity,U,1.023,,,NONE,NONE,20120423.0,1314.0,20120423.0,1433.0,92427800.0
4,8349235.0,F,92.0,4088359.0,126012025.0,I,J08F0802C,20120423.0,1419.0,72-701,pH,U,6,,,NONE,NONE,20120423.0,1314.0,20120423.0,1433.0,8889414.0
5,8349235.0,F,92.0,4088359.0,126012025.0,I,J08F0802C,20120423.0,1419.0,72-701,Leukocyte,U,2+,,,H,NONE,20120423.0,1314.0,20120423.0,1433.0,92197867.0
6,8349235.0,F,92.0,4088359.0,126012025.0,I,J08F0802C,20120423.0,1419.0,72-701,Nitrite,U,Negative,,,NONE,NONE,20120423.0,1314.0,20120423.0,1433.0,92398510.0
7,8349235.0,F,92.0,4088359.0,126012025.0,I,J08F0802C,20120423.0,1419.0,72-701,Protein,U,2+ (100),,,H,NONE,20120423.0,1314.0,20120423.0,1433.0,20140218.0
8,8349235.0,F,92.0,4088359.0,126012025.0,I,J08F0802C,20120423.0,1419.0,72-701,Glucose,U,Negative,,,NONE,NONE,20120423.0,1314.0,20120423.0,1433.0,92124210.0
9,8349235.0,F,92.0,4088359.0,126012025.0,I,J08F0802C,20120423.0,1419.0,72-701,Ketone,U,Trace (5),,,H,NONE,20120423.0,1314.0,20120423.0,1433.0,3523327.0


In [6]:
# Get a sense of how many rows of data there are, if there are any missing values, and what data type each column has
examination.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350534 entries, 0 to 350533
Data columns (total 22 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   病歷號      350534 non-null  object
 1   性別       350534 non-null  object
 2   年齡       350534 non-null  object
 3   醫囑號      350534 non-null  object
 4   來源號      350534 non-null  object
 5   病患來源     350534 non-null  object
 6   採檢位置     350534 non-null  object
 7   收件日期     350534 non-null  object
 8   收件時間     350534 non-null  object
 9   項目       350534 non-null  object
 10  項目名稱     350534 non-null  object
 11  檢體別      350534 non-null  object
 12  檢驗數值     350534 non-null  object
 13  抗生素名     350534 non-null  object
 14  藥敏結果     350534 non-null  object
 15  檢驗註記     350534 non-null  object
 16  危險值註記    350534 non-null  object
 17  開單日期     350534 non-null  object
 18  開單時間     350534 non-null  object
 19  報告日期     350534 non-null  object
 20  報告時間     350534 non-null  object
 21  8349235  1

In [7]:
# Another method to check for missing values
examination.isnull().sum()

病歷號             0
性別              0
年齡              0
醫囑號             0
來源號             0
病患來源            0
採檢位置            0
收件日期            0
收件時間            0
項目              0
項目名稱            0
檢體別             0
檢驗數值            0
抗生素名            0
藥敏結果            0
檢驗註記            0
危險值註記           0
開單日期            0
開單時間            0
報告日期            0
報告時間            0
8349235    348976
Length: 22, dtype: int64

In [8]:
examination_B = examination[examination['檢體別'] == 'B']
examination_B.head(500)

Unnamed: 0,病歷號,性別,年齡,醫囑號,來源號,病患來源,採檢位置,收件日期,收件時間,項目,項目名稱,檢體別,檢驗數值,抗生素名,藥敏結果,檢驗註記,危險值註記,開單日期,開單時間,報告日期,報告時間,8349235
17,8349235,F,92,4088358,126012025,I,J08F0802C,20120423,1355,72-530,Temperature,B,37,,,NONE,NONE,20120423,1314,20120423,1408,92141313
18,8349235,F,92,4088358,126012025,I,J08F0802C,20120423,1355,72-530,pH,B,7.374,,,NONE,NONE,20120423,1314,20120423,1408,92427106
19,8349235,F,92,4088358,126012025,I,J08F0802C,20120423,1355,72-530,pCO2,B,75.7,,,H,H,20120423,1314,20120423,1408,3875936
20,8349235,F,92,4088358,126012025,I,J08F0802C,20120423,1355,72-530,pO2,B,71.5,,,L,NONE,20120423,1314,20120423,1408,92378562
21,8349235,F,92,4088358,126012025,I,J08F0802C,20120423,1355,72-530,HCO3,B,44.6,,,H,H,20120423,1314,20120423,1408,92375049
22,8349235,F,92,4088358,126012025,I,J08F0802C,20120423,1355,72-530,TCO2,B,46.9,,,H,NONE,20120423,1314,20120423,1408,92283116
23,8349235,F,92,4088358,126012025,I,J08F0802C,20120423,1355,72-530,ABE,B,15.8,,,H,NONE,20120423,1314,20120423,1408,92152880
24,8349235,F,92,4088358,126012025,I,J08F0802C,20120423,1355,72-530,SBE,B,19.1,,,NONE,NONE,20120423,1314,20120423,1408,20744601
25,8349235,F,92,4088358,126012025,I,J08F0802C,20120423,1355,72-530,SBC,B,39.7,,,H,NONE,20120423,1314,20120423,1408,92334757
26,8349235,F,92,4088358,126012025,I,J08F0802C,20120423,1355,72-530,SAT,B,92.3,,,L,NONE,20120423,1314,20120423,1408,92348656


In [33]:
examination_B_pivoted = examination_B.pivot(columns='項目名稱', values='檢驗數值')
examination_B_pivoted.head(500)

項目名稱,A-DSDNA,A-HAV IgM,A-HBc IgM,ABE,ABO-Type,ABSOL-CD4,ABSOL-CD8,AFP,ALK-P,ALT/GPT,ANA,APTT,AST/GOT,Abnormal Lympho,Abs Eosin Count,Albumin,Alcohol,Ammonia,Amylase,Amylase(B),Anti-HAV,Anti-HBc,Anti-HBe,Anti-HBs,Anti-HCV,Atypical Type-1,Atypical Type-2,Atypical-Lympho,B-CD19,B-CD19(+),BB,BNP,BUN,BUN (B),Band,Basophil,Blast cell,Bleed Time-IVY,Blood Culture,Blood Ketone,BloodKetone定量,C3,C4,CA-125,CA15-3,CA19-9,CAH,CD4 T CELL,CD4/CD8,CD8 T CELL,CEA,CK,CK-MB,CO HB,CO2,COLD AGGL,CRP,CRYPAG,Calcium,Cholinesterase,Cl,Corr-Diph(洗腎),Corr-Dipheny.,Cortisol,Creatinine,Creatinine(B),D-COOMBS,D-dimer,D. Bilirubin,DC IgG,Digoxin,Diphenylhyda.,Direct Bilirubi,ESR,Eosinophil,Estimated AG,Estimated GFR,FAB,FDP,FSH,Fe,Ferritin,Fibrinogen,Folate,Free PSA,Free PSA Ratio,Free-T4,Fungus Culture,G6PD,GA-I,GAA,Galactose,Glucose(AC),Glucose(PC),HBeAg,HBsAg,HCO3,HCT,HDL-C,HGB,HIV 1+2 Ab,HIV 1+2 Ab Scr.,HS-CRP,Hb-A1c,Hematocrit,Hemoglobin,Hemoglobin H,Homocysteine,Hypersegmented,ICG,IDC,INR,IVA,IgA,IgE,IgG,IgM,Inorganic P,Intact-PTH,K,LDH,LDL-C(calc),LDL-C(direct),LDL-C/HDL-C,LH,Lactate (B),Lipase,Lymphocyte,MB,MCAD,MCH,MCHC,MCV,MET,MET HB,MIC,MM,MMA,MSUD,MYCO-IgM,Megakaryocyte,Meta-Myelocyte,Mg,Monocyte,Myelocyte,Myoglobin,NK-CELL,Na,Non-HDL-C,Nor.plasma mean,Nucleated RBC,O2 HB,O2CT,Osmolality(B),P.T,PLT,PSA,Phenobarbital,Plasma cell,Plasmacytoid,Platelets,Prealbumin,Procalcitonin,Prolactin,Promyelocyte,Pyruvate,RBC,RDW,RF,RHD,RPR,RUB-IgG,Reticulocyte,SAT,SBC,SBE,SCC,SCID,Segment,Sugar,T HB,T-CD3,T-CD3(+),T-CHOL/HDL-C,T-Cholesterol,T-HELP(CD4),T-SUPP(CD8),T3,T4,TCO2,TIBC,TP (B),TPPA,TSH,TSH(NB),Temperature,Testosterone,Theophylline,Total Bilirubin,Total CK,Total Protein,Transferrin,Triglyceride,Troponin-I,Uric Acid,Uric Acid (B),VLDL-C,Valproic acid,Vancomycin,Vit B12,WBC,eAG,pCO2,pH,pO2,phenylalanine,β-HCG,β2-Microglobu,γ-GT,其他TM結果_A
17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,37,,,,,,,,,,,,,,,,,,,,,,,,
18,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.374,,,,,,
19,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,75.7,,,,,,,
20,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,71.5,,,,,
21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,44.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350529,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
350530,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,35.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
350531,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.38,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
350532,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,135,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [39]:
examination_B_pivoted.shape

(252865, 220)

In [43]:
examination_B_pivoted_selected = examination_B_pivoted[['Temperature', 'pH', 'pCO2', 'pO2', 'HCO3', 'TCO2', 'ABE', 'SBE', 'SBC', 'SAT', 'Na', 'K', 'Mg', 'Cl', 'BUN (B)', 'Creatinine(B)']]
examination_B_pivoted_selected

項目名稱,Temperature,pH,pCO2,pO2,HCO3,TCO2,ABE,SBE,SBC,SAT,Na,K,Mg,Cl,BUN (B),Creatinine(B)
17,37,,,,,,,,,,,,,,,
18,,7.374,,,,,,,,,,,,,,
19,,,75.7,,,,,,,,,,,,,
20,,,,71.5,,,,,,,,,,,,
21,,,,,44.6,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350529,,,,,,,,,,,,,,,,
350530,,,,,,,,,,,,,,,,
350531,,,,,,,,,,,,,,,,0.38
350532,,,,,,,,,,,135,,,,,


In [34]:
examination_B_selected = examination_B[['病歷號', '收件日期', '收件時間']]
examination_B_selected.head(500)

Unnamed: 0,病歷號,收件日期,收件時間
17,8349235,20120423,1355
18,8349235,20120423,1355
19,8349235,20120423,1355
20,8349235,20120423,1355
21,8349235,20120423,1355
...,...,...,...
350529,92489092,20140602,533
350530,92489092,20140602,533
350531,92489092,20140602,528
350532,92489092,20140602,528


In [40]:
examination_B_selected.shape

(252865, 3)

In [95]:
examination_B_concat = pd.concat([examination_B_selected, examination_B_pivoted_selected])
examination_B_concat.head(500)

Unnamed: 0,病歷號,收件日期,收件時間,Temperature,pH,pCO2,pO2,HCO3,TCO2,ABE,SBE,SBC,SAT,Na,K,Mg,Cl,BUN (B),Creatinine(B)
17,8349235,20120423,1355,,,,,,,,,,,,,,,,
18,8349235,20120423,1355,,,,,,,,,,,,,,,,
19,8349235,20120423,1355,,,,,,,,,,,,,,,,
20,8349235,20120423,1355,,,,,,,,,,,,,,,,
21,8349235,20120423,1355,,,,,,,,,,,,,,,,
22,8349235,20120423,1355,,,,,,,,,,,,,,,,
23,8349235,20120423,1355,,,,,,,,,,,,,,,,
24,8349235,20120423,1355,,,,,,,,,,,,,,,,
25,8349235,20120423,1355,,,,,,,,,,,,,,,,
26,8349235,20120423,1355,,,,,,,,,,,,,,,,


In [96]:
examination_B_merge_rows = examination_B_concat.groupby(['病歷號', '收件日期', '收件時間']).first().reset_index()
examination_B_merge_rows.head(500)

Unnamed: 0,病歷號,收件日期,收件時間,Temperature,pH,pCO2,pO2,HCO3,TCO2,ABE,SBE,SBC,SAT,Na,K,Mg,Cl,BUN (B),Creatinine(B)
0,114400,20130125,714,,,,,,,,,,,,,,,,
1,114400,20130125,1027,,,,,,,,,,,,,,,,
2,114400,20130125,1250,,,,,,,,,,,,,,,,
3,114400,20130125,1255,,,,,,,,,,,,,,,,
4,114400,20130126,959,,,,,,,,,,,,,,,,
5,114400,20130129,449,,,,,,,,,,,,,,,,
6,114400,20130129,451,,,,,,,,,,,,,,,,
7,114400,20130313,1610,,,,,,,,,,,,,,,,
8,114400,20130313,1633,,,,,,,,,,,,,,,,
9,114400,20130313,1809,,,,,,,,,,,,,,,,


In [47]:
# Get a sense of how many rows of data there are, if there are any missing values, and what data type each column has
examination_B_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 505730 entries, 17 to 350533
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   病歷號            252865 non-null  object
 1   收件日期           252865 non-null  object
 2   收件時間           252865 non-null  object
 3   Temperature    2364 non-null    object
 4   pH             2364 non-null    object
 5   pCO2           2364 non-null    object
 6   pO2            2364 non-null    object
 7   HCO3           2364 non-null    object
 8   TCO2           2364 non-null    object
 9   ABE            2364 non-null    object
 10  SBE            2364 non-null    object
 11  SBC            2364 non-null    object
 12  SAT            2364 non-null    object
 13  Na             10287 non-null   object
 14  K              11041 non-null   object
 15  Mg             704 non-null     object
 16  Cl             656 non-null     object
 17  BUN (B)        6881 non-null    object
 18  Cre

In [49]:
# Another method to check for missing values
examination_B_merged.isnull().sum()

病歷號              252865
收件日期             252865
收件時間             252865
Temperature      503366
pH               503366
pCO2             503366
pO2              503366
HCO3             503366
TCO2             503366
ABE              503366
SBE              503366
SBC              503366
SAT              503366
Na               495443
K                494689
Mg               505026
Cl               505074
BUN (B)          498849
Creatinine(B)    497691
Length: 19, dtype: int64

In [None]:
examination_B_merged_

In [94]:
examination_B_merged_null = examination_B_merged.loc[examination_B_merged['病歷號'].isnull()]
examination_B_merged_null

Unnamed: 0,病歷號,收件日期,收件時間,Temperature,pH,pCO2,pO2,HCO3,TCO2,ABE,SBE,SBC,SAT,Na,K,Mg,Cl,BUN (B),Creatinine(B)
17,,,,37,,,,,,,,,,,,,,,
18,,,,,7.374,,,,,,,,,,,,,,
19,,,,,,75.7,,,,,,,,,,,,,
20,,,,,,,71.5,,,,,,,,,,,,
21,,,,,,,,44.6,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350529,,,,,,,,,,,,,,,,,,,
350530,,,,,,,,,,,,,,,,,,,
350531,,,,,,,,,,,,,,,,,,,0.38
350532,,,,,,,,,,,,,,135,,,,,


## QRTISX Dataset

In [51]:
# Load in the dataset
qrtisx = pd.read_excel('/home/yungshun/workspace/py/sklearn-aftygh-respiratory-therapy/datasets/QRTISX.xlsx')

In [52]:
# Take at look on first 500 rows
qrtisx.head(500)

Unnamed: 0,病歷號,住院號,使用天數,mode,peep,fio2,tidal volume,respiratory rate,minute ventilation,spo2,rsbi,pi-max,pe-max,cuff leak test,compliance,resistance,記錄時間
0,8349235,126012025,35.604167,PCV,8.0,80,414,24.0,10.0,100,,,,,,,201204260930
1,8349235,126012025,35.604167,PCV,8.0,70,347,24.0,7.2,100,,,,,,,201204261320
2,8349235,126012025,35.604167,PCV,8.0,70,392,24.0,8.1,98,,,,,,,201204261630
3,8349235,126012025,35.604167,PCV,8.0,65,394,24.0,8.3,98,,,,,,,201204261825
4,8349235,126012025,35.604167,PCV,8.0,60,307,24.0,7.4,100,,,,,,,201204270032
5,8349235,126012025,35.604167,PCV,8.0,55,414,20.0,8.2,98,,,,,,,201204270935
6,8349235,126012025,35.604167,PCV,8.0,50,414,16.0,6.7,96,,,,,,,201204271040
7,8349235,126012025,35.604167,PCV,8.0,50,433,16.0,7.2,96,,,,,,,201204271450
8,8349235,126012025,35.604167,PCV,8.0,45,407,16.0,7.3,100,,,,,,,201204271642
9,8349235,126012025,35.604167,PCV,8.0,45,356,16.0,7.3,100,,,,,,,201204280055


In [53]:
# Get a sense of how many rows of data there are, if there are any missing values, and what data type each column has
qrtisx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57640 entries, 0 to 57639
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   病歷號                 57640 non-null  int64  
 1   住院號                 57640 non-null  int64  
 2   使用天數                57640 non-null  float64
 3   mode                57235 non-null  object 
 4   peep                51785 non-null  object 
 5   fio2                56112 non-null  object 
 6   tidal volume        48835 non-null  object 
 7   respiratory rate    51790 non-null  object 
 8   minute ventilation  49417 non-null  object 
 9   spo2                57020 non-null  object 
 10  rsbi                2359 non-null   object 
 11  pi-max              2321 non-null   object 
 12  pe-max              10 non-null     float64
 13  cuff leak test      0 non-null      float64
 14  compliance          2411 non-null   float64
 15  resistance          2418 non-null   float64
 16  記錄時間

In [54]:
# Another method to check for missing values
qrtisx.isnull().sum()

病歷號                       0
住院號                       0
使用天數                      0
mode                    405
peep                   5855
fio2                   1528
tidal volume           8805
respiratory rate       5850
minute ventilation     8223
spo2                    620
rsbi                  55281
pi-max                55319
pe-max                57630
cuff leak test        57640
compliance            55229
resistance            55222
記錄時間                      0
Length: 17, dtype: int64

In [87]:
qrtisx_selected = qrtisx[['病歷號', 'mode', 'peep', 'fio2', 'tidal volume', 'respiratory rate', 'minute ventilation', 'spo2', '記錄時間']]
qrtisx_selected.head(500)

Unnamed: 0,病歷號,mode,peep,fio2,tidal volume,respiratory rate,minute ventilation,spo2,記錄時間
0,8349235,PCV,8.0,80,414,24.0,10.0,100,201204260930
1,8349235,PCV,8.0,70,347,24.0,7.2,100,201204261320
2,8349235,PCV,8.0,70,392,24.0,8.1,98,201204261630
3,8349235,PCV,8.0,65,394,24.0,8.3,98,201204261825
4,8349235,PCV,8.0,60,307,24.0,7.4,100,201204270032
5,8349235,PCV,8.0,55,414,20.0,8.2,98,201204270935
6,8349235,PCV,8.0,50,414,16.0,6.7,96,201204271040
7,8349235,PCV,8.0,50,433,16.0,7.2,96,201204271450
8,8349235,PCV,8.0,45,407,16.0,7.3,100,201204271642
9,8349235,PCV,8.0,45,356,16.0,7.3,100,201204280055


In [88]:
qrtisx_selected['病歷號'] = qrtisx_selected['病歷號'].astype(str)
qrtisx_selected['記錄時間'] = qrtisx_selected['記錄時間'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qrtisx_selected['病歷號'] = qrtisx_selected['病歷號'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qrtisx_selected['記錄時間'] = qrtisx_selected['記錄時間'].astype(str)


In [89]:
qrtisx_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57640 entries, 0 to 57639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   病歷號                 57640 non-null  object
 1   mode                57235 non-null  object
 2   peep                51785 non-null  object
 3   fio2                56112 non-null  object
 4   tidal volume        48835 non-null  object
 5   respiratory rate    51790 non-null  object
 6   minute ventilation  49417 non-null  object
 7   spo2                57020 non-null  object
 8   記錄時間                57640 non-null  object
dtypes: object(9)
memory usage: 4.0+ MB


In [90]:
qrtisx_selected['收件日期'] = qrtisx_selected['記錄時間'].apply(lambda x: x[:8])
qrtisx_selected['收件時間'] = qrtisx_selected['記錄時間'].apply(lambda x: x[-4:].lstrip('0'))
qrtisx_selected

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qrtisx_selected['收件日期'] = qrtisx_selected['記錄時間'].apply(lambda x: x[:8])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qrtisx_selected['收件時間'] = qrtisx_selected['記錄時間'].apply(lambda x: x[-4:].lstrip('0'))


Unnamed: 0,病歷號,mode,peep,fio2,tidal volume,respiratory rate,minute ventilation,spo2,記錄時間,收件日期,收件時間
0,8349235,PCV,8,80,414,24,10,100,201204260930,20120426,930
1,8349235,PCV,8,70,347,24,7.2,100,201204261320,20120426,1320
2,8349235,PCV,8,70,392,24,8.1,98,201204261630,20120426,1630
3,8349235,PCV,8,65,394,24,8.3,98,201204261825,20120426,1825
4,8349235,PCV,8,60,307,24,7.4,100,201204270032,20120427,32
...,...,...,...,...,...,...,...,...,...,...,...
57635,92489092,S+PS,8,60,603,17,9.92,100,201405302100,20140530,2100
57636,92489092,PS,8,40,488,20,9.5,100,201405302227,20140530,2227
57637,92489092,PS,8,35,394,21,7.81,100,201405310001,20140531,1
57638,92489092,PS,8,35,394,21,7.81,,201405310908,20140531,908


In [None]:
# examination_B_merged['收件日期'] 
for i in range(0, len(examination_B_merged))