# 3. 판다스 데이터 전처리 

## 3-1  데이터 확인하기 

### 모듈 사용하기 

In [1]:
import pandas as pd

In [2]:
import numpy as np

### 파일 읽어오기 

In [3]:
df = pd.read_csv("../data/diabetic_data_new.csv")

### 문자열을 널값으로 변환하기 

In [4]:
df_1 = df.replace("?", np.nan).copy()

### 데이터 상태 확인하기 

In [5]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      99493 non-null   object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    3197 non-null    object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                61510 non-null   object
 11  medical_specialty         51817 non-null   object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

## 3-2  널값 처리 

### 널값을 확인하기 

In [6]:
df_1.isna().sum()

encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

### 널값이 있는 칼럼만 추출하기 

In [7]:
# Keep only the labels of the columns that contain at least one missing value.
df_col_null = df_1.columns[df_1.isna().any()]

In [8]:
df_1[df_col_null].isna().sum()

race                  2273
weight               98569
payer_code           40256
medical_specialty    49949
diag_1                  21
diag_2                 358
diag_3                1423
dtype: int64

In [9]:
df_1[df_col_null].dtypes

race                 object
weight               object
payer_code           object
medical_specialty    object
diag_1               object
diag_2               object
diag_3               object
dtype: object

## 3-2 문자열 자료 변형하기

- 판다스는 문자열 칼럼(시리즈)에 .str 접근자를 제공하며, 이를 통해 문자열을 처리하는 다양한 메서드를 사용할 수 있다.


### 문자열 자료형 칼럼만 조회하기 

In [10]:
df_str = df.select_dtypes(include='object')

In [11]:
df_str.columns

Index(['race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty',
       'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

### 성별 구분 확인하기

- 남성/여성 이외의 값(Unknown/Invalid)이 들어 있다. 

In [12]:
df['gender'].value_counts()

Female             54708
Male               47055
Unknown/Invalid        3
Name: gender, dtype: int64

In [13]:
ss = df['gender'].replace('Unknown/Invalid', 'other' )

In [14]:
ss.value_counts()

Female    54708
Male      47055
other         3
Name: gender, dtype: int64

### 특정 범주형 값을 확인하기 

In [15]:
df['diag_1'].value_counts() 

428    6862
414    6581
786    4016
410    3614
486    3508
       ... 
373       1
314       1
684       1
217       1
V51       1
Name: diag_1, Length: 717, dtype: int64

### 내부에 문자와 숫자가 섞여있다.  

In [16]:
xx = df['diag_1'].value_counts().index

In [17]:
# Collect every diag_1 code whose first character is not a decimal digit,
# preserving the order in which the codes appear in xx.
aa = [code for code in xx if not code[0].isdecimal()]

In [18]:
aa

['V57',
 'V58',
 'V55',
 'V54',
 'V53',
 '?',
 'V56',
 'V71',
 'V63',
 'V45',
 'V26',
 'V66',
 'V07',
 'E909',
 'V70',
 'V43',
 'V60',
 'V25',
 'V67',
 'V51']

### 문자로 시작하는 값의 개수를 확인 

In [19]:
df['diag_1'].str.startswith("V").sum()

1644

In [20]:
df['diag_1'].str.startswith("E").sum()

1

### 각각의 문자를 변환하고 개수를 확인 

In [21]:
df['diag_1'].str.replace("V","0").str.startswith("V").sum()

0

In [22]:
df['diag_1'].str.replace("E","0").str.startswith("E").sum()

0

## 3-3 날짜 자료 변형하기 

- 날짜를 처리할 수 있도록 .dt 객체를 제공한다 

### 날짜가 필요한 데이터를 읽기

In [23]:
df_day = pd.read_csv("../data/covid/day_wise.csv")

### 데이터프레임 정보 확인하기 

In [24]:
df_day.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date                    188 non-null    object 
 1   Confirmed               188 non-null    int64  
 2   Deaths                  188 non-null    int64  
 3   Recovered               188 non-null    int64  
 4   Active                  188 non-null    int64  
 5   New cases               188 non-null    int64  
 6   New deaths              188 non-null    int64  
 7   New recovered           188 non-null    int64  
 8   Deaths / 100 Cases      188 non-null    float64
 9   Recovered / 100 Cases   188 non-null    float64
 10  Deaths / 100 Recovered  188 non-null    float64
 11  No. of countries        188 non-null    int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 17.8+ KB


### 칼럼을 날짜 타입으로 변환한다

In [25]:
df_day['Date'] = pd.to_datetime(df_day['Date'])

In [26]:
df_day['Date'].dtype

dtype('<M8[ns]')

### 날짜 자료형으로 변환되어 dt 객체를 활용할 수 있다

In [27]:
df_day['Date'].dt

<pandas.core.indexes.accessors.DatetimeProperties object at 0x12046f550>

In [28]:
df_day['Date'].dt.date

0      2020-01-22
1      2020-01-23
2      2020-01-24
3      2020-01-25
4      2020-01-26
          ...    
183    2020-07-23
184    2020-07-24
185    2020-07-25
186    2020-07-26
187    2020-07-27
Name: Date, Length: 188, dtype: object

### 새로운 칼럼 추가 

- dt 내의 날짜 관련 정보를 분리해서 새로운 칼럼을 만들기

In [29]:
# Split the datetime 'Date' column into separate component columns via the .dt accessor.
df_day['year']       = df_day['Date'].dt.year         # year (4-digit number)
df_day['month']      = df_day['Date'].dt.month        # month (number)
df_day['month_name'] = df_day['Date'].dt.month_name() # month name (string)

df_day['day']        = df_day['Date'].dt.day          # day of month (number)
df_day['time']       = df_day['Date'].dt.time         # time of day (datetime.time objects, displayed as HH:MM:SS)
df_day['hour']       = df_day['Date'].dt.hour         # hour (number)
df_day['minute']     = df_day['Date'].dt.minute       # minute (number)
df_day['second']     = df_day['Date'].dt.second       # second (number)


In [30]:
df_day.head()

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries,year,month,month_name,day,time,hour,minute,second
0,2020-01-22,555,17,28,510,0,0,0,3.06,5.05,60.71,6,2020,1,January,22,00:00:00,0,0,0
1,2020-01-23,654,18,30,606,99,1,2,2.75,4.59,60.0,8,2020,1,January,23,00:00:00,0,0,0
2,2020-01-24,941,26,36,879,287,8,6,2.76,3.83,72.22,9,2020,1,January,24,00:00:00,0,0,0
3,2020-01-25,1434,42,39,1353,493,16,3,2.93,2.72,107.69,11,2020,1,January,25,00:00:00,0,0,0
4,2020-01-26,2118,56,52,2010,684,14,13,2.64,2.46,107.69,13,2020,1,January,26,00:00:00,0,0,0


In [31]:
df_day.columns

Index(['Date', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'New cases',
       'New deaths', 'New recovered', 'Deaths / 100 Cases',
       'Recovered / 100 Cases', 'Deaths / 100 Recovered', 'No. of countries',
       'year', 'month', 'month_name', 'day', 'time', 'hour', 'minute',
       'second'],
      dtype='object')

### 새로운 칼럼 추가 

In [32]:
df_day['Deaths ratio'] = df_day['Deaths'] / df_day['Confirmed']

### 그룹화해서 정보를 확인하기 

In [33]:
df_day.groupby(["year","month"])[['Confirmed','Deaths','Deaths ratio']].max()

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Deaths,Deaths ratio
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020,1,9927,213,0.030631
2020,2,85306,2936,0.034417
2020,3,871355,44478,0.051045
2020,4,3268876,234704,0.071814
2020,5,6185530,373606,0.07148
2020,6,10449697,511210,0.059973
2020,7,16480485,654036,0.048392


# 4. 특정 내부의 값을 변경하기

-  transform, apply 메서드를 통해 내부의 값을 변경하기

## apply 함수 처리 기준 
- apply(func, axis=0): DataFrame의 지정한 축을 따라 함수 func를 호출한다  
- 기본적으로 하나의 시리즈 객체 단위로 계산한다 
- 주어진 축을 따라 func를 적용한 결과를 반환한다.

## transform 함수 처리 기준
- transform(func, axis=0): 자기 자신(self)에 func를 호출하여 변환된 값으로 이루어진 DataFrame을 생성한다.
- 결과의 길이가 입력과 같아야 하므로, sum처럼 값을 하나로 집계하는 함수를 전달하면 예외(ValueError)가 발생한다. 
- self와 길이가 같은 DataFrame을 반환한다

## 4-1 지정된 함수를 사용해서 값을 변경하기 

### 실수 칼럼만 가져오기 

In [34]:
df_num = df_day.select_dtypes(include="float")

### 내장함수를 문자열로 전달해서 처리하기

In [35]:
df_num.transform('sqrt').head()

Unnamed: 0,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Deaths ratio
0,1.749286,2.247221,7.791662,0.175016
1,1.658312,2.142429,7.745967,0.1659
2,1.661325,1.957039,8.498235,0.166223
3,1.711724,1.649242,10.377379,0.171139
4,1.624808,1.568439,10.377379,0.162604


In [36]:
df_num.apply('sqrt').head()

Unnamed: 0,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Deaths ratio
0,1.749286,2.247221,7.791662,0.175016
1,1.658312,2.142429,7.745967,0.1659
2,1.661325,1.957039,8.498235,0.166223
3,1.711724,1.649242,10.377379,0.171139
4,1.624808,1.568439,10.377379,0.162604


### 넘파이 모듈의 함수를 사용해서 계산하기 

In [37]:
df_num.transform([np.sqrt, np.exp]).head()

Unnamed: 0_level_0,Deaths / 100 Cases,Deaths / 100 Cases,Recovered / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Deaths / 100 Recovered,Deaths ratio,Deaths ratio
Unnamed: 0_level_1,sqrt,exp,sqrt,exp,sqrt,exp,sqrt,exp
0,1.749286,21.327557,2.247221,156.022464,7.791662,2.322833e+26,0.175016,1.031105
1,1.658312,15.642632,2.142429,98.49443,7.745967,1.142007e+26,0.1659,1.027905
2,1.661325,15.799843,1.957039,46.062538,8.498235,2.3160480000000004e+31,0.166223,1.028015
3,1.711724,18.72763,1.649242,15.180322,10.377379,5.877231e+46,0.171139,1.029722
4,1.624808,14.013204,1.568439,11.704812,10.377379,5.877231e+46,0.162604,1.026793


In [38]:
df_num.apply([np.sqrt, np.exp]).head()

Unnamed: 0_level_0,Deaths / 100 Cases,Deaths / 100 Cases,Recovered / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Deaths / 100 Recovered,Deaths ratio,Deaths ratio
Unnamed: 0_level_1,sqrt,exp,sqrt,exp,sqrt,exp,sqrt,exp
0,1.749286,21.327557,2.247221,156.022464,7.791662,2.322833e+26,0.175016,1.031105
1,1.658312,15.642632,2.142429,98.49443,7.745967,1.142007e+26,0.1659,1.027905
2,1.661325,15.799843,1.957039,46.062538,8.498235,2.3160480000000004e+31,0.166223,1.028015
3,1.711724,18.72763,1.649242,15.180322,10.377379,5.877231e+46,0.171139,1.029722
4,1.624808,14.013204,1.568439,11.704812,10.377379,5.877231e+46,0.162604,1.026793


### 특정 칼럼별로 함수를 지정해서 딕셔너리로 전달하기 

In [39]:
df_num.transform({
    'Deaths / 100 Cases': np.sqrt,
    'Recovered / 100 Cases': np.exp,
}).head()

Unnamed: 0,Deaths / 100 Cases,Recovered / 100 Cases
0,1.749286,156.022464
1,1.658312,98.49443
2,1.661325,46.062538
3,1.711724,15.180322
4,1.624808,11.704812


In [40]:
df_num.apply({
    'Deaths / 100 Cases': np.sqrt,
    'Recovered / 100 Cases': np.exp,
}).head()

Unnamed: 0,Deaths / 100 Cases,Recovered / 100 Cases
0,1.749286,156.022464
1,1.658312,98.49443
2,1.661325,46.062538
3,1.711724,15.180322
4,1.624808,11.704812


## 4-2 특정함수를 지정해서 처리하기

### 람다함수로 처리하기 

In [41]:
df_num.apply(lambda x:x.sum())

Deaths / 100 Cases         913.80000
Recovered / 100 Cases     6456.66000
Deaths / 100 Recovered    4155.65000
Deaths ratio                 9.13825
dtype: float64

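### transform은 입력과 길이가 같은 결과를 요구하므로 집계 함수에서는 예외 발생

- x.sum()은 각 칼럼을 값 하나로 집계하므로 transform이 ValueError("Function did not transform")를 발생시킨다.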

In [42]:
# The lambda reduces each column to a single sum, which is an aggregation rather
# than a transformation; the resulting ValueError is caught and its message printed.
try : 
    df_num.transform(lambda x:x.sum())
except ValueError as e :
    print(e)

Function did not transform


### 두 개의 칼럼(시리즈) 간의 함수 연산 처리하기 

In [43]:
def subtract_two(x):
    """Return x['Deaths / 100 Cases'] minus x['Recovered / 100 Cases'].

    x is anything indexable by those two labels (the notebook passes one
    DataFrame row at a time via apply(..., axis=1)).
    """
    deaths_per_100 = x['Deaths / 100 Cases']
    recovered_per_100 = x['Recovered / 100 Cases']
    return deaths_per_100 - recovered_per_100

In [44]:
df_num.apply(subtract_two, axis=1)

0      -1.99
1      -1.84
2      -1.07
3       0.21
4       0.18
       ...  
183   -52.08
184   -52.56
185   -53.05
186   -53.19
187   -53.48
Length: 188, dtype: float64

### 행마다 값 하나만 반환하는 함수는 변환이 아니므로 transform에서 예외 발생 

In [45]:
# subtract_two returns one scalar per row, so the result does not have the shape
# of the input frame; transform raises ValueError, which is caught and printed.
try :
    df_num.transform(subtract_two, axis=1)
except ValueError as e :
    print(e)

Function did not transform
