# 패키지 설치

In [1]:
!pip install mySUNI

zsh:1: command not found: pip


## 모듈 import

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from mySUNI import cds
from IPython.display import Image

## 데이터셋 로드

In [3]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


**컬럼(columns) 설명**

- survivied: 생존여부 (1: 생존, 0: 사망)
- pclass: 좌석 등급 (1등급, 2등급, 3등급)
- sex: 성별
- age: 나이
- sibsp: 형제 + 배우자 수
- parch: 부모 + 자녀 수
- fare: 좌석 요금
- embarked: 탑승 항구 (S, C, Q)
- class: pclass와 동일
- who: 성별과 동일
- adult_male: 성인 남자 여부
- deck: 데크 번호 (알파벳 + 숫자 혼용)
- embark_town: 탑승 항구 이름
- alive: 생존여부 (yes, no)
- alone: 혼자 탑승 여부

## copy

DataFrame을 **복제**합니다. 복제한 DataFrame을 수정해도 **원본에는 영향을 미치지 않습니다.**

In [4]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


`copy()`로 DataFrame을 복제합니다.

In [53]:
df_copy = df.copy()

id 값을 확인하면 두 DataFrame의 **메모리 주소가 다름**을 확인할 수 있습니다.

In [6]:
id(df), id(df_copy)

(6107146192, 6107143360)

In [7]:
df_copy.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


`df_copy`의 `age`를 99999로 임의 수정하도록 하겠습니다.

In [54]:
df_copy.loc[0, 'age'] = 99999

수정사항이 반영된 것을 확인할 수 있습니다.

In [9]:
df_copy.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,99999.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


하지만, 원본 DataFrame의 **데이터는 변경되지 않고 그대로 남아** 있습니다.

In [10]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# 결측치

결측치는 **비어있는 데이터**를 의미합니다.

결측치에 대한 처리는 매우 중요합니다. 

결측치에 대한 처리를 해주려면 **다음의 내용**을 반드시 알아야 합니다.

1. 결측 데이터 확인
2. 결측치가 **아닌** 데이터 확인
3. 결측 데이터 **채우기**
4. 결측 데이터 **제거하기**

## 결측치 확인 - isnull(), isna()

컬럼(column)별 결측치의 갯수를 확인하기 위해서는 `sum()` 함수를 붙혀주면 됩니다.

`sum()`은 Pandas의 통계 관련 함수이며, 통계 관련 함수는 추후에 더 자세히 알아볼 예정입니다.

**isnull()**

In [11]:
df.isnull().sum() #axis=0

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

**isna()**

isnull() 과 동작이 완전 같습니다. 편한 것으로 써주세요. (심지어 도큐먼트도 같습니다)

In [12]:
df.isna() # Boolean DataFrame -> 조건 인덱싱으로 잘 사용하지 않음 
df.isna().sum()[df.isna().sum() != 0] # NA가 존재하는 컬럼과 개수 출력
df.isna().mean() # 결측치 비율 

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

DataFrame 전체 결측 데이터의 갯수를 합산하기 위해서는 `sum()`을 두 번 사용하면 됩니다.

In [13]:
df.isnull().sum().sum() 

869

## 결측치가 아닌 데이터 확인 - notnull()

`notnull()`은 `isnull()`과 정확히 **반대** 개념입니다.

In [14]:
df.notnull().sum()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

## 결측 데이터 필터링

`isnull()` 함수가 결측 데이터를 찾는 **boolean index** 입니다.

즉, `loc`에 적용하여 조건 필터링을 걸 수 있습니다.

In [15]:
df.loc[df['age'].isnull()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,,0,0,13.0000,S,Second,man,True,,Southampton,yes,True
19,1,3,female,,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True


## 결측치 채우기 - fillna()

`fillna()`를 활용하면 결측치에 대하여 **일괄적으로 값을 채울 수** 있습니다.

In [16]:
# 원본을 copy하여 df1 변수에 
df1 = df.copy()

In [17]:
df1.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


888번 index의 **결측치가 700으로 채워**진 것을 확인할 수 있습니다.

In [18]:
df1['age'].fillna(700).tail()

886     27.0
887     19.0
888    700.0
889     26.0
890     32.0
Name: age, dtype: float64

In [46]:
df1['age'] = df1['age'].fillna(700)

In [48]:
df1.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,700.0,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [21]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


**카테고리 형 데이터**을 채워주기 위해서는 다음과 같은 과정을 거쳐야 합니다.

이미 카테고리가 추가된 'A'나 'B'는 바로 fillna() 할 수 있습니다.

In [49]:
df1['deck'].fillna('A')

0      A
1      C
2      A
3      C
4      A
      ..
886    A
887    B
888    A
889    C
890    A
Name: deck, Length: 891, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

하지만, 없는 카테고리로 채워주고자 할 때는 먼저 `add_categories`로 카테고리를 추가한 후 채워야 합니다.

In [23]:
# add_categories (카테고리 추가)
# cat은 category의 지정자
df1['deck'].cat.add_categories('No Data').fillna('No Data')

0      No Data
1            C
2      No Data
3            C
4      No Data
        ...   
886    No Data
887          B
888    No Data
889          C
890    No Data
Name: deck, Length: 891, dtype: category
Categories (8, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'No Data']

## 통계값으로 채우기

In [24]:
df1 = df.copy()

In [25]:
df1.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


### 평균으로 채우기

In [26]:
df1['age'].fillna(df1['age'].mean()).tail()

886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: age, dtype: float64

### 중앙값으로 채우기

In [27]:
df1['age'].fillna(df1['age'].median()).tail()

886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: age, dtype: float64

### 최빈값으로 채우기

In [28]:
df1['deck'].mode()

0    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

**최빈값(mode)**으로 채울 때에는 반드시 **0번째 index 지정**하여 값을 추출한 후 채워야 합니다.

In [29]:
df1['deck'].mode()[0]

'C'

In [30]:
df1['deck'].fillna(df1['deck'].mode()[0]).tail()

886    C
887    B
888    C
889    C
890    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [50]:
df1.fillna(df1.mean()) # 각 컬럼별 NA를 각 컬럼별 평균으로 대체
df1.fillna(df1.median()) # 각 컬럼별 NA를 각 컬럼별 중앙값으로 대체 
# 열 인덱스를 기준으로 채움
df.fillna(df.median())

  df1.fillna(df1.mean()) # 각 컬럼별 NA를 각 컬럼별 평균으로 대체
  df1.fillna(df1.median()) # 각 컬럼별 NA를 각 컬럼별 중앙값으로 대체
  df.fillna(df.median())


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [52]:
df.fillna()

AttributeError: 'DataFrame' object has no attribute 'dillna'

## NaN 값이 있는 데이터 제거하기 (dropna)

In [55]:
df1 = df.copy()

In [56]:
df1.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [57]:
df1.dropna(how='any', axis=0) # na가 1개라도 있는 행 제거 -> 삭제가 많이 됨
df1.dropna(how='all', axis=0) # 모두 na인 행 제거 -> 삭제 1개도 안됨

df1.dropna(how='any', axis=1) # na가 1개라도 있는 열 제거 -> 삭제가 많이 됨
df1.dropna(how='all', axis=1) # 모두 na인 열 제거  -> 삭제 1개도 안됨

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


`dropna()`로 **1개 라도 NaN 값이 있는 행**은 제거할 수 있스빈다. (`how='any'`)

In [35]:
df1.dropna()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


기본 옵션 값은 `how=any`로 설정되어 있으며, 다음과 같이 변경할 수 있습니다.

- **any**: 1개 라도 NaN값이 존재시 drop
- **all**: 모두 NaN값이 존재시 drop

In [36]:
df1.dropna(how='all')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# 연습문제

data폴더에 있는 mySUNI_1.csv를 읽어 df에 저장하세요.

In [63]:
import os
print(os.getcwd())

/Users/yong19/Desktop/python19/first-project-1/study/mysuni/pandas


In [78]:
# 코드를 입력하세요.
df = pd.read_csv('/Users/yong19/Desktop/python19/first-project-1/study/mysuni/pandas/data/mySUNI_1.csv')
df

Unnamed: 0,지역,GS칼텍스_셀프,GS칼텍스_일반,S-OIL_셀프,S-OIL_일반,SK에너지_셀프,SK에너지_일반,알뜰(ex)_셀프,알뜰주유소_셀프,알뜰주유소_일반,자가상표_셀프,자가상표_일반,현대오일뱅크_셀프,현대오일뱅크_일반
0,서울 강남구,1479.628286,1647.3725,1459.241304,1641.747917,1452.455417,1790.737444,,,,,,1408.8225,1604.995806
1,서울 강동구,1451.55875,1472.213056,1456.763333,1528.7025,1457.974359,1666.781667,,,,,,1412.656667,1441.861667
2,서울 강북구,1342.991667,1373.801667,1325.855,1370.8775,1375.390244,1347.2625,,,,,,1353.985814,
3,서울 강서구,1403.7525,1429.829167,1420.370833,,1377.233636,1485.694,,1326.212083,1313.838333,,,1339.844737,1406.236939
4,서울 관악구,1425.005238,1472.719583,,1435.575417,1439.986667,1466.670278,,1330.8375,1390.558333,,,1420.11625,1453.77875
5,서울 광진구,,1409.874667,1388.13375,1375.815833,1468.312,1414.259167,,,1354.878333,,,1372.834231,1376.601667
6,서울 구로구,1349.338333,1580.261111,1348.405,1441.733833,,1417.576415,,,1385.675833,,,1365.516522,1422.016774
7,서울 금천구,1382.317059,,1383.742083,1357.099167,,1427.995122,,1304.87,1372.938333,,,1382.120417,1386.201429
8,서울 노원구,1381.040833,1459.922778,1412.337778,,1407.652917,1509.330833,,,,,,,1467.67
9,서울 도봉구,1358.41875,1404.916667,1375.605333,1523.638333,1349.851463,1478.19,,,,,,1388.036452,1344.847742


각 열별 결측치의 개수를 조회하세요.

In [80]:
# 코드를 입력하세요.
df.isna().sum(axis=0)

지역            0
GS칼텍스_셀프      5
GS칼텍스_일반      1
S-OIL_셀프      5
S-OIL_일반      5
SK에너지_셀프      4
SK에너지_일반      1
알뜰(ex)_셀프    24
알뜰주유소_셀프     20
알뜰주유소_일반     18
자가상표_셀프      24
자가상표_일반      23
현대오일뱅크_셀프     4
현대오일뱅크_일반     1
dtype: int64

각 행별 결측치의 개수를 조회하세요.

In [81]:
# 코드를 입력하세요.
df.isna().sum(axis=1)

0      5
1      5
2      6
3      4
4      4
5      5
6      5
7      5
8      7
9      5
10     5
11     6
12     6
13     5
14     4
15     4
16     4
17     5
18     5
19     4
20    10
21     4
22     8
23     8
24     6
dtype: int64

전체 NA 개수를 계산하세요.

In [82]:
# 코드를 입력하세요.
df.isna().sum().sum()

135

각 열별 결측치의 평균(비율)을 계산하세요.

In [84]:
# 코드를 입력하세요.
df.isna().mean()

지역           0.00
GS칼텍스_셀프     0.20
GS칼텍스_일반     0.04
S-OIL_셀프     0.20
S-OIL_일반     0.20
SK에너지_셀프     0.16
SK에너지_일반     0.04
알뜰(ex)_셀프    0.96
알뜰주유소_셀프     0.80
알뜰주유소_일반     0.72
자가상표_셀프      0.96
자가상표_일반      0.92
현대오일뱅크_셀프    0.16
현대오일뱅크_일반    0.04
dtype: float64

데이터의 개수가 15개 미만인 컬럼을 제거 후 다시 df 변수에 저장하세요.
- 의미 : 결측치 비율이 0.6 이상인 컬럼 제거

In [86]:
# 코드를 입력하세요.
df.dropna(axis=1, thresh=15)

Unnamed: 0,지역,GS칼텍스_셀프,GS칼텍스_일반,S-OIL_셀프,S-OIL_일반,SK에너지_셀프,SK에너지_일반,현대오일뱅크_셀프,현대오일뱅크_일반
0,서울 강남구,1479.628286,1647.3725,1459.241304,1641.747917,1452.455417,1790.737444,1408.8225,1604.995806
1,서울 강동구,1451.55875,1472.213056,1456.763333,1528.7025,1457.974359,1666.781667,1412.656667,1441.861667
2,서울 강북구,1342.991667,1373.801667,1325.855,1370.8775,1375.390244,1347.2625,1353.985814,
3,서울 강서구,1403.7525,1429.829167,1420.370833,,1377.233636,1485.694,1339.844737,1406.236939
4,서울 관악구,1425.005238,1472.719583,,1435.575417,1439.986667,1466.670278,1420.11625,1453.77875
5,서울 광진구,,1409.874667,1388.13375,1375.815833,1468.312,1414.259167,1372.834231,1376.601667
6,서울 구로구,1349.338333,1580.261111,1348.405,1441.733833,,1417.576415,1365.516522,1422.016774
7,서울 금천구,1382.317059,,1383.742083,1357.099167,,1427.995122,1382.120417,1386.201429
8,서울 노원구,1381.040833,1459.922778,1412.337778,,1407.652917,1509.330833,,1467.67
9,서울 도봉구,1358.41875,1404.916667,1375.605333,1523.638333,1349.851463,1478.19,1388.036452,1344.847742


결측치를 0으로 대체하세요.

In [88]:
# 코드를 입력하세요.
df.fillna(0)

Unnamed: 0,지역,GS칼텍스_셀프,GS칼텍스_일반,S-OIL_셀프,S-OIL_일반,SK에너지_셀프,SK에너지_일반,알뜰(ex)_셀프,알뜰주유소_셀프,알뜰주유소_일반,자가상표_셀프,자가상표_일반,현대오일뱅크_셀프,현대오일뱅크_일반
0,서울 강남구,1479.628286,1647.3725,1459.241304,1641.747917,1452.455417,1790.737444,0.0,0.0,0.0,0.0,0.0,1408.8225,1604.995806
1,서울 강동구,1451.55875,1472.213056,1456.763333,1528.7025,1457.974359,1666.781667,0.0,0.0,0.0,0.0,0.0,1412.656667,1441.861667
2,서울 강북구,1342.991667,1373.801667,1325.855,1370.8775,1375.390244,1347.2625,0.0,0.0,0.0,0.0,0.0,1353.985814,0.0
3,서울 강서구,1403.7525,1429.829167,1420.370833,0.0,1377.233636,1485.694,0.0,1326.212083,1313.838333,0.0,0.0,1339.844737,1406.236939
4,서울 관악구,1425.005238,1472.719583,0.0,1435.575417,1439.986667,1466.670278,0.0,1330.8375,1390.558333,0.0,0.0,1420.11625,1453.77875
5,서울 광진구,0.0,1409.874667,1388.13375,1375.815833,1468.312,1414.259167,0.0,0.0,1354.878333,0.0,0.0,1372.834231,1376.601667
6,서울 구로구,1349.338333,1580.261111,1348.405,1441.733833,0.0,1417.576415,0.0,0.0,1385.675833,0.0,0.0,1365.516522,1422.016774
7,서울 금천구,1382.317059,0.0,1383.742083,1357.099167,0.0,1427.995122,0.0,1304.87,1372.938333,0.0,0.0,1382.120417,1386.201429
8,서울 노원구,1381.040833,1459.922778,1412.337778,0.0,1407.652917,1509.330833,0.0,0.0,0.0,0.0,0.0,0.0,1467.67
9,서울 도봉구,1358.41875,1404.916667,1375.605333,1523.638333,1349.851463,1478.19,0.0,0.0,0.0,0.0,0.0,1388.036452,1344.847742
