# 데이터 준비/확인
* 평균 점수로 매긴 등급을 카테고리형 자료로 다루기 위해, 평균 점수 컬럼과 등급 컬럼 추가하기

In [1]:
import pandas as pd
# Load the score table from CSV and preview its first three rows.
df = pd.read_csv('data/scores.csv')
df.head(3)

Unnamed: 0,name,kor,eng,math
0,Aiden,100.0,90.0,95.0
1,Charles,90.0,80.0,75.0
2,Danial,95.0,100.0,100.0


## 결측치 확인/처리하기
* 결측치가 있는 행은 삭제 (dropna)

In [2]:
# Check for missing values: info() reports the non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    30 non-null     object 
 1   kor     27 non-null     float64
 2   eng     28 non-null     float64
 3   math    29 non-null     float64
dtypes: float64(3), object(1)
memory usage: 1.1+ KB


In [3]:
# Drop every row that contains a missing value (modifies df in place).
df.dropna(inplace=True)

In [4]:
# Verify the result: count the missing values in each column (isnull).
df.isnull().sum()

name    0
kor     0
eng     0
math    0
dtype: int64

##  평균 점수 컬럼 추가하기
* 등급을 매기기 위한 평균점수 컬럼 추가

In [5]:
# Average of the three subject scores, rounded to one decimal place.
subject_total = df['kor'] + df['eng'] + df['math']
df['average'] = (subject_total / 3).round(1)
df.head(1)

Unnamed: 0,name,kor,eng,math,average
0,Aiden,100.0,90.0,95.0,95.0


## 평균점수에 따른 등급 컬럼 추가
* 컬럼.apply(함수)

In [6]:
def get_grade(x):
    """Map an average score to a grade number between 1 and 5.

    The cutoffs are inclusive lower bounds: 90 and above gives 1, 80 gives 2,
    70 gives 3 and 60 gives 4. A score that reaches none of them gives 5.
    """
    # Walk the cutoffs from highest to lowest; the first one reached decides.
    cutoffs = ((90, 1), (80, 2), (70, 3), (60, 4))
    for lower_bound, grade in cutoffs:
        if x >= lower_bound:
            return grade
    return 5
    
# Derive each row's grade by applying get_grade to its average score.
df['grade'] = df['average'].apply(get_grade)
df.head(1)

Unnamed: 0,name,kor,eng,math,average,grade
0,Aiden,100.0,90.0,95.0,95.0,1


# 카테고리형으로 변환하기
* 등급컬럼(grade)을 카테고리형 자료형으로 변환하기

In [7]:
# Inspect the dtype of every column.
df.dtypes

name        object
kor        float64
eng        float64
math       float64
average    float64
grade        int64
dtype: object

In [8]:
# Convert the grade column to the category dtype.
df['grade'] = df['grade'].astype('category')

In [9]:
# Show the dtype of every column to confirm the conversion.
df.dtypes

name         object
kor         float64
eng         float64
math        float64
average     float64
grade      category
dtype: object

In [10]:
# Show the grade column's categorical dtype, including its list of categories.
df['grade'].dtype

CategoricalDtype(categories=[1, 2, 3, 4], ordered=False)

# 카테고리 이름 바꾸기
* 컬럼.**cat.categories** = 카테고리리스트 (pandas 2.0부터는 직접 대입이 제거됨 → 컬럼 = 컬럼.**cat.rename_categories**(카테고리리스트) 사용)

In [11]:
# Rename the numeric grade categories to letter grades.
# Direct assignment to `.cat.categories` was removed in pandas 2.0, so use
# rename_categories (which returns a new Series) and assign the result back.
# A dict ties each number to its letter explicitly; mapping keys that are not
# among the current categories are ignored by rename_categories.
df['grade'] = df['grade'].cat.rename_categories(
    {1: 'A', 2: 'B', 3: 'C', 4: 'D', 5: 'F'}
)
df

Unnamed: 0,name,kor,eng,math,average,grade
0,Aiden,100.0,90.0,95.0,95.0,A
1,Charles,90.0,80.0,75.0,81.7,B
2,Danial,95.0,100.0,100.0,98.3,A
3,Evan,100.0,100.0,100.0,100.0,A
5,Ian,90.0,100.0,90.0,93.3,A
6,James,70.0,75.0,65.0,70.0,C
7,Julian,80.0,90.0,55.0,75.0,C
8,Justin,50.0,60.0,100.0,70.0,C
9,Kevin,100.0,100.0,90.0,96.7,A
10,Leo,90.0,95.0,70.0,85.0,B


# 누락된 카테고리 추가
* 컬럼.<b>cat.set_categories</b>(카테고리리스트)

In [12]:
# Declare the full set of letter grades so that 'F' is a valid category
# even when no row currently holds it.
grade_labels = ['A', 'B', 'C', 'D', 'F']
df['grade'] = df['grade'].cat.set_categories(grade_labels)
df['grade'].dtype

CategoricalDtype(categories=['A', 'B', 'C', 'D', 'F'], ordered=False)

# 데이터 용량 확인하기
* titanic 데이터에서 카테고리형으로 관리할 수 있는 컬럼을 카테고리형으로 변환하고, 변환 전후의 데이터 용량 비교하기

## 데이터 준비하고 확인하기

In [13]:
# Load the Titanic passenger data from CSV and preview its first row.
df_titanic = pd.read_csv('data/titanic.csv')
df_titanic.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [14]:
# List the distinct values of Survived to judge whether it suits the category dtype.
df_titanic['Survived'].unique()

array([0, 1], dtype=int64)

In [15]:
# List the distinct values of Pclass.
df_titanic['Pclass'].unique()

array([3, 1, 2], dtype=int64)

In [16]:
# List the distinct values of Sex.
df_titanic['Sex'].unique()

array(['male', 'female'], dtype=object)

In [17]:
# List the distinct values of Embarked (missing entries show up as nan).
df_titanic['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [18]:
# Inspect the dtype of every column.
df_titanic.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [19]:
# Check memory usage (the last line of the info() report).
# A trailing '+' on that figure marks it as a lower bound: the contents of
# object columns are not measured unless memory_usage='deep' is passed.
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     1309 non-null   int64  
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 122.8+ KB


## 카테고리형으로 바꾸기
* 컬럼.astype('category')

In [20]:
# Convert the columns that hold only a few distinct values
# (Survived, Pclass, Sex, Embarked) to the category dtype.
for column in ('Survived', 'Pclass', 'Sex', 'Embarked'):
    df_titanic[column] = df_titanic[column].astype('category')

In [21]:
# Inspect the dtype of every column.
df_titanic.dtypes

PassengerId       int64
Survived       category
Pclass         category
Name             object
Sex            category
Age             float64
SibSp             int64
Parch             int64
Ticket           object
Fare            float64
Cabin            object
Embarked       category
dtype: object

In [22]:
# Check memory usage (the last line of the info() report).
# A trailing '+' on that figure marks it as a lower bound: the contents of
# object columns are not measured unless memory_usage='deep' is passed.
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  1309 non-null   int64   
 1   Survived     1309 non-null   category
 2   Pclass       1309 non-null   category
 3   Name         1309 non-null   object  
 4   Sex          1309 non-null   category
 5   Age          1046 non-null   float64 
 6   SibSp        1309 non-null   int64   
 7   Parch        1309 non-null   int64   
 8   Ticket       1309 non-null   object  
 9   Fare         1308 non-null   float64 
 10  Cabin        295 non-null    object  
 11  Embarked     1307 non-null   category
dtypes: category(4), float64(2), int64(3), object(3)
memory usage: 87.6+ KB
