In [1]:
import pandas as pd

## 예제 3-15 범주형 데이터 생성

In [2]:
s = pd.Series(["a","b","c","a"], dtype="category")

In [3]:
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

In [4]:
s.shape

(4,)

In [5]:
s.values.categories

Index(['a', 'b', 'c'], dtype='object')

In [6]:
s.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
s.dtype

CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)

In [8]:
try : 
    s[2] = 'd'
except Exception as e :
    print(e)

Cannot setitem on a Categorical with a new category, set the categories first


In [9]:
s[2] = 'b'

In [10]:
try : 
    s[4] = 'c'
except Exception as e :
    print(e)

[4] not contained in the index


In [11]:
s.loc[4] ='c'

In [12]:
s

0    a
1    b
2    b
3    a
4    c
dtype: object

## 예제 3-16 범주형 데이터 클래스 이해하기

In [13]:
cat = pd.Categorical(['a','b','c'])

In [14]:
type(cat)

pandas.core.categorical.Categorical

In [15]:
c = set(dir(cat))

In [16]:
ss = set(dir(pd.Series))

In [17]:
c_d =  c - ss

In [18]:
count = 0 
for i in c_d :
    if not i.startswith("_") :
        count += 1
        print(i, end=", ")
        if count % 5 == 0 :
            print()

codes, check_for_ordered, add_categories, as_unordered, reorder_categories, 
from_codes, is_dtype_equal, ordered, take_nd, categories, 
remove_categories, rename_categories, set_ordered, set_categories, remove_unused_categories, 
as_ordered, 

In [19]:
cat.categories

Index(['a', 'b', 'c'], dtype='object')

In [20]:
cat.dtype

CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)

In [21]:
s1 = pd.Series(["a","b","c","a"], dtype=cat.dtype)

In [22]:
s1

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

## 예제 3-17 데이터프레임에서 범주형 자료형 처리

In [23]:
import numpy as np

In [24]:
cat_1 = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"])

In [25]:
cat_1.dtype

CategoricalDtype(categories=['b', 'a', 'c'], ordered=False)

In [26]:
df = pd.DataFrame({"cat":cat_1, "s":["a", "c", "c", np.nan]})

In [27]:
df

Unnamed: 0,cat,s
0,a,a
1,c,c
2,c,c
3,,


In [28]:
df.select_dtypes(exclude=['object'])

Unnamed: 0,cat
0,a
1,c
2,c
3,


In [29]:
df['cat'][3] = 'b'

In [30]:
df

Unnamed: 0,cat,s
0,a,a
1,c,c
2,c,c
3,b,


In [31]:
df.loc[4,:] = ['a','a']

In [32]:
df

Unnamed: 0,cat,s
0,a,a
1,c,c
2,c,c
3,b,
4,a,a


## 예제 3-18 파일을 읽어서 범주형 자료형으로 변경처리

In [33]:
airplane = pd.read_csv('../data/airplane_info.csv',encoding='cp949')

In [34]:
airplane.head()

Unnamed: 0,기체번호,항공사,기종,정원_F,정원_C,정원_W,정원_Y,총정원
0,VP-BDK,Aeroflot,A320-214,0,20,0,120,140.0
1,VP-BWD,Aeroflot,A320-214,0,20,0,120,140.0
2,VP-BWE,Aeroflot,A320-214,0,20,0,120,140.0
3,VP-BWF,Aeroflot,A320-214,0,20,0,120,140.0
4,VP-BRZ,Aeroflot,A320-214,0,20,0,120,140.0


In [35]:
airplane['항공사'].dtype

dtype('O')

In [36]:
air_cat = pd.Categorical(airplane['항공사'])

In [37]:
air_cat.categories

Index(['AZAL Azerbaijan Airlines', 'Aeroflot', 'Air Arabia', 'Air Astana',
       'Air Baltic', 'Air China', 'Air Kyrgyzstan', 'Air Manas',
       'Airzena Georgian Airways', 'Asiana Airlines', 'AtlasGlobal',
       'Avia Traffic Company', 'Belavia', 'China Southern Airlines',
       'Ellinair', 'Etihad Airways', 'Finnair', 'Globus Airlines',
       'Hainan Airlines', 'Iran Aseman Airlines', 'KLM Royal Dutch Airlines',
       'Kam Air', 'Korean Air', 'LOT - Polish Airlines', 'Lufthansa',
       'MIAT - Monglian Airlines', 'Mahan Air', 'NordStar', 'Pegas Fly',
       'Pegasus Airlines', 'Pobeda', 'Red Wings', 'Rusline', 'S7 Airlines',
       'SCAT', 'Somon Air', 'Sunday Airlines', 'Tajik Air', 'Turkish Airlines',
       'Turkmenistan Airlines', 'Ukraine International Airlines',
       'Ural Airlines', 'Urumqi Air', 'Utair', 'Uzbekistan Airways',
       'VIM Airlines', 'Wizz Air', 'Yakutia Airlines', 'Yamal Airlines',
       'flydubai'],
      dtype='object')

In [38]:
air_cat

[Aeroflot, Aeroflot, Aeroflot, Aeroflot, Aeroflot, ..., Red Wings, Red Wings, Red Wings, Red Wings, Red Wings]
Length: 2178
Categories (50, object): [AZAL Azerbaijan Airlines, Aeroflot, Air Arabia, Air Astana, ..., Wizz Air, Yakutia Airlines, Yamal Airlines, flydubai]

In [39]:
air_cat.shape

(2178,)

In [40]:
airplane['항공사'] = air_cat

In [41]:
airplane['항공사'].values.categories

Index(['AZAL Azerbaijan Airlines', 'Aeroflot', 'Air Arabia', 'Air Astana',
       'Air Baltic', 'Air China', 'Air Kyrgyzstan', 'Air Manas',
       'Airzena Georgian Airways', 'Asiana Airlines', 'AtlasGlobal',
       'Avia Traffic Company', 'Belavia', 'China Southern Airlines',
       'Ellinair', 'Etihad Airways', 'Finnair', 'Globus Airlines',
       'Hainan Airlines', 'Iran Aseman Airlines', 'KLM Royal Dutch Airlines',
       'Kam Air', 'Korean Air', 'LOT - Polish Airlines', 'Lufthansa',
       'MIAT - Monglian Airlines', 'Mahan Air', 'NordStar', 'Pegas Fly',
       'Pegasus Airlines', 'Pobeda', 'Red Wings', 'Rusline', 'S7 Airlines',
       'SCAT', 'Somon Air', 'Sunday Airlines', 'Tajik Air', 'Turkish Airlines',
       'Turkmenistan Airlines', 'Ukraine International Airlines',
       'Ural Airlines', 'Urumqi Air', 'Utair', 'Uzbekistan Airways',
       'VIM Airlines', 'Wizz Air', 'Yakutia Airlines', 'Yamal Airlines',
       'flydubai'],
      dtype='object')

In [42]:
try :
    airplane['항공사'][0] = "Asia Airline"
except Exception as e :
    print(e)

Cannot setitem on a Categorical with a new category, set the categories first


In [43]:
airplane.isnull().sum()

기체번호     0
항공사      0
기종       0
정원_F     0
정원_C     0
정원_W     0
정원_Y     0
총정원     71
dtype: int64

In [44]:
airplane.columns

Index(['기체번호', '항공사', '기종', '정원_F', '정원_C', '정원_W', '정원_Y', '총정원'], dtype='object')

In [45]:
airplane['총정원'].isnull().sum()

71

In [46]:
airplane.loc[airplane['총정원'].isnull(), '총정원']  = airplane.loc[airplane['총정원'].isnull(), '정원_Y']

In [47]:
airplane.isnull().sum()

기체번호    0
항공사     0
기종      0
정원_F    0
정원_C    0
정원_W    0
정원_Y    0
총정원     0
dtype: int64

In [48]:
airplane['총정원'].between(1,120).sum()

235

In [49]:
mask = airplane['총정원'].between(1,120)

In [50]:
airplane['category_R']  = pd.Categorical(np.where(mask, '소형', '중형'))

In [51]:
airplane.head()

Unnamed: 0,기체번호,항공사,기종,정원_F,정원_C,정원_W,정원_Y,총정원,category_R
0,VP-BDK,Aeroflot,A320-214,0,20,0,120,140.0,중형
1,VP-BWD,Aeroflot,A320-214,0,20,0,120,140.0,중형
2,VP-BWE,Aeroflot,A320-214,0,20,0,120,140.0,중형
3,VP-BWF,Aeroflot,A320-214,0,20,0,120,140.0,중형
4,VP-BRZ,Aeroflot,A320-214,0,20,0,120,140.0,중형


In [54]:
airplane['category_R'].value_counts()

중형    1943
소형     235
Name: category_R, dtype: int64

In [53]:
airplane['category_R'].dtype

CategoricalDtype(categories=['소형', '중형'], ordered=False)