## 7.9 カテゴリー(分類)データの扱い方

In [1]:
import pandas as pd
import numpy as np

# Render DataFrames as plain text instead of HTML tables in notebook output.
pd.set_option('display.notebook_repr_html', False)

# Load the activity log; the recorded output shows columns timestamp, X, Y, Z.
df = pd.read_csv('OTg6QzA_activities.csv')
df

                     timestamp   X    Y    Z
0      2022-01-01 00:01:40.363  33   76   56
1      2022-01-01 00:01:42.961  34   87   56
2      2022-01-01 00:01:45.562  25   89   47
3      2022-01-01 00:01:48.163  11   70   50
4      2022-01-01 00:02:08.864  33   72   58
...                        ...  ..  ...  ...
66276  2022-01-31 23:28:00.481 -91  143  136
66277  2022-01-31 23:28:41.683 -89  145  138
66278  2022-01-31 23:28:49.383 -93  138  137
66279  2022-01-31 23:32:36.426 -93  131  137
66280  2022-01-31 23:32:44.126 -91  129  138

[66281 rows x 4 columns]

In [2]:
# Decide the quadrant relative to an origin, (x, y) = (0, 200) by default.
def make_quadrant(df, x_origin=0, y_origin=200):
  """Return the quadrant label for a single row.

  Args:
    df: One row, despite the name (it is called through
      ``DataFrame.apply(..., axis=1)``). Anything indexable by ``'X'`` and
      ``'Y'`` works, e.g. a Series or a dict.
    x_origin: X coordinate of the dividing vertical line.
    y_origin: Y coordinate of the dividing horizontal line.

  Returns:
    ``"1st"``, ``"2nd"``, ``"3rd"`` or ``"4th"``. Points lying on a dividing
    line count as being on the greater-or-equal side. ``"others"`` is
    returned when no comparison holds, e.g. when a coordinate is NaN.
  """
  x, y = df['X'], df['Y']
  if x >= x_origin and y >= y_origin:
    return "1st"
  elif x < x_origin and y >= y_origin:
    return "2nd"
  elif x < x_origin and y < y_origin:
    return "3rd"
  elif x >= x_origin and y < y_origin:
    return "4th"
  else:
    # Every comparison with NaN is False, so NaN coordinates end up here.
    return "others"

# axis=1 passes each row to make_quadrant, so this runs one Python call per row.
df['quadrant'] = df.apply(make_quadrant, axis=1)
df

                     timestamp   X    Y    Z quadrant
0      2022-01-01 00:01:40.363  33   76   56      4th
1      2022-01-01 00:01:42.961  34   87   56      4th
2      2022-01-01 00:01:45.562  25   89   47      4th
3      2022-01-01 00:01:48.163  11   70   50      4th
4      2022-01-01 00:02:08.864  33   72   58      4th
...                        ...  ..  ...  ...      ...
66276  2022-01-31 23:28:00.481 -91  143  136      3rd
66277  2022-01-31 23:28:41.683 -89  145  138      3rd
66278  2022-01-31 23:28:49.383 -93  138  137      3rd
66279  2022-01-31 23:32:36.426 -93  131  137      3rd
66280  2022-01-31 23:32:44.126 -91  129  138      3rd

[66281 rows x 5 columns]

In [3]:
# Keep the string labels and add a categorical copy so the two can be compared.
df['quadrant_cat'] = df['quadrant'].astype('category')

In [4]:
# Confirm the new column has dtype "category" while the original stays "object".
df.dtypes

timestamp         object
X                  int64
Y                  int64
Z                  int64
quadrant          object
quadrant_cat    category
dtype: object

In [5]:
# For an object column nbytes counts one reference per row (66281 * 8 bytes in
# the recorded output); the Python string objects themselves are not included.
df['quadrant'].nbytes

530248

In [6]:
# Categorical storage: one int8 code per row plus the small categories index
# (66281 * 1 + 4 * 8 bytes in the recorded output).
df['quadrant_cat'].nbytes

66313

In [7]:
# Size of the categorical column relative to the object column (about 1/8 here).
df['quadrant_cat'].nbytes/df['quadrant'].nbytes

0.12506034911965722

In [8]:
# Integer code stored for each row; a code is a position in .cat.categories.
df['quadrant_cat'].cat.codes

0        3
1        3
2        3
3        3
4        3
        ..
66276    2
66277    2
66278    2
66279    2
66280    2
Length: 66281, dtype: int8

In [9]:
# The distinct labels the codes refer to, as an Index.
df['quadrant_cat'].cat.categories

Index(['1st', '2nd', '3rd', '4th'], dtype='object')

In [10]:
# Distinct codes in order of first appearance in the data.
df['quadrant_cat'].cat.codes.unique()

array([3, 2, 0, 1], dtype=int8)

In [11]:
from enum import Enum
class Quadrant(Enum):
  """Quadrant labels as Enum members with integer values 1-5.

  Only ``<`` is defined (comparing by ``value``), which is enough for
  ``sorted()`` and similar code that orders members with ``<``.
  """
  Q1st = 1
  # NOTE(review): "Q2dn" looks like a typo for "Q2nd"; the name is kept
  # because other cells refer to Quadrant.Q2dn.
  Q2dn = 2
  Q3rd = 3
  Q4th = 4
  Others = 5

  def __lt__(self, other):
    """Compare by ``value``; defer to Python for operands of another class."""
    if self.__class__ is not other.__class__:
      return NotImplemented
    return self.value < other.value

# Show every member as "NAME : value", in definition order.
for q in Quadrant:
  print(f"{q.name} : {q.value}")

Q1st : 1
Q2dn : 2
Q3rd : 3
Q4th : 4
Others : 5


In [12]:
# Decide the quadrant relative to an origin, (x, y) = (0, 200) by default.
def make_quadrant_with_enum(df, x_origin=0, y_origin=200):
  """Return the ``Quadrant`` member for a single row.

  Args:
    df: One row, despite the name (it is called through
      ``DataFrame.apply(..., axis=1)``). Anything indexable by ``'X'`` and
      ``'Y'`` works, e.g. a Series or a dict.
    x_origin: X coordinate of the dividing vertical line.
    y_origin: Y coordinate of the dividing horizontal line.

  Returns:
    ``Quadrant.Q1st``, ``Q2dn``, ``Q3rd`` or ``Q4th``. Points lying on a
    dividing line count as being on the greater-or-equal side.
    ``Quadrant.Others`` is returned when no comparison holds, e.g. when a
    coordinate is NaN.
  """
  x, y = df['X'], df['Y']
  if x >= x_origin and y >= y_origin:
    return Quadrant.Q1st
  elif x < x_origin and y >= y_origin:
    return Quadrant.Q2dn
  elif x < x_origin and y < y_origin:
    return Quadrant.Q3rd
  elif x >= x_origin and y < y_origin:
    return Quadrant.Q4th
  else:
    # Every comparison with NaN is False, so NaN coordinates end up here.
    return Quadrant.Others

# Row-wise apply again; each cell of the new column holds a Quadrant member.
df['quadrant_enum'] = df.apply(make_quadrant_with_enum, axis=1)
df

                     timestamp   X    Y    Z quadrant quadrant_cat  \
0      2022-01-01 00:01:40.363  33   76   56      4th          4th   
1      2022-01-01 00:01:42.961  34   87   56      4th          4th   
2      2022-01-01 00:01:45.562  25   89   47      4th          4th   
3      2022-01-01 00:01:48.163  11   70   50      4th          4th   
4      2022-01-01 00:02:08.864  33   72   58      4th          4th   
...                        ...  ..  ...  ...      ...          ...   
66276  2022-01-31 23:28:00.481 -91  143  136      3rd          3rd   
66277  2022-01-31 23:28:41.683 -89  145  138      3rd          3rd   
66278  2022-01-31 23:28:49.383 -93  138  137      3rd          3rd   
66279  2022-01-31 23:32:36.426 -93  131  137      3rd          3rd   
66280  2022-01-31 23:32:44.126 -91  129  138      3rd          3rd   

       quadrant_enum  
0      Quadrant.Q4th  
1      Quadrant.Q4th  
2      Quadrant.Q4th  
3      Quadrant.Q4th  
4      Quadrant.Q4th  
...              ... 

In [13]:
# Enum members are ordinary Python objects, so the new column has dtype "object".
df.dtypes

timestamp          object
X                   int64
Y                   int64
Z                   int64
quadrant           object
quadrant_cat     category
quadrant_enum      object
dtype: object

In [14]:
# Same figure as the plain string column in the recorded output: one 8-byte
# reference per row (66281 * 8), so the Enum column saves no memory.
df['quadrant_enum'].nbytes

530248

In [15]:
# Each element is a Quadrant member, so .name gives the member's identifier.
df['quadrant_enum'].iloc[0].name

'Q4th'

In [16]:
# .value gives the integer assigned to the member in the Enum definition.
df['quadrant_enum'].iloc[0].value

4

In [17]:
# Re-check the dtypes of the three label columns before timing groupby on each.
df.dtypes

timestamp          object
X                   int64
Y                   int64
Z                   int64
quadrant           object
quadrant_cat     category
quadrant_enum      object
dtype: object

In [18]:
%%timeit
df.groupby('quadrant').agg(['count', 'mean', 'median'])





39.4 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)




In [19]:
%%timeit
df.groupby('quadrant_cat').agg(['count', 'mean', 'median'])




42.7 ms ± 664 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)




In [20]:
%%timeit
df.groupby('quadrant_enum').agg(['count', 'mean', 'median'])



55.1 ms ± 614 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


