In [1]:
import pandas as pd
import numpy as np

## 그룹화하기

In [7]:
help(pd.cut)

Help on function cut in module pandas.core.reshape.tile:

cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False)
    Return indices of half-open bins to which each value of `x` belongs.
    
    Parameters
    ----------
    x : array-like
        Input array to be binned. It has to be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        If `bins` is an int, it defines the number of equal-width bins in the
        range of `x`. However, in this case, the range of `x` is extended
        by .1% on each side to include the min or max values of `x`. If
        `bins` is a sequence it defines the bin edges allowing for
        non-uniform bin width. No extension of the range of `x` is done in
        this case.
    right : bool, optional
        Indicates whether the bins include the rightmost edge or not. If
        right == True (the default), then the bins [1,2,3,4] indicate
        (1,2], (2,3], (3,4].
    labels : array or boole

### 특정 값과 범주를 지정한다

In [2]:
ages = [ 20,22,25,21,27,21,23,37,31,61,45,41,32]

In [3]:
bins = [18,25,35,60,100]

In [17]:
cat = pd.cut(ages,bins)

In [5]:
cat

[(18, 25], (18, 25], (18, 25], (18, 25], (25, 35], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 13
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

### 좌측편에 포함관계를 나타내도록 표시하기 위해서는 right는 False로 지정한다.

In [14]:
cat_rights = pd.cut(ages,bins, right=False)

In [15]:
cat_rights

[[18, 25), [18, 25), [25, 35), [18, 25), [25, 35), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 13
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

### labels는 codes로 변경

In [8]:
cat.labels

  """Entry point for launching an IPython kernel.


array([0, 0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [9]:
cat.codes

array([0, 0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

### levels는 categories로 변경

    괄호는 포함하지 않고 대괄호는 포함관계를 표시한다.
    

In [11]:
cat.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

### 범주별로 원소 개수 확인하기

In [13]:
pd.value_counts(cat)

(18, 25]     6
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

## 범주를 부여해서 처리하기

In [27]:
labels = ['youth','youngadult','middleaged', 'Senior']
cat_labels = pd.cut(ages,bins,labels=labels)

In [28]:
cat_labels

[youth, youth, youth, youth, youngadult, ..., youngadult, Senior, middleaged, middleaged, youngadult]
Length: 13
Categories (4, object): [Senior < middleaged < youngadult < youth]

In [29]:
cat_labels.categories

Index(['Senior', 'middleaged', 'youngadult', 'youth'], dtype='object')

### retbins=True를 지정하면 반환결과가 2개이다

In [23]:
a,b = pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)

In [24]:
type(a)

pandas.core.categorical.Categorical

In [25]:
type(b)

numpy.ndarray

### 균등분포로 나누기

In [31]:
normal = pd.cut(np.random.rand(20),4,precision=2)

In [32]:
pd.value_counts(normal)

(0.5, 0.72]      7
(0.053, 0.28]    6
(0.72, 0.94]     4
(0.28, 0.5]      3
dtype: int64

### 위치를 부여해서 처리하기 

In [33]:
help(pd.qcut)

Help on function qcut in module pandas.core.reshape.tile:

qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce a Categorical object indicating
    quantile membership for each data point.
    
    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    

### 세분화한 후에 이를 describe로 확인하면 범주별로 차지한 빈도를 표시

In [40]:
r = pd.qcut(range(5), 3, labels=["good","medium","bad"])

In [42]:
r.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
bad,2,0.4
good,2,0.4
medium,1,0.2


In [37]:
q = pd.qcut(np.random.randn(1000),[0,0.1,0.5,0.9,1])

In [38]:
pd.value_counts(q)

(0.0415, 1.312]     400
(-1.275, 0.0415]    400
(1.312, 3.396]      100
(-3.355, -1.275]    100
dtype: int64

In [39]:
q.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-3.355, -1.275]",100,0.1
"(-1.275, 0.0415]",400,0.4
"(0.0415, 1.312]",400,0.4
"(1.312, 3.396]",100,0.1
