# pandas-cheatsheet-数値の範囲でまとめる(cut)

## 数値の範囲で分類する
* http://ailaby.com/cut_qcut/

In [8]:
import numpy as np
# Draw six random integers from 1 to 10 inclusive (`high` is exclusive).
data = np.random.randint(low=1, high=11, size=6)
data

array([10,  2,  2,  2,  1,  6])

In [9]:
import pandas as pd
# Bin the values into the half-open intervals (0, 3], (3, 7] and (7, 10].
pd.cut(data, bins=[0, 3, 7, 10])

[(7, 10], (0, 3], (0, 3], (0, 3], (0, 3], (3, 7]]
Categories (3, object): [(0, 3] < (3, 7] < (7, 10]]

* `(` は下限値を含まない、`]` は上限値を含むという意味です（データが整数の場合、以下の範囲になります）。
    * (0, 3] は 1～3 を意味します。
    * (3, 7] は 4～7 を意味します。
    * (7, 10] は 8～10 を意味します。

In [10]:
# Same bins as before, but named, then count how many values fall in each.
labels = ['Low', 'Middle', 'High']
binned = pd.cut(data, [0, 3, 7, 10], labels=labels)
binned.value_counts()

Low       4
Middle    1
High      1
dtype: int64

* [pandas で年齢階級をつくる - Qiita](https://qiita.com/kshigeru/items/bfa8c11d1e6487c791d3)

In [12]:
# Show the kernel's working directory; the relative CSV path read in the
# next cell is resolved against it.
# os.getcwd() is used instead of the bare `pwd` automagic, which only works
# inside IPython and stops working once a variable named `pwd` exists.
import os
os.getcwd()

'G:\\Repository'

In [71]:
import pandas as pd
# Load the CSV and discard the 'Unnamed: 0' column
# (presumably a saved row index -- TODO confirm), then preview the first rows.
df = pd.read_csv('data/y0207000.csv').drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
0,1877543,1952306,1998647,2315990,1576913,1877521,1586504,1213685,1171652,1045975
1,1392988,1799131,1674304,2522681,1594841,1862600,1636235,1260478,1166160,1045417
2,1387685,1749485,1769555,2479988,1549114,1818359,1706055,1301517,1192157,1074194
3,1391499,1773731,1831578,2346977,1513111,1822747,1749163,1343438,1189303,1069540
4,1408000,1736482,1853925,1539821,1610454,1424592,1837459,1373779,1184826,1061622


In [72]:
# Preview the last five rows.
df.tail(5)

Unnamed: 0,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
96,459,503,548,686,715,1612,3595,9943,29798,77372
97,430,565,316,255,339,982,2305,6348,21687,55845
98,302,546,205,151,198,555,1334,3962,14648,39826
99,44,450,158,90,121,297,828,2356,9926,26087
100,113,105,187,97,144,308,989,3223,12256,43882


In [73]:
# Show each column's dtype as loaded from the CSV.
df.dtypes

1920    object
1930    object
1940    object
1950    object
1960    object
1970    object
1980    object
1990    object
2000    object
2010    object
dtype: object

* [Python pandasでカンマタブ混じりのcsvファイルを読み込み整形する - Qiita](https://qiita.com/sfujita99/items/23077362b26b887a842f)

In [74]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(101, 10)

In [75]:
# First three values of the first column, to inspect the raw cell format.
df.iloc[:3, 0]

0    1,877,543
1    1,392,988
2    1,387,685
Name: 1920, dtype: object

## カンマを取り除き、整数に変換する
* [Python pandasでカンマタブ混じりのcsvファイルを読み込み整形する - Qiita](https://qiita.com/sfujita99/items/23077362b26b887a842f)
    * 一部修正
    ```
    for i in range(df.shape[1]):
        df[i] = df[df.columns[i]].astype(float)
    ```
* [<Python, pandas> 文字列の置換 - ねこゆきのメモ](http://nekoyukimmm.hatenablog.com/entry/2016/11/11/144255)
    * df['column'].str.replace('','')を使う。

In [77]:
# Strip thousands separators (e.g. '1,877,543') and convert each column to int.
# Columns that are already integer-typed are skipped: the `.str` accessor
# raises on non-string data, so without this guard the cell fails when it is
# re-run, or when read_csv has already parsed a comma-free column as integers.
for col in df.columns:
    if df[col].dtype.kind in 'iu':
        continue
    df[col] = df[col].str.replace(',', '').astype(int)

In [68]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
0,1877543,1952306,1998647,2315990,1576913,1877521,1586504,1213685,1171652,1045975
1,1392988,1799131,1674304,2522681,1594841,1862600,1636235,1260478,1166160,1045417
2,1387685,1749485,1769555,2479988,1549114,1818359,1706055,1301517,1192157,1074194
3,1391499,1773731,1831578,2346977,1513111,1822747,1749163,1343438,1189303,1069540
4,1408000,1736482,1853925,1539821,1610454,1424592,1837459,1373779,1184826,1061622


In [82]:
# Show each column's dtype again to check the result of the integer conversion.
df.dtypes

1920    int32
1930    int32
1940    int32
1950    int32
1960    int32
1970    int32
1980    int32
1990    int32
2000    int32
2010    int32
dtype: object

In [79]:
# List the column labels.
df.columns

Index(['1920', '1930', '1940', '1950', '1960', '1970', '1980', '1990', '2000',
       '2010'],
      dtype='object')

## カンマが取り除かれていれば、ここで変換してもいい。

In [81]:
# Cast the whole frame to int in one step, then write the result back
# into the existing columns.
as_int = df.astype(int)
df[df.columns] = as_int

In [83]:
import numpy as np

# One label per 10-wide bin: "0 - 9", "10 - 19", ..., "90 - 99".
labels = ["%d - %d" % (lo, lo + 9) for lo in range(0, 100, 10)]
print(labels)
# With right=False the bins are left-closed: [0, 10), ..., [90, 100).
# An index value of 100 therefore falls outside every bin and becomes NaN.
bin_edges = np.arange(0, 101, 10)
c = pd.cut(df.index, bin_edges, labels=labels, right=False, include_lowest=True)
c

['0 - 9', '10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79', '80 - 89', '90 - 99']


[0 - 9, 0 - 9, 0 - 9, 0 - 9, 0 - 9, ..., 90 - 99, 90 - 99, 90 - 99, 90 - 99, NaN]
Length: 101
Categories (10, object): [0 - 9 < 10 - 19 < 20 - 29 < 30 - 39 ... 60 - 69 < 70 - 79 < 80 - 89 < 90 - 99]

インデックス 100 の行が NaN になってしまう（`right=False` では最後のビンが 90 以上 100 未満となり、100 はどのビンにも含まれないため）。

* 以下のように、labels を作る `range` の上限を `max_num + interval` に、`pd.cut` に渡す `np.arange` の上限を `max_num + interval + 1` にすれば、100 も `100 - 109` のビンに分類されてうまくいく。

In [84]:
max_num = 100
interval = 10
# One bin is added beyond max_num so that the largest index value (max_num)
# gets its own left-closed bin instead of becoming NaN.
# `interval` drives the label width and both step sizes, so labels and bin
# edges stay in sync when it is changed (assumes max_num is a multiple of
# interval).
labels = [ "{0} - {1}".format(i, i + interval - 1)
           for i in range(0, max_num + interval, interval) ]
print(labels)
c = pd.cut(df.index, np.arange(0, max_num + interval + 1, interval),
           include_lowest=True, right=False,
           labels=labels)
c

['0 - 9', '10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79', '80 - 89', '90 - 99', '100 - 109']


[0 - 9, 0 - 9, 0 - 9, 0 - 9, 0 - 9, ..., 90 - 99, 90 - 99, 90 - 99, 90 - 99, 100 - 109]
Length: 101
Categories (11, object): [0 - 9 < 10 - 19 < 20 - 29 < 30 - 39 ... 70 - 79 < 80 - 89 < 90 - 99 < 100 - 109]

In [86]:
# Sum every column within each index-range bin held in `c`.
df.groupby(by=c).sum()

Unnamed: 0,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
0 - 9,14314635,16778220,17961607,20728122,17049068,16965066,18547450,13959454,11925887,10882409
10 - 19,11520624,13340649,15816378,17267585,20326076,16921989,17231873,18533872,14034777,11984392
20 - 29,8533259,10367140,11756837,13910662,16527810,19749434,16882381,16870834,18211769,13720134
30 - 39,7020188,7798498,9370143,10250310,13555835,16578939,19973312,16791465,16891475,18127846
40 - 49,5902331,6332741,7041270,8487529,9835689,13217564,16427887,19676302,16716227,16774981
50 - 59,4074855,5046797,5446760,6137697,7842597,9230197,12813527,15813274,19176162,16308233
60 - 69,2968342,2977915,3782574,4074610,5092019,6709761,8429928,11848590,14841772,18247422
70 - 79,1378630,1478319,1541314,1967261,2518482,3401952,5059662,6835747,10051176,12904315
80 - 89,236419,315624,338472,354836,638738,879221,1503633,2665908,4147012,6768852
90 - 99,13657,13997,18567,16258,32043,65629,118391,286141,688769,1318463
