In [1]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

### Pandasでデータを取り出す方法

In [2]:
dframe = DataFrame(np.arange(25).reshape((5,5)),index=['NYC','LA','SF','DC','Chi'],columns=['A','B','C','D','E'])
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [3]:
dframe['B']

NYC     1
LA      6
SF     11
DC     16
Chi    21
Name: B, dtype: int64

In [4]:
dframe[['B', 'E']]

Unnamed: 0,B,E
NYC,1,4
LA,6,9
SF,11,14
DC,16,19
Chi,21,24


In [5]:
dframe.iloc[0]

A    0
B    1
C    2
D    3
E    4
Name: NYC, dtype: int64

In [19]:
dframe.iloc[:,:3]

Unnamed: 0,A,B,C
NYC,0,1,2
LA,5,6,7
SF,10,11,12
DC,15,16,17
Chi,20,21,22


In [18]:
dframe.loc[:,['A','C']]

Unnamed: 0,A,C
NYC,0,2
LA,5,7
SF,10,12
DC,15,17
Chi,20,22


In [20]:
dframe.loc[['NYC', 'DC'], :]

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
DC,15,16,17,18,19


In [24]:
dframe[dframe['C'] > 10]

Unnamed: 0,A,B,C,D,E
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [25]:
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


### DataFrameの結合

In [55]:
df1 = DataFrame(np.arange(9).reshape(3,3),columns=list('ABC'))
df2 = DataFrame(np.arange(9).reshape(3,3),columns=list('XYZ'))

In [56]:
df3 = df1 + df2

In [57]:
df3

Unnamed: 0,A,B,C,X,Y,Z
0,,,,,,
1,,,,,,
2,,,,,,


In [58]:
pd.concat([df1,df2], axis=1)

Unnamed: 0,A,B,C,X,Y,Z
0,0,1,2,0,1,2
1,3,4,5,3,4,5
2,6,7,8,6,7,8


In [64]:
df3 = DataFrame(np.arange(9).reshape(3,3),columns=list('ABC'),index=['a','b','c'])
df4 = DataFrame(np.arange(9).reshape(3,3),columns=list('ABC'),index=['x','y','z'])

In [65]:
pd.concat([df3,df4], axis=0)

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8
x,0,1,2
y,3,4,5
z,6,7,8


In [68]:
pd.concat([df3,df4], axis=0, ignore_index=True)

Unnamed: 0,A,B,C
0,0,1,2
1,3,4,5
2,6,7,8
3,0,1,2
4,3,4,5
5,6,7,8


In [69]:
# いくつかサンプルになるデータを作ります。
ser1 = Series([2,np.nan,4,np.nan,6,np.nan],
           index=['Q','R','S','T','U','V'])
# 長さを同じにします。
ser2 = Series(np.arange(len(ser1), dtype=np.float64),
           index=['Q','R','S','T','U','V'])

In [70]:
ser1

Q    2.0
R    NaN
S    4.0
T    NaN
U    6.0
V    NaN
dtype: float64

In [71]:
ser2

Q    0.0
R    1.0
S    2.0
T    3.0
U    4.0
V    5.0
dtype: float64

In [72]:
ser1.combine_first(ser2)

Q    2.0
R    1.0
S    4.0
T    3.0
U    6.0
V    5.0
dtype: float64

### 行・列の削除

In [73]:
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [75]:
dframe.drop("LA")

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [77]:
dframe.drop(['LA', 'NYC'])

Unnamed: 0,A,B,C,D,E
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [78]:
dframe.drop(['B', 'E'], axis=1)

Unnamed: 0,A,C,D
NYC,0,2,3
LA,5,7,8
SF,10,12,13
DC,15,17,18
Chi,20,22,23


### 並べ替え

In [91]:
df3 = DataFrame(np.random.randn(3,3),columns=list('ABC'),index=['b','a','c'])

In [92]:
df3

Unnamed: 0,A,B,C
b,1.397619,1.356115,-0.593621
a,0.063988,1.018302,-0.191539
c,-1.378215,0.334447,0.011485


In [93]:
df3.sort_index(ascending=False)

Unnamed: 0,A,B,C
c,-1.378215,0.334447,0.011485
b,1.397619,1.356115,-0.593621
a,0.063988,1.018302,-0.191539


In [94]:
df3.sort_index()

Unnamed: 0,A,B,C
a,0.063988,1.018302,-0.191539
b,1.397619,1.356115,-0.593621
c,-1.378215,0.334447,0.011485


In [95]:
df3.sort_values(by='B')

Unnamed: 0,A,B,C
c,-1.378215,0.334447,0.011485
a,0.063988,1.018302,-0.191539
b,1.397619,1.356115,-0.593621


In [96]:
df3.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C
b,1.397619,1.356115,-0.593621
a,0.063988,1.018302,-0.191539
c,-1.378215,0.334447,0.011485


In [98]:
from sklearn.datasets import load_boston
boston = load_boston()

In [107]:
df = DataFrame(boston.data)
df.columns = boston.feature_names

In [109]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.593761,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.596783,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.647423,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [110]:
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      