In [None]:
import numpy as np
import pandas as pd

In [None]:
# Create a Series by calling the pd.Series constructor with values and index labels.
series = pd.Series([1, 2, 3, 4, 5], index=list('ABCDE'))
series

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [None]:
# Ten consecutive integers (1..10) labelled with the letters 'a' through 'j'.
index = list('abcdefghij')
array = np.arange(1, 11)

series = pd.Series(array, index=index)
series

a     1
b     2
c     3
d     4
e     5
f     6
g     7
h     8
i     9
j    10
dtype: int64

In [None]:
index

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

### **Seriesクラスの基本的な操作方法**

データの選択

辞書型のデータのように直接インデックスを指定してデータを取り出すこともできるが、多くの場合はlocやilocを使う。
locはインデックスのラベルで指定し、ilocはデータの位置(0から始まる整数)で指定する。

In [None]:
# Rebuild the five-element example Series (labels 'A'..'E', values 1..5).
series = pd.Series([1, 2, 3, 4, 5], index=list('ABCDE'))
series

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [None]:
series['A']

1

In [None]:
series['A':'D']

A    1
B    2
C    3
D    4
dtype: int64

 **loc**

In [None]:
series.loc['A']

1

In [None]:
series.loc['A':'D']

A    1
B    2
C    3
D    4
dtype: int64

In [None]:
# Pass a list of labels to select non-adjacent elements.
series.loc[['B','D']]

B    2
D    4
dtype: int64

iloc

In [None]:
series.iloc[1]  # integer position 1 (zero-based), i.e. the element labelled 'B'

2

In [None]:
series.iloc[:2]

A    1
B    2
dtype: int64

# DataFrameの生成

In [None]:
# No index/columns are given, so rows and columns get the default labels 0, 1, 2.
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [None]:
# Build a 3x3 frame with explicit row labels (index) and column labels.
df = pd.DataFrame(data=[[1,2,3],[4,5,6],[7,8,9]],
                  index=['A','B','C'],
                  columns=['C1','C2','C3'])
df

Unnamed: 0,C1,C2,C3
A,1,2,3
B,4,5,6
C,7,8,9


head()メソッドを使うと先頭の5行(デフォルト)だけを取り出せる

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from sklearn.datasets import load_iris

# Load the iris dataset and expose its feature matrix as a DataFrame with named columns.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


# データの大まかな内容を理解する

形状確認

In [None]:
iris_df.shape

(150, 4)

各種統計量を確認

In [None]:
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


データと型を確認

In [None]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


ユニークな値の数を確認

In [None]:
iris_df.nunique()  # number of distinct (non-duplicated) values in each column

sepal length (cm)    35
sepal width (cm)     23
petal length (cm)    43
petal width (cm)     22
dtype: int64

欠損値の数を確認

In [None]:
iris_df.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

行名・列名を確認

In [None]:
iris_df.index

RangeIndex(start=0, stop=150, step=1)

In [None]:
iris_df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

# データの選択と抽出
DataFrameクラスの中から狙ったデータを自由に取り出すスキル

In [None]:
# Seed NumPy's global RNG so the random frame below is the same on every run.
np.random.seed(10)
df = pd.DataFrame(
    np.random.randn(5, 5),
    index=list('ABCDE'),
    columns=['C1', 'C2', 'C3', 'C4', 'C5'],
)
df

Unnamed: 0,C1,C2,C3,C4,C5
A,1.331587,0.715279,-1.5454,-0.008384,0.621336
B,-0.720086,0.265512,0.108549,0.004291,-0.1746
C,0.433026,1.203037,-0.965066,1.028274,0.22863
D,0.445138,-1.136602,0.135137,1.484537,-1.079805
E,-1.977728,-1.743372,0.26607,2.384967,1.123691


基本的な選択

In [None]:
df.loc['A']

C1    1.331587
C2    0.715279
C3   -1.545400
C4   -0.008384
C5    0.621336
Name: A, dtype: float64

In [None]:
df.loc[:,'C1']  # all rows, column 'C1' only

A    1.331587
B   -0.720086
C    0.433026
D    0.445138
E   -1.977728
Name: C1, dtype: float64

In [None]:
df.loc['A',['C1','C3']]

C1    1.331587
C3   -1.545400
Name: A, dtype: float64

条件による選択

In [None]:
df>0

Unnamed: 0,C1,C2,C3,C4,C5
A,True,True,False,False,True
B,False,True,True,True,False
C,True,True,False,True,True
D,True,False,True,True,False
E,False,False,True,True,True


In [None]:
df[df>0]

Unnamed: 0,C1,C2,C3,C4,C5
A,1.331587,0.715279,,,0.621336
B,,0.265512,0.108549,0.004291,
C,0.433026,1.203037,,1.028274,0.22863
D,0.445138,,0.135137,1.484537,
E,,,0.26607,2.384967,1.123691


In [None]:
df['C1']>0

A     True
B    False
C     True
D     True
E    False
Name: C1, dtype: bool

In [None]:
df[df['C1']>0]

Unnamed: 0,C1,C2,C3,C4,C5
A,1.331587,0.715279,-1.5454,-0.008384,0.621336
C,0.433026,1.203037,-0.965066,1.028274,0.22863
D,0.445138,-1.136602,0.135137,1.484537,-1.079805


In [None]:
df[ (df['C1']>0) & (df['C1']<1) ]  # rows whose C1 is greater than 0 and less than 1

Unnamed: 0,C1,C2,C3,C4,C5
C,0.433026,1.203037,-0.965066,1.028274,0.22863
D,0.445138,-1.136602,0.135137,1.484537,-1.079805


データの追加と削除
>データを削除するにはdropメソッドを使う。
inplaceという引数をTrueにすると、もとのdfが更新され、削除後のdfに置き換わる。

In [None]:
df.drop(columns=['C1'])

Unnamed: 0,C2,C3,C4,C5
A,0.715279,-1.5454,-0.008384,0.621336
B,0.265512,0.108549,0.004291,-0.1746
C,1.203037,-0.965066,1.028274,0.22863
D,-1.136602,0.135137,1.484537,-1.079805
E,-1.743372,0.26607,2.384967,1.123691


In [None]:
df  # not updated: drop() without inplace=True returned a new frame and left df as it was

Unnamed: 0,C1,C2,C3,C4,C5
A,1.331587,0.715279,-1.5454,-0.008384,0.621336
B,-0.720086,0.265512,0.108549,0.004291,-0.1746
C,0.433026,1.203037,-0.965066,1.028274,0.22863
D,0.445138,-1.136602,0.135137,1.484537,-1.079805
E,-1.977728,-1.743372,0.26607,2.384967,1.123691


In [None]:
# inplace=True removes row 'A' from df itself instead of returning a new frame.
# NOTE: re-running this cell raises KeyError because 'A' has already been removed.
df.drop(index=['A'],inplace=True)

In [None]:
df  # updated: row 'A' is gone from df itself

Unnamed: 0,C1,C2,C3,C4,C5
B,-0.720086,0.265512,0.108549,0.004291,-0.1746
C,0.433026,1.203037,-0.965066,1.028274,0.22863
D,0.445138,-1.136602,0.135137,1.484537,-1.079805
E,-1.977728,-1.743372,0.26607,2.384967,1.123691


# 欠損値の処理
欠損値を処理する場合dropna,fillnaを使うことが多い
・dropna:欠損しているデータを削除
・fillna:欠損値を別の値で埋める

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, B to E
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   C1      4 non-null      float64
 1   C2      4 non-null      float64
 2   C3      4 non-null      float64
 3   C4      4 non-null      float64
 4   C5      4 non-null      float64
dtypes: float64(5)
memory usage: 192.0+ bytes


In [None]:
# Example frame with missing values (NaN) in columns C2, C3 and C4.
df = pd.DataFrame(
    [[1, 2, 4, np.nan, 4],
     [5, np.nan, 6, np.nan, 7],
     [8, 9, 10, np.nan, 11],
     [12, np.nan, np.nan, np.nan, 13],
     [14, 15, 16, 17, 18]],
    index=list('ABCDE'),
    columns=['C1', 'C2', 'C3', 'C4', 'C5'],
)
df

Unnamed: 0,C1,C2,C3,C4,C5
A,1,2.0,4.0,,4
B,5,,6.0,,7
C,8,9.0,10.0,,11
D,12,,,,13
E,14,15.0,16.0,17.0,18


In [None]:
df.dropna()  # drop every row that contains at least one missing value

Unnamed: 0,C1,C2,C3,C4,C5
E,14,15.0,16.0,17.0,18


In [None]:
df['C2'].dropna()

A     2.0
C     9.0
E    15.0
Name: C2, dtype: float64

特定の列に欠損値がある行を削除

In [None]:
df['C2'].isnull()

A    False
B     True
C    False
D     True
E    False
Name: C2, dtype: bool

In [None]:
# Keep only the rows whose 'C2' value is present, e.g. when C2 is the column
# that matters most and rows lacking it should always be discarded.
# notnull() is the direct form of the former `isnull() == False` comparison.
df[df['C2'].notnull()]

Unnamed: 0,C1,C2,C3,C4,C5
A,1,2.0,4.0,,4
C,8,9.0,10.0,,11
E,14,15.0,16.0,17.0,18


欠損していない値の最小数(thresh)を指定して削除

In [None]:
df.dropna(thresh=3)  # keep only rows with at least 3 non-missing values (row D has just 2, so it is dropped)

Unnamed: 0,C1,C2,C3,C4,C5
A,1,2.0,4.0,,4
B,5,,6.0,,7
C,8,9.0,10.0,,11
E,14,15.0,16.0,17.0,18


In [None]:
df.dropna(thresh=3, axis=1)  # same rule per column: keep columns with at least 3 non-missing values (C4 has just 1)

Unnamed: 0,C1,C2,C3,C5
A,1,2.0,4.0,4
B,5,,6.0,7
C,8,9.0,10.0,11
D,12,,,13
E,14,15.0,16.0,18


欠損値を他の値に置換

In [None]:
df['C2'].fillna(df['C2'].mean())  # replace the missing C2 entries with the mean of C2

A     2.000000
B     8.666667
C     9.000000
D     8.666667
E    15.000000
Name: C2, dtype: float64

In [None]:
df.fillna(df.mean())

Unnamed: 0,C1,C2,C3,C4,C5
A,1,2.0,4.0,17.0,4
B,5,8.666667,6.0,17.0,7
C,8,9.0,10.0,17.0,11
D,12,8.666667,9.0,17.0,13
E,14,15.0,16.0,17.0,18


# カテゴリカルなデータの操作
数値データのみならず、カテゴリカルなデータの操作もしなければならない

In [None]:
# Small frame with a categorical column C1 (one entry missing) and two numeric columns.
df = pd.DataFrame({
    'C1': ['A', 'A', 'A', 'B', 'B', 'C', np.nan],
    'C2': [20, 50, 60, 80, 100, 30, 50],
    'C3': [40, 200, 100, 500, 40, 200, 40],
})
df

Unnamed: 0,C1,C2,C3
0,A,20,40
1,A,50,200
2,A,60,100
3,B,80,500
4,B,100,40
5,C,30,200
6,,50,40


カテゴリーとデータの数を確認

In [None]:
df['C1'].value_counts()

A    3
B    2
C    1
Name: C1, dtype: int64

特定のカテゴリーのデータだけを取り出す

In [None]:
df[df['C1'] == 'A']

Unnamed: 0,C1,C2,C3
0,A,20,40
1,A,50,200
2,A,60,100


カテゴリ変数の欠損値を埋める

In [None]:
df['C1'].fillna(df['C1'].mode()[0])  # fill missing categories with the most frequent value (the mode)

0    A
1    A
2    A
3    B
4    B
5    C
6    A
Name: C1, dtype: object

In [None]:
df  # not updated: the fillna result above was not assigned back to df

Unnamed: 0,C1,C2,C3
0,A,20,40
1,A,50,200
2,A,60,100
3,B,80,500
4,B,100,40
5,C,30,200
6,,50,40


In [None]:
# Assign the filled column back so that df itself holds the result.
df['C1']=df['C1'].fillna(df['C1'].mode()[0])

In [None]:
df

Unnamed: 0,C1,C2,C3
0,A,20,40
1,A,50,200
2,A,60,100
3,B,80,500
4,B,100,40
5,C,30,200
6,A,50,40


割合を計算

In [None]:
# Share of rows in each C1 category, rounded to two decimal places.
(df['C1'].value_counts() / len(df)).round(2)

A    0.57
B    0.29
C    0.14
Name: C1, dtype: float64

グループ化して各種統計量を計算する

In [None]:
df.groupby('C1')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f0e8555d150>

In [None]:
df.groupby('C1').sum()

Unnamed: 0_level_0,C2,C3
C1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,180,380
B,180,540
C,30,200


In [None]:
df.groupby('C1').mean()

Unnamed: 0_level_0,C2,C3
C1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,45,95
B,90,270
C,30,200


In [None]:
df.groupby('C1').max()

Unnamed: 0_level_0,C2,C3
C1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,60,200
B,100,500
C,30,200


# DataFrameの結合
結合方法は様々あるが、ここではconcat関数を使う

In [None]:
# Two 5x5 random frames with the same columns but disjoint row labels.
# NOTE(review): no seed is set in this cell, so the values depend on the state of
# NumPy's global RNG when the cell runs.
df_1 = pd.DataFrame(
    np.random.randn(5, 5),
    index=list('ABCDE'),
    columns=['C1', 'C2', 'C3', 'C4', 'C5'],
)
df_2 = pd.DataFrame(
    np.random.randn(5, 5),
    index=list('FGHIJ'),
    columns=['C1', 'C2', 'C3', 'C4', 'C5'],
)

In [None]:
df_1

Unnamed: 0,C1,C2,C3,C4,C5
A,1.672622,0.099149,1.397996,-0.271248,0.613204
B,-0.267317,-0.549309,0.132708,-0.476142,1.308473
C,0.195013,0.40021,-0.337632,1.256472,-0.73197
D,0.660232,-0.350872,-0.939433,-0.489337,-0.804591
E,-0.212698,-0.33914,0.31217,0.565153,-0.14742


In [None]:
df_2

Unnamed: 0,C1,C2,C3,C4,C5
F,-0.025905,0.289094,-0.539879,0.70816,0.842225
G,0.203581,2.394704,0.917459,-0.112272,-0.36218
H,-0.232182,-0.501729,1.128785,-0.69781,-0.081122
I,-0.529296,1.046183,-1.418556,-0.362499,-0.121906
J,0.319356,0.460903,-0.21579,0.989072,0.314754


In [None]:
pd.concat([df_1,df_2])

Unnamed: 0,C1,C2,C3,C4,C5
A,1.672622,0.099149,1.397996,-0.271248,0.613204
B,-0.267317,-0.549309,0.132708,-0.476142,1.308473
C,0.195013,0.40021,-0.337632,1.256472,-0.73197
D,0.660232,-0.350872,-0.939433,-0.489337,-0.804591
E,-0.212698,-0.33914,0.31217,0.565153,-0.14742
F,-0.025905,0.289094,-0.539879,0.70816,0.842225
G,0.203581,2.394704,0.917459,-0.112272,-0.36218
H,-0.232182,-0.501729,1.128785,-0.69781,-0.081122
I,-0.529296,1.046183,-1.418556,-0.362499,-0.121906
J,0.319356,0.460903,-0.21579,0.989072,0.314754


In [None]:
pd.concat([df_1,df_2],axis=1, sort=True)

Unnamed: 0,C1,C2,C3,C4,C5,C1.1,C2.1,C3.1,C4.1,C5.1
A,1.672622,0.099149,1.397996,-0.271248,0.613204,,,,,
B,-0.267317,-0.549309,0.132708,-0.476142,1.308473,,,,,
C,0.195013,0.40021,-0.337632,1.256472,-0.73197,,,,,
D,0.660232,-0.350872,-0.939433,-0.489337,-0.804591,,,,,
E,-0.212698,-0.33914,0.31217,0.565153,-0.14742,,,,,
F,,,,,,-0.025905,0.289094,-0.539879,0.70816,0.842225
G,,,,,,0.203581,2.394704,0.917459,-0.112272,-0.36218
H,,,,,,-0.232182,-0.501729,1.128785,-0.69781,-0.081122
I,,,,,,-0.529296,1.046183,-1.418556,-0.362499,-0.121906
J,,,,,,0.319356,0.460903,-0.21579,0.989072,0.314754


# 関数の適用
特定のデータに関数を適用する場合は、applyメソッドを使うと便利

In [None]:
# Fresh 5x5 random frame used by the apply() examples.
df = pd.DataFrame(
    np.random.randn(5, 5),
    index=list('ABCDE'),
    columns=['C1', 'C2', 'C3', 'C4', 'C5'],
)
df

Unnamed: 0,C1,C2,C3,C4,C5
A,2.467651,-1.508321,0.620601,-1.045133,-0.798009
B,1.985085,1.744814,-1.856185,-0.222774,-0.065848
C,-2.131712,-0.048831,0.393341,0.217265,-1.994394
D,1.107708,0.244544,-0.061912,-0.753893,0.711959
E,0.918269,-0.482093,0.089588,0.826999,-1.954512


In [None]:
def square(x):
  """Return ``x`` raised to the power of two."""
  result = x ** 2
  return result

In [None]:
df['C1'].apply(square)

A    6.089302
B    3.940561
C    4.544197
D    1.227018
E    0.843218
Name: C1, dtype: float64

# 複数の引数を取る場合
呼び出し時に特定のデータを指定する

In [None]:
def add(x,y):
  """Return the result of ``x + y``."""
  total = x + y
  return total

In [None]:
add(df['C1'],df['C2'])

A    0.959330
B    3.729899
C   -2.180543
D    1.352252
E    0.436176
dtype: float64

In [None]:
def add(df):
  """Return the sum of the 'C1' and 'C2' entries of ``df``.

  Any object indexable by the keys 'C1' and 'C2' works; when used with
  ``DataFrame.apply(..., axis=1)`` the argument is a single row.

  NOTE(review): this reuses the name of the two-argument ``add`` defined
  earlier in the notebook and therefore replaces it.
  """
  first, second = df['C1'], df['C2']
  return first + second

In [None]:
df.apply(add,axis=1)

A    0.959330
B    3.729899
C   -2.180543
D    1.352252
E    0.436176
dtype: float64

複数の戻り値がある場合

In [None]:
def square_and_cude(x):
  """Return the square and the cube of ``x`` as a two-element Series.

  Element 0 holds ``x**2`` and element 1 holds ``x**3``. Returning a Series
  lets ``Series.apply`` expand the two results into two columns.

  The cube was previously computed as ``x**23`` (a typo), which produced
  very large values instead of the cube.
  """
  return pd.Series([x**2, x**3])

In [None]:
# square_and_cude returns a two-element Series per value, so apply() yields two
# columns; they are stored in df as the new columns 'squared' and 'cuded'.
df[['squared','cuded']]=df['C1'].apply(square_and_cude)
df

Unnamed: 0,C1,C2,C3,C4,C5,squared,cuded
A,2.467651,-1.508321,0.620601,-1.045133,-0.798009,6.089302,1053237000.0
B,1.985085,1.744814,-1.856185,-0.222774,-0.065848,3.940561,7061833.0
C,-2.131712,-0.048831,0.393341,0.217265,-1.994394,4.544197,-36371120.0
D,1.107708,0.244544,-0.061912,-0.753893,0.711959,1.227018,10.51438
E,0.918269,-0.482093,0.089588,0.826999,-1.954512,0.843218,0.1407051
