In [1]:
import pandas as pd
import numpy as np

### データの読み込み

In [2]:
# Load the sample dataset from 'data.csv' (path is relative to the working directory).
df = pd.read_csv('data.csv')

In [3]:
# Display the loaded DataFrame.
df

Unnamed: 0,Name,Sex,Age,Height,Weight,Income,Register,Login
0,A,M,57,153.1,45.7,350,2011/06/11 19:52,2016/06/07 19:33
1,B,M,28,158.1,46.1,570,2009/07/05 11:09,2018/07/08 03:30
2,C,F,46,158.8,,2130,2008/05/07 21:10,2017/06/04 13:07
3,D,M,55,162.3,62.8,670,1999/08/12 03:05,2017/08/12 16:07
4,E,F,43,162.7,65.8,400,1997/07/12 01:59,2017/07/10 22:48
5,F,F,40,160.1,45.6,900,2013/07/03 22:32,2018/12/04 14:37
6,G,M,49,166.5,72.9,32500,2001/07/30 01:35,2016/02/05 06:32
7,H,F,49,151.7,42.0,1200,2001/12/07 08:55,2017/12/02 15:15
8,I,M,28,166.0,66.2,720,1994/06/14 23:43,2016/05/13 17:59
9,J,F,36,153.4,,330,1997/03/16 21:04,2018/11/26 05:32


### データ情報確認

データの列名

In [4]:
# List the column labels.
df.columns

Index(['Name', 'Sex', 'Age', 'Height', 'Weight', 'Income', 'Register',
       'Login'],
      dtype='object')

データ各列の型

In [5]:
# Show the dtype pandas inferred for each column.
df.dtypes

Name         object
Sex          object
Age           int64
Height      float64
Weight      float64
Income        int64
Register     object
Login        object
dtype: object

データの行数・列数

In [6]:
# (number of rows, number of columns) of the DataFrame.
df.shape

(10, 8)

各列の非欠損要素数・データ型とメモリ使用量を確認

In [7]:
# Print per-column non-null counts and dtypes, plus the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
Name        10 non-null object
Sex         10 non-null object
Age         10 non-null int64
Height      10 non-null float64
Weight      8 non-null float64
Income      10 non-null int64
Register    10 non-null object
Login       10 non-null object
dtypes: float64(2), int64(2), object(4)
memory usage: 720.0+ bytes


データの統計情報(要素数,平均値,標準偏差,最小値,四分位数,最大値)を取得

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Age,Height,Weight,Income
count,10.0,10.0,8.0,10.0
mean,43.1,159.27,55.8875,3977.0
std,10.159287,5.277005,12.189508,10036.351761
min,28.0,151.7,42.0,330.0
25%,37.0,154.575,45.675,442.5
50%,44.5,159.45,54.45,695.0
75%,49.0,162.6,65.9,1125.0
max,57.0,166.5,72.9,32500.0


### 欠損値処理

欠損値の要素数を確認

In [9]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

Name        0
Sex         0
Age         0
Height      0
Weight      2
Income      0
Register    0
Login       0
dtype: int64

欠損値を全て0で補完する

In [10]:
# Return a copy in which every missing value is replaced by 0; df itself is not modified.
df.fillna(value=0)

Unnamed: 0,Name,Sex,Age,Height,Weight,Income,Register,Login
0,A,M,57,153.1,45.7,350,2011/06/11 19:52,2016/06/07 19:33
1,B,M,28,158.1,46.1,570,2009/07/05 11:09,2018/07/08 03:30
2,C,F,46,158.8,0.0,2130,2008/05/07 21:10,2017/06/04 13:07
3,D,M,55,162.3,62.8,670,1999/08/12 03:05,2017/08/12 16:07
4,E,F,43,162.7,65.8,400,1997/07/12 01:59,2017/07/10 22:48
5,F,F,40,160.1,45.6,900,2013/07/03 22:32,2018/12/04 14:37
6,G,M,49,166.5,72.9,32500,2001/07/30 01:35,2016/02/05 06:32
7,H,F,49,151.7,42.0,1200,2001/12/07 08:55,2017/12/02 15:15
8,I,M,28,166.0,66.2,720,1994/06/14 23:43,2016/05/13 17:59
9,J,F,36,153.4,0.0,330,1997/03/16 21:04,2018/11/26 05:32


項目'Height'に欠損値を含むレコードの削除  
※subset未指定の場合は、全項目が対象となる  
※このデータでは'Height'に欠損値がないため、削除されるレコードはありません

In [11]:
# Return a copy without the rows whose 'Height' is missing; df itself is not modified.
df.dropna(axis='index', subset=['Height'])

Unnamed: 0,Name,Sex,Age,Height,Weight,Income,Register,Login
0,A,M,57,153.1,45.7,350,2011/06/11 19:52,2016/06/07 19:33
1,B,M,28,158.1,46.1,570,2009/07/05 11:09,2018/07/08 03:30
2,C,F,46,158.8,,2130,2008/05/07 21:10,2017/06/04 13:07
3,D,M,55,162.3,62.8,670,1999/08/12 03:05,2017/08/12 16:07
4,E,F,43,162.7,65.8,400,1997/07/12 01:59,2017/07/10 22:48
5,F,F,40,160.1,45.6,900,2013/07/03 22:32,2018/12/04 14:37
6,G,M,49,166.5,72.9,32500,2001/07/30 01:35,2016/02/05 06:32
7,H,F,49,151.7,42.0,1200,2001/12/07 08:55,2017/12/02 15:15
8,I,M,28,166.0,66.2,720,1994/06/14 23:43,2016/05/13 17:59
9,J,F,36,153.4,,330,1997/03/16 21:04,2018/11/26 05:32


'Height'の欠損値を'Height'の平均値で補完する

In [12]:
# Fill missing 'Height' values with the column mean.
# The result is a new Series; df itself is not modified.
height = df['Height']
height.fillna(height.mean())

0    153.1
1    158.1
2    158.8
3    162.3
4    162.7
5    160.1
6    166.5
7    151.7
8    166.0
9    153.4
Name: Height, dtype: float64

#### KNeighborsClassifierによる欠損値の補完

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Bin 'Weight' into classes of width 10 (e.g. 45.7 -> 40.0).
# Rows with a missing weight stay NaN in 'Weight_cat'.
df['Weight_cat'] = pd.Categorical(np.floor(df['Weight']/10)*10)

# Rows with a known class are the training data; the remaining rows are the
# ones to impute.
train = df.dropna(subset=['Weight_cat'])
# Take an explicit copy: the prediction is assigned into this frame below, and
# writing into a slice of df would raise SettingWithCopyWarning.
missing_df = df.loc[df.index.difference(train.index), :].copy()

# Fit a 3-nearest-neighbour classifier that predicts the weight class from
# 'Age' and 'Height'.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train[['Age','Height']], train['Weight_cat'])

# Predict the class of the rows whose weight is missing and show the result.
missing_df['Weight_cat'] = knn.predict(missing_df[['Age','Height']])
missing_df

Unnamed: 0,Name,Sex,Age,Height,Weight,Income,Register,Login,Weight_cat
2,C,F,46,158.8,,2130,2008/05/07 21:10,2017/06/04 13:07,40.0
9,J,F,36,153.4,,330,1997/03/16 21:04,2018/11/26 05:32,40.0


### データ型変換

#### 数値型

整数型へ変換  
※8ビット(int8),16ビット(int16),32ビット(int32),64ビット(int64)

In [14]:
# Cast 'Height' to 16-bit integers; the fractional part is dropped (153.1 -> 153).
df['Height'].astype(np.int16)

0    153
1    158
2    158
3    162
4    162
5    160
6    166
7    151
8    166
9    153
Name: Height, dtype: int16

浮動小数点型へ変換  
※16ビット(float16),32ビット(float32),64ビット(float64),128ビット(float128 ※環境依存)

In [15]:
# Cast 'Height' to half precision; values are rounded to the nearest
# representable float16 (153.1 -> 153.125).
df['Height'].astype(np.float16)

0    153.125
1    158.125
2    158.750
3    162.250
4    162.750
5    160.125
6    166.500
7    151.750
8    166.000
9    153.375
Name: Height, dtype: float16

常用対数(底10)へ変換

In [16]:
# Base-10 logarithm of 'Height'. NumPy ufuncs operate on a whole Series at
# once, so no per-element apply/lambda is needed.
np.log10(df['Height'])

0    2.184975
1    2.198932
2    2.200850
3    2.210319
4    2.211388
5    2.204391
6    2.221414
7    2.180986
8    2.220108
9    2.185825
Name: Height, dtype: float64

#### カテゴリ型

カテゴリ型に変換

In [17]:
# Store 'Sex' as a categorical column (categories are the distinct values).
df['Sex'] = df['Sex'].astype('category')

get_dummies関数によるダミー変数化

In [18]:
# One-hot encode 'Sex': one indicator column per category (F, M).
dummy_vars = pd.get_dummies(df['Sex'])
dummy_vars

Unnamed: 0,F,M
0,0,1
1,0,1
2,1,0
3,0,1
4,1,0
5,1,0
6,0,1
7,1,0
8,0,1
9,1,0


連続データをカテゴリ型に変換

In [19]:
# Round each age down to its decade (57 -> 50.0) and store it as a category.
age_decade = np.floor(df['Age'].div(10)).mul(10)
df['Age_cat'] = pd.Categorical(age_decade)
df['Age_cat']

0    50.0
1    20.0
2    40.0
3    50.0
4    40.0
5    40.0
6    40.0
7    40.0
8    20.0
9    30.0
Name: Age_cat, dtype: category
Categories (4, float64): [20.0, 30.0, 40.0, 50.0]

カテゴリの追加と集約

In [20]:
# Register the new category 'over_40' so it can be assigned to 'Age_cat'.
# add_categories returns a new Series (the `inplace` argument was deprecated
# in pandas 1.3 and removed in 2.0), so the result is assigned back.
df['Age_cat'] = df['Age_cat'].cat.add_categories(['over_40'])

In [21]:
# Merge the 40s and 50s classes into the single category 'over_40'.
is_over_40 = df['Age_cat'].isin([40, 50])
df.loc[is_over_40, 'Age_cat'] = 'over_40'
df

Unnamed: 0,Name,Sex,Age,Height,Weight,Income,Register,Login,Weight_cat,Age_cat
0,A,M,57,153.1,45.7,350,2011/06/11 19:52,2016/06/07 19:33,40.0,over_40
1,B,M,28,158.1,46.1,570,2009/07/05 11:09,2018/07/08 03:30,40.0,20
2,C,F,46,158.8,,2130,2008/05/07 21:10,2017/06/04 13:07,,over_40
3,D,M,55,162.3,62.8,670,1999/08/12 03:05,2017/08/12 16:07,60.0,over_40
4,E,F,43,162.7,65.8,400,1997/07/12 01:59,2017/07/10 22:48,60.0,over_40
5,F,F,40,160.1,45.6,900,2013/07/03 22:32,2018/12/04 14:37,40.0,over_40
6,G,M,49,166.5,72.9,32500,2001/07/30 01:35,2016/02/05 06:32,70.0,over_40
7,H,F,49,151.7,42.0,1200,2001/12/07 08:55,2017/12/02 15:15,40.0,over_40
8,I,M,28,166.0,66.2,720,1994/06/14 23:43,2016/05/13 17:59,60.0,20
9,J,F,36,153.4,,330,1997/03/16 21:04,2018/11/26 05:32,,30


#### 日時型

object型からdatetime64型に変換

DataFrameのreload

In [22]:
# Re-read the CSV, discarding the 'Weight_cat' / 'Age_cat' columns and the
# categorical 'Sex' created in the cells above.
df = pd.read_csv('data.csv')

In [23]:
# The CSV stores timestamps as 'YYYY/MM/DD HH:MM' (e.g. '2011/06/11 19:52'),
# so the format must use '/' separators and must not expect seconds.
# Otherwise strict parsing raises ValueError on current pandas.
DATETIME_FORMAT = '%Y/%m/%d %H:%M'
df['Register'] = pd.to_datetime(df['Register'], format=DATETIME_FORMAT)
df['Login'] = pd.to_datetime(df['Login'], format=DATETIME_FORMAT)
df['Register']

0   2011-06-11 19:52:00
1   2009-07-05 11:09:00
2   2008-05-07 21:10:00
3   1999-08-12 03:05:00
4   1997-07-12 01:59:00
5   2013-07-03 22:32:00
6   2001-07-30 01:35:00
7   2001-12-07 08:55:00
8   1994-06-14 23:43:00
9   1997-03-16 21:04:00
Name: Register, dtype: datetime64[ns]

年情報を取得  
月以下は以下の通り  
月	df.dt.month  
日	df.dt.day  
曜日(0:月曜,6:日曜)	df.dt.dayofweek  
時	df.dt.hour  
分	df.dt.minute  
秒	df.dt.second  

In [24]:
import datetime
# Extract the year component of each 'Register' timestamp via the .dt accessor.
df['Register'].dt.year

0    2011
1    2009
2    2008
3    1999
4    1997
5    2013
6    2001
7    2001
8    1994
9    1997
Name: Register, dtype: int64

年の差を計算

In [25]:
# Difference in calendar years, Register minus Login
# (negative when the login is later than the registration).
df['Register'].dt.year.sub(df['Login'].dt.year)

0    -5
1    -9
2    -9
3   -18
4   -20
5    -5
6   -15
7   -16
8   -22
9   -21
dtype: int64

月の差を計算

In [26]:
# Express each timestamp as a month count (year * 12 + month), then subtract.
register_months = df['Register'].dt.year * 12 + df['Register'].dt.month
login_months = df['Login'].dt.year * 12 + df['Login'].dt.month
register_months - login_months

0    -60
1   -108
2   -109
3   -216
4   -240
5    -65
6   -175
7   -192
8   -263
9   -260
dtype: int64

日の差を計算(時単位以下で計算する場合は、単位指定のDをh(時),m(分),s(秒)に置き換える)

In [27]:
# Difference in whole days, Register minus Login.
# astype('timedelta64[D]') is rejected by pandas 2.x (only s/ms/us/ns are
# supported), so divide by a one-day timedelta and round down instead.
# Use 'h', 'm' or 's' in place of 'D' for hours, minutes or seconds.
np.floor((df['Register'] - df['Login']) / np.timedelta64(1, 'D'))

0   -1823.0
1   -3290.0
2   -3315.0
3   -6576.0
4   -7304.0
5   -1980.0
6   -5304.0
7   -5840.0
8   -8004.0
9   -7925.0
dtype: float64

1日加える.その他の時間単位は以下の通り  
(1時間:(hours=1),1分(minutes=1),1秒(seconds=1))

In [28]:
# Shift every 'Register' timestamp forward by one day.
one_day = datetime.timedelta(days=1)
df['Register'] + one_day

0   2011-06-12 19:52:00
1   2009-07-06 11:09:00
2   2008-05-08 21:10:00
3   1999-08-13 03:05:00
4   1997-07-13 01:59:00
5   2013-07-04 22:32:00
6   2001-07-31 01:35:00
7   2001-12-08 08:55:00
8   1994-06-15 23:43:00
9   1997-03-17 21:04:00
Name: Register, dtype: datetime64[ns]

#### 正規化

DataFrameのreload

In [29]:
# Re-read the CSV so the datetime conversions made above do not carry over.
df = pd.read_csv('data.csv')
df

Unnamed: 0,Name,Sex,Age,Height,Weight,Income,Register,Login
0,A,M,57,153.1,45.7,350,2011/06/11 19:52,2016/06/07 19:33
1,B,M,28,158.1,46.1,570,2009/07/05 11:09,2018/07/08 03:30
2,C,F,46,158.8,,2130,2008/05/07 21:10,2017/06/04 13:07
3,D,M,55,162.3,62.8,670,1999/08/12 03:05,2017/08/12 16:07
4,E,F,43,162.7,65.8,400,1997/07/12 01:59,2017/07/10 22:48
5,F,F,40,160.1,45.6,900,2013/07/03 22:32,2018/12/04 14:37
6,G,M,49,166.5,72.9,32500,2001/07/30 01:35,2016/02/05 06:32
7,H,F,49,151.7,42.0,1200,2001/12/07 08:55,2017/12/02 15:15
8,I,M,28,166.0,66.2,720,1994/06/14 23:43,2016/05/13 17:59
9,J,F,36,153.4,,330,1997/03/16 21:04,2018/11/26 05:32


平均0、分散1に変換

In [30]:
from sklearn.preprocessing import StandardScaler

# Cast to float first, then standardize 'Income' to zero mean and unit variance.
df['Income'] = df['Income'].astype(float)
ss = StandardScaler()
# fit_transform takes a 2-D input and returns an (n_rows, 1) array;
# column 0 holds the scaled values.
income_ss = ss.fit_transform(df[['Income']])
df['Income'] = income_ss[:, 0]
df['Income']

0   -0.380935
1   -0.357829
2   -0.193986
3   -0.347326
4   -0.375683
5   -0.323170
6    2.995698
7   -0.291661
8   -0.342074
9   -0.383035
Name: Income, dtype: float64

DataFrameのreload

In [31]:
# Re-read the CSV so 'Income' holds the raw values again
# (the standardization above overwrote the column).
df = pd.read_csv('data.csv')

最小値0、最大値1に変換

In [32]:
from sklearn.preprocessing import MinMaxScaler

# Cast to float first, then rescale 'Income' so its minimum is 0 and its maximum is 1.
df['Income'] = df['Income'].astype(float)
mm = MinMaxScaler()
# fit_transform takes a 2-D input and returns an (n_rows, 1) array;
# column 0 holds the scaled values.
income_mm = mm.fit_transform(df[['Income']])
df['Income'] = income_mm[:, 0]
df['Income']

0    0.000622
1    0.007460
2    0.055953
3    0.010569
4    0.002176
5    0.017718
6    1.000000
7    0.027044
8    0.012123
9    0.000000
Name: Income, dtype: float64

DataFrameのreload

In [33]:
# Re-read the CSV so 'Income' holds the raw values again
# (the min-max scaling above overwrote the column).
df = pd.read_csv('data.csv')

平均値から標準偏差の3倍を超えて離れている要素の削除  
※この場合その要素はありません

In [34]:
# Keep only the rows whose 'Income' lies within 3 standard deviations of the mean.
# ddof=0 gives the population standard deviation, which is what np.std uses.
income = df['Income']
z_score = (income - income.mean()).abs() / income.std(ddof=0)
df[z_score <= 3]

Unnamed: 0,Name,Sex,Age,Height,Weight,Income,Register,Login
0,A,M,57,153.1,45.7,350,2011/06/11 19:52,2016/06/07 19:33
1,B,M,28,158.1,46.1,570,2009/07/05 11:09,2018/07/08 03:30
2,C,F,46,158.8,,2130,2008/05/07 21:10,2017/06/04 13:07
3,D,M,55,162.3,62.8,670,1999/08/12 03:05,2017/08/12 16:07
4,E,F,43,162.7,65.8,400,1997/07/12 01:59,2017/07/10 22:48
5,F,F,40,160.1,45.6,900,2013/07/03 22:32,2018/12/04 14:37
6,G,M,49,166.5,72.9,32500,2001/07/30 01:35,2016/02/05 06:32
7,H,F,49,151.7,42.0,1200,2001/12/07 08:55,2017/12/02 15:15
8,I,M,28,166.0,66.2,720,1994/06/14 23:43,2016/05/13 17:59
9,J,F,36,153.4,,330,1997/03/16 21:04,2018/11/26 05:32


DataFrameのreload

In [35]:
# Re-read the CSV to start the next section from freshly loaded data.
df = pd.read_csv('data.csv')

PCAによる次元圧縮

In [36]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
# Create the PCA object;
# n_components is the number of dimensions kept after the transform.
pca_values = pca.fit_transform(df[['Age', 'Height']])
# Compress 'Age' and 'Height' into a single dimension.
pca_values

array([[-14.62922619],
       [ 14.78565977],
       [ -2.93727202],
       [-11.35599799],
       [  0.58110191],
       [  3.18588552],
       [ -4.8252386 ],
       [ -6.90541351],
       [ 15.8960234 ],
       [  6.20447771]])

### データ分割（K分割交差検証）

In [37]:
# Re-read the CSV to start the data-splitting section from freshly loaded data.
df = pd.read_csv('data.csv')
df

Unnamed: 0,Name,Sex,Age,Height,Weight,Income,Register,Login
0,A,M,57,153.1,45.7,350,2011/06/11 19:52,2016/06/07 19:33
1,B,M,28,158.1,46.1,570,2009/07/05 11:09,2018/07/08 03:30
2,C,F,46,158.8,,2130,2008/05/07 21:10,2017/06/04 13:07
3,D,M,55,162.3,62.8,670,1999/08/12 03:05,2017/08/12 16:07
4,E,F,43,162.7,65.8,400,1997/07/12 01:59,2017/07/10 22:48
5,F,F,40,160.1,45.6,900,2013/07/03 22:32,2018/12/04 14:37
6,G,M,49,166.5,72.9,32500,2001/07/30 01:35,2016/02/05 06:32
7,H,F,49,151.7,42.0,1200,2001/12/07 08:55,2017/12/02 15:15
8,I,M,28,166.0,66.2,720,1994/06/14 23:43,2016/05/13 17:59
9,J,F,36,153.4,,330,1997/03/16 21:04,2018/11/26 05:32


元データを交差検証用とホールドアウト検証用に分割する  
test_sizeは、ホールドアウト検証用の分割比であり、  
test_size=0.2で元データの20%を割り当てることを意味する

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final test; 'Income' is the target and the
# remaining columns are the features.
# random_state fixes the shuffle so that re-running the notebook reproduces
# the same split.
train_data, test_data, train_target, test_target = train_test_split(
    df.drop('Income', axis=1), df['Income'], test_size=0.2, random_state=0
)

分割したデータのインデックスをリセットする

In [39]:
# Renumber each split from 0 so positional row numbers line up with the index.
# drop=True discards the old index instead of keeping it as a column.
# Reassignment is used rather than inplace=True to avoid mutating in place.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
train_target = train_target.reset_index(drop=True)
test_target = test_target.reset_index(drop=True)

交差検証データの行番号リストを作成

In [40]:
# Positional row numbers 0..n-1 of the training portion, used as input to the fold splitter.
row_no_list = [*range(len(train_target))]
row_no_list

[0, 1, 2, 3, 4, 5, 6, 7]

K分割交差検証用のデータ分割の設定  
n_splitsで分割数を指定する  
shuffle=True指定でランダム分割

交差検証用のデータ分割をk回行う

In [41]:
from sklearn.model_selection import KFold

# 4-fold cross-validation with shuffling; random_state fixes the shuffle so
# the folds are reproducible across runs.
k_fold = KFold(n_splits=4, shuffle=True, random_state=0)
for train_cv_no, test_cv_no in k_fold.split(row_no_list):
    # Select this fold's training / validation rows by position.
    # Each iteration overwrites train_cv / test_cv, so only the last fold
    # remains after the loop.
    train_cv = train_data.iloc[train_cv_no, :]
    test_cv = train_data.iloc[test_cv_no, :]

In [42]:
# Training rows of the last fold produced by the K-fold loop.
train_cv

Unnamed: 0,Name,Sex,Age,Height,Weight,Register,Login
0,J,F,36,153.4,,1997/03/16 21:04,2018/11/26 05:32
2,B,M,28,158.1,46.1,2009/07/05 11:09,2018/07/08 03:30
4,A,M,57,153.1,45.7,2011/06/11 19:52,2016/06/07 19:33
5,H,F,49,151.7,42.0,2001/12/07 08:55,2017/12/02 15:15
6,E,F,43,162.7,65.8,1997/07/12 01:59,2017/07/10 22:48
7,D,M,55,162.3,62.8,1999/08/12 03:05,2017/08/12 16:07


In [43]:
# Validation rows of the last fold produced by the K-fold loop.
test_cv

Unnamed: 0,Name,Sex,Age,Height,Weight,Register,Login
1,G,M,49,166.5,72.9,2001/07/30 01:35,2016/02/05 06:32
3,I,M,28,166.0,66.2,1994/06/14 23:43,2016/05/13 17:59
