### 3-2．データ準備
#### データの読み込み

In [1]:
# Pandasの読み込み
import pandas as pd

# Load bank.csv into a DataFrame (comma is read_csv's default delimiter)
bank_df = pd.read_csv('bank.csv')

# Show the first five rows
bank_df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,36,technician,single,secondary,no,265,yes,yes,,5,may,348,1,-1,0,,no
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,may,365,1,-1,0,,no
3,53,technician,married,secondary,no,-3,no,no,,5,may,1666,1,-1,0,,no
4,24,technician,single,secondary,no,-103,yes,yes,,5,may,145,1,-1,0,,no


In [2]:
# Print the (rows, columns) shape, then the dtype of every column
print(bank_df.shape, bank_df.dtypes, sep='\n')

(7234, 17)
age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object


#### 欠損値の除外

In [3]:
# Drop every row that has a missing value in job or education
required_columns = ['job', 'education']
bank_df = bank_df.dropna(axis='index', subset=required_columns)

# Check the remaining number of rows and columns
print(bank_df.shape)

(6935, 17)


In [4]:
# Exercise 8
# Keep only the columns that have at least 2400 non-missing values.
# NOTE(review): the original comment read "drop columns with 2400 or more
# missing values", but `thresh` counts NON-missing values, so that is not
# what this call does. The behaviour is left unchanged here; confirm which
# rule is actually intended.
min_non_missing = 2400
bank_df = bank_df.dropna(axis='columns', thresh=min_non_missing)

# Check the remaining number of rows and columns
print(bank_df.shape)

(6935, 16)


#### 欠損値の補完

In [5]:
# Fill missing values in the contact column with the label 'unknown'
fill_values = {'contact': 'unknown'}
bank_df = bank_df.fillna(value=fill_values)

# Show the first five rows
bank_df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,no
1,36,technician,single,secondary,no,265,yes,yes,unknown,5,may,348,1,-1,0,no
2,25,blue-collar,married,secondary,no,-7,yes,no,unknown,5,may,365,1,-1,0,no
3,53,technician,married,secondary,no,-3,no,no,unknown,5,may,1666,1,-1,0,no
4,24,technician,single,secondary,no,-103,yes,yes,unknown,5,may,145,1,-1,0,no


#### 外れ値（異常値）の除外

In [6]:
# Keep only the rows whose age is at least 18 and below 100
bank_df = bank_df[(bank_df['age'] >= 18) & (bank_df['age'] < 100)]

# Check the remaining number of rows and columns
print(bank_df.shape)

(6933, 16)


#### 文字列から数値へ変換

In [7]:
# Encode 'yes' as 1 and then 'no' as 0. replace() is called on the whole
# DataFrame, so any cell equal to one of these strings is converted,
# whichever column it is in.
bank_df = bank_df.replace('yes', 1).replace('no', 0)

# Show the first five rows
bank_df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,y
0,58,management,married,tertiary,0,2143,1,0,unknown,5,may,261,1,-1,0,0
1,36,technician,single,secondary,0,265,1,1,unknown,5,may,348,1,-1,0,0
2,25,blue-collar,married,secondary,0,-7,1,0,unknown,5,may,365,1,-1,0,0
3,53,technician,married,secondary,0,-3,0,0,unknown,5,may,1666,1,-1,0,0
4,24,technician,single,secondary,0,-103,1,1,unknown,5,may,145,1,-1,0,0


In [8]:
# One-hot encode job into dummy variables.
# dtype is pinned to uint8 (the default before pandas 2.0) so the dummy
# columns hold 0/1 on every pandas version; since pandas 2.0 the default
# is bool, which would display and export as True/False instead.
bank_df_job = pd.get_dummies(bank_df['job'], dtype='uint8')

# Show the first five rows
bank_df_job.head()

Unnamed: 0,admin.,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed
0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,1,0


In [9]:
# Exercise 9
# One-hot encode marital, education, contact and month into dummy variables.
# dtype is pinned to uint8 (the default before pandas 2.0) so the dummy
# columns hold 0/1 on every pandas version; since pandas 2.0 the default
# is bool, which would display and export as True/False instead.
bank_df_marital = pd.get_dummies(bank_df['marital'], dtype='uint8')
bank_df_education = pd.get_dummies(bank_df['education'], dtype='uint8')
bank_df_contact = pd.get_dummies(bank_df['contact'], dtype='uint8')
bank_df_month = pd.get_dummies(bank_df['month'], dtype='uint8')

# Show the first five rows of the month dummies
bank_df_month.head()

Unnamed: 0,apr,aug,dec,feb,jan,jul,jun,mar,may,nov,oct,sep
0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0


In [10]:
# Pick the columns that are used as numeric values (including the yes/no
# columns encoded as 1/0 and the y column) from the cleaned data set
numeric_columns = [
    'age', 'default', 'balance', 'housing', 'loan',
    'day', 'duration', 'campaign', 'pdays', 'previous', 'y',
]
tmp1 = bank_df[numeric_columns]

# Show the first five rows
tmp1.head(5)

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,y
0,58,0,2143,1,0,5,261,1,-1,0,0
1,36,0,265,1,1,5,348,1,-1,0,0
2,25,0,-7,1,0,5,365,1,-1,0,0
3,53,0,-3,0,0,5,1666,1,-1,0,0
4,24,0,-103,1,1,5,145,1,-1,0,0


In [11]:
# Concatenate the numeric columns and every block of dummy variables side
# by side (axis=1); rows are aligned on the DataFrame index.
# The job dummies are included here: bank_df_job was built earlier, and
# rows with a missing job were dropped for it, but it was never joined
# into the final data set.
# NOTE(review): this adds the job columns to bank-prep.csv — confirm that
# leaving job out was not intentional.
bank_df_new = pd.concat(
    [
        tmp1,
        bank_df_job,
        bank_df_marital,
        bank_df_education,
        bank_df_contact,
        bank_df_month,
    ],
    axis=1,
)

# Show the first five rows
bank_df_new.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,dec,feb,jan,jul,jun,mar,may,nov,oct,sep
0,58,0,2143,1,0,5,261,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
1,36,0,265,1,1,5,348,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
2,25,0,-7,1,0,5,365,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
3,53,0,-3,0,0,5,1666,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
4,24,0,-103,1,1,5,145,1,-1,0,...,0,0,0,0,0,0,1,0,0,0


In [12]:
# Write the preprocessed data set to bank-prep.csv, leaving out the index
bank_df_new.to_csv(path_or_buf='bank-prep.csv', index=False)