In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Show up to 100 columns when displaying a DataFrame.
# The fully-qualified option name is required: the bare 'max_columns' pattern is
# ambiguous in newer pandas (it also matches 'styler.render.max_columns') and
# raises OptionError there.
pd.set_option('display.max_columns', 100)
%matplotlib inline

## 関数の定義

In [2]:
# Standardize a series that may contain NaN
def standardize_nan(x):
    """Return `x` shifted by its NaN-ignoring mean and scaled by its NaN-ignoring std.

    The mean and standard deviation are computed with np.nanmean / np.nanstd,
    so NaN entries do not affect the statistics; they stay NaN in the result.
    """
    centered = x - np.nanmean(x)
    return centered / np.nanstd(x)

In [3]:
# Build dummy (one-hot) columns from `source` and append them to `obj`.
#
# obj : pd.DataFrame (or pd.Series) to which the dummy columns are appended
# source : pd.Series, the named series to turn into dummy variables
# drop_unknown : whether to leave out the "<name>_unknown" dummy column
def add_dummies(obj, source, drop_unknown=True):
    """Return a new object made of `obj` plus one-hot columns built from `source`.

    The dummy columns are prefixed with `source.name`. When `drop_unknown` is
    truthy, the column "<source.name>_unknown" is omitted if it exists.
    Neither `obj` nor `source` is modified.
    """
    col_name = source.name
    dummies = pd.get_dummies(source, prefix=col_name)
    name_unknown = col_name + '_unknown'
    if drop_unknown:
        # errors='ignore' covers sources that have no "unknown" level at all
        dummies = dummies.drop(columns=[name_unknown], errors='ignore')
    # pd.concat builds a new object, so no defensive copy of `obj` is needed
    return pd.concat([obj, dummies], axis=1, sort=False)

In [4]:
# Encode a yes/no series (source) as 1/0 and append it to obj
def add_yn_map(obj, source):
    """Return `obj` with `source` appended as a column, 'yes' -> 1 and 'no' -> 0.

    Values other than 'yes'/'no' are not in the mapping and become NaN.
    """
    encoding = {'yes':1, 'no':0}
    encoded = source.map(encoding)
    return pd.concat([obj, encoded], axis=1, sort=False)

## データ読み込み

In [5]:
# Raw data, kept untouched for reference
df_train_raw = pd.read_csv('../input/train.csv')
df_test_raw = pd.read_csv('../input/test.csv')
# Working copies that the following cells modify
df_train, df_test = df_train_raw.copy(), df_test_raw.copy()
# Remember the number of training rows
train_len = df_train.shape[0]

## 外れ値削除

In [6]:
# Collect the rows that exceed any of the outlier thresholds
# (a row exceeding several thresholds appears once per threshold)
outliers = pd.concat(
    [
        df_train[df_train['balance'] > 80000],
        df_train[df_train['duration'] > 4000],
        df_train[df_train['previous'] > 100],
    ],
    axis=0,
)

# Remove the rows listed in outliers from df_train, by index label
df_train = df_train.drop(index=outliers.index)
# Remember the number of training rows after the removal
train_len = df_train.shape[0]

# NOTE(review): the file name is spelled 'trian'; kept as-is because other code
# may read this exact path.
df_train.to_csv('./trian_without_outliers.csv', index=False)

## データ整形

In [7]:
# Stack df_train and df_test so that both receive identical preprocessing.
# df_train comes first, so its rows occupy the leading positions of df.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True, sort=False) # the index is renumbered (ignore_index=True)
#df_new = pd.DataFrame(df['y']).copy()

### 連続値のデータ整形

- age : 年齢

In [8]:
# Natural log of age, then the standardized version of that log
df['age_log'] = np.log(df.age)
df['age_log_std'] = standardize_nan(df['age_log'])

- balance : 年間平均残高

In [9]:
# Split balance into a positive-only and a negative-only series
# (entries that do not qualify become NaN; a balance of exactly 0 is NaN in both)
df['balance_p_nan'] = df['balance'].where(df['balance'] > 0)
df['balance_m_nan'] = df['balance'].where(df['balance'] < 0)
# Take the log of each part (the negative part is sign-flipped first)
df['balance_p_log_nan'] = np.log(df['balance_p_nan'])
df['balance_m_log_nan'] = np.log(-df['balance_m_nan'])
# Standardize while the NaNs are still in place
df['balance_p_log_std_nan'] = standardize_nan(df['balance_p_log_nan'])
df['balance_m_log_std_nan'] = standardize_nan(df['balance_m_log_nan'])

# Sign of balance encoded as -1, 0 or +1
df['balance_sign'] = np.sign(df['balance'])

- day : 最終接触日  
外す

- month : 最終接触月

In [10]:
# Convert month abbreviations to month numbers (1-12)
MONTH_TO_NUM = {'jan':1,
                'feb':2,
                'mar':3,
                'apr':4,
                'may':5,
                'jun':6,
                'jul':7,
                'aug':8,
                'sep':9,
                'oct':10,
                'nov':11,
                'dec':12
               }
# Values that are not keys of the mapping (for example the integers left behind by
# an earlier run of this cell) pass through unchanged. This makes the cell safe to
# re-run: a plain .map(dict) would turn the already-converted integers into NaN and
# the following astype(int) would fail.
df['month'] = df['month'].map(lambda m: MONTH_TO_NUM.get(m, m)).astype(int)

In [11]:
# Number each contact day so that Jan 1 -> 1 and Dec 31 -> 365 (month lengths of a non-leap year)

# Days elapsed before the first of each month: Jan: 0, Feb: 31, Mar: (31+28), Apr: (31+28+31), ...
day_sum = pd.Series(np.cumsum([0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30]),index=np.arange(1,13))
df['date'] = (df['month'].map(day_sum) + df['day']).astype(int)

In [12]:
# Remainder of the day-of-year modulo 7, used as a weekday-like bucket.
# NOTE(review): this lines up with real weekdays only if every row falls in the same year — confirm.
df['week'] = df['date'] % 7

- duration : 最終接触時間（秒）

In [13]:
# Take log(1 + duration), then standardize it
df['duration_log'] = np.log1p(df['duration'])
df['duration_log_std'] = standardize_nan(df['duration_log'])
# Reciprocal of (duration + 1); the +1 keeps the denominator non-zero when duration is 0
df['duration_inv'] = 1/(df['duration'] + 1)

- campaign : 現キャンペーンにおける接触回数

In [14]:
# Natural log of the number of contacts in the current campaign.
# NOTE(review): yields -inf where campaign is 0 — presumably campaign is always >= 1; confirm.
df['campaign_log'] = np.log(df['campaign'])

- pdays : 経過日数：前キャンペーン接触後の日数

In [33]:
# Bucket pdays into a coarse code. After both steps:
#   pdays < 0         -> the original pdays value is kept (negative)
#   0 <= pdays < 100  -> 0
#   pdays >= 100      -> 1
# This must run before pdays == -1 is overwritten with 1000 further down; once that
# has happened the "pdays < 0" branch can no longer match.
# Step 1: keep pdays where it is below 100, otherwise write 1.
df['pdays_2'] = df['pdays'].where((df.pdays<100), 1)
# Step 2: keep the step-1 value where pdays < 0 or pdays >= 100, otherwise write 0.
df['pdays_2'] = df['pdays_2'].where((df.pdays<0) | (df.pdays>=100), 0)

In [36]:
# pdays == -1 marks customers who were never contacted before.
# A large value is a more natural encoding for them, so they get 1000
# (chosen by the original author as larger than the maximum real value).
df['pdays'] = df['pdays'].mask(df['pdays'] == -1, 1000)

# Temporarily blank out the 1000 sentinel, then take the log and standardize
df['pdays_nan'] = df['pdays'].mask(df['pdays'] == 1000)
df['pdays_log_nan'] = np.log(df['pdays_nan'])
df['pdays_log_std_nan'] = standardize_nan(df['pdays_log_nan'])

- previous : 接触実績：現キャンペーン以前までに顧客に接触した回数

In [37]:
# Treat previous == 0 as missing, then take the log of what remains
df['previous_nan'] = df['previous'].mask(df['previous'] == 0)
df['previous_log_nan'] = np.log(df['previous_nan'])

* その他、新しい量

In [38]:
# Total number of contacts: those before this campaign (previous) plus those during it (campaign)
df['total_touch'] = df['previous'] + df['campaign']

In [39]:
# Natural log of the total contact count
df['total_touch_log'] = np.log(df['total_touch'])

In [40]:
# A rough contact-frequency measure: total contacts divided by pdays.
# NOTE(review): in file order pdays == -1 has already been replaced by 1000 here, so
# never-contacted customers are divided by 1000 — confirm this is the intended meaning.
df['freq'] = df['total_touch'] / df['pdays']

In [41]:
# Day number of the previous contact: date minus pdays_nan
# (still contains NaN at this stage, wherever pdays_nan is NaN)
df['previous_date_nan'] = df['date'] - df['pdays_nan']

In [42]:
# Encode education as a number: primary=1, secondary=2, tertiary=3;
# 'unknown' is given 2 (the same value as secondary)
df['education_num'] = df['education'].map({'primary':1, 'secondary':2, 'tertiary':3, 'unknown':2})
# Disabled alternative: assign 4 to rows whose job is 'student'
#df['education_num'] = df['education_num'].where(df['job']!='student', 4)

In [43]:
# Save the feature table while the engineered *_nan columns still hold NaN
df.to_csv('./df_with_nan.csv', index=False)

### 適当にnanを埋める

In [44]:
# Fill the NaNs of each *_nan column with a fixed out-of-range value and store the
# result under the same name without the _nan suffix
df['balance_p_log_std'] = df['balance_p_log_std_nan'].fillna(-10)# far on the negative side
df['balance_m_log_std'] = df['balance_m_log_std_nan'].fillna(-10)# far on the negative side
df['pdays_log_std'] = df['pdays_log_std_nan'].fillna(5)# far on the positive side
df['previous_log'] = df['previous_log_nan'].fillna(-1)# on the negative side
df['previous_date'] = df['previous_date_nan'].fillna(-1000)# far on the negative side

In [45]:
# Save the table after the NaN-filled columns have been added.
# NOTE(review): the original *_nan columns are still part of df, so this file also
# contains them — confirm that is acceptable given the file name.
df.to_csv('./df_without_nan.csv', index=False)

### データ選択

In [47]:
# Select the numeric features to use and gather them in df_num (as an independent copy)
df_num = df[['month', # month of last contact (1-12)
             'date', # day number of last contact (1-365)
             'week', # date modulo 7
             'age_log_std', # log of age, standardized
             'duration_log_std', # log1p of duration, standardized
             'duration_inv', # reciprocal of (duration + 1)
             'campaign_log', 
             'balance_p_log_std',
             'balance_m_log_std',
             'balance_sign', 
             'pdays',
             'pdays_2',
             'pdays_log_std',
             'previous_date', # day number of the previous contact
             'previous_log',
             'total_touch', # contacts before plus during this campaign
             'freq', # rough contact frequency (total_touch / pdays)
             'education_num',
             ]].copy()

## 離散値

- job - 職種
- marital : 未婚/既婚
- education : 教育水準
- default : 債務不履行があるか yes/no
- housing : 住宅ローン yes/no
- loan : 個人ローン yes/no
- poutcome : 前回のキャンペーンの成果
- contact : 連絡方法

In [48]:
# Accumulate the encoded categorical features in df_obj, starting from the y column
df_obj = df['y']
# One-hot encode these columns; their "unknown" level is dropped
for col_name in ['job', 'marital', 'education', 'month', 'poutcome']:
    df_obj = add_dummies(df_obj, df[col_name])
# For contact the "unknown" level is kept (it showed a significant difference)
df_obj = add_dummies(df_obj, df['contact'], drop_unknown=False)
# Map the yes/no columns to 1/0
for col_name in ['default', 'housing', 'loan']:
    df_obj = add_yn_map(df_obj, df[col_name])

In [49]:
# Remove the target column so that df_obj holds only the encoded categorical features.
# Reassigning (instead of inplace=True) together with errors='ignore' makes this cell
# safe to re-run: a second execution no longer raises KeyError for the missing 'y'.
df_obj = df_obj.drop(columns=['y'], errors='ignore')

## csvに出力

In [50]:
ser_y = pd.Series(df['y'])  # the target series
# Join the numeric features, the encoded categorical features and y column-wise
df_new = pd.concat([df_num, df_obj, ser_y], axis=1, sort=False)
# The first train_len rows are written as the training file, the rest as the test file
df_new.iloc[:train_len].to_csv('df_train_for_fit.csv', index=False)
df_new.iloc[train_len:].to_csv('df_test_for_fit.csv', index=False)

In [51]:
# Write the same two row ranges of df_new again under different file names.
# NOTE(review): df_new still contains the target column 'y', so X.csv is not
# features-only — confirm this is intended.
df_new[:train_len].to_csv('X.csv', index=False)
df_new[train_len:].to_csv('test_X.csv', index=False)

In [52]:
# Display the final column list of df_new
df_new.columns

Index(['month', 'date', 'week', 'age_log_std', 'duration_log_std',
       'duration_inv', 'campaign_log', 'balance_p_log_std',
       'balance_m_log_std', 'balance_sign', 'pdays', 'pdays_2',
       'pdays_log_std', 'previous_date', 'previous_log', 'total_touch', 'freq',
       'education_num', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'contact_cellular', 'contact_telephone', 'contact_unknown', 'default',
       'housing', 'loan', 'y'],
      dtype='object')