# 給与推定コンペ

## ライブラリのインポート

In [1]:
import numpy as np
import pandas as pd
import japanize_matplotlib

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import lightgbm as lgb

import warnings
warnings.simplefilter('ignore')

## データの読み込み

In [2]:
# Load the training and test tables from the local data directory.
train, test = map(pd.read_csv, ('data/train_data.csv', 'data/test_data.csv'))

<div align="center">
    <img src="img/column_description.png">
</div>

## 前処理

In [3]:
def convert_area(area):
    """Collapse a prefecture name into a coarse region label.

    Parameters
    ----------
    area : object
        Prefecture name, e.g. '青森県'.

    Returns
    -------
    str or None
        The region label of the first group containing ``area``. Tokyo and
        Osaka share the dedicated label '首都圏'. Any value that appears in
        no group yields ``None``.
    """
    # Ordered (label, members) pairs; groups are checked top to bottom.
    region_groups = (
        ('北海道', ('北海道',)),
        ('東北地方', ('青森県', '岩手県', '宮城県', '秋田県', '山形県', '福島県')),
        ('関東地方', ('茨城県', '栃木県', '群馬県', '埼玉県', '千葉県', '神奈川県')),
        ('首都圏', ('東京都', '大阪府')),
        ('中部地方', ('新潟県', '富山県', '石川県', '福井県', '長野県', '山梨県', '岐阜県', '静岡県', '愛知県')),
        ('近畿地方', ('三重県', '滋賀県', '京都府', '兵庫県', '奈良県', '和歌山県')),
        ('中国地方', ('鳥取県', '島根県', '岡山県', '広島県', '山口県')),
        ('四国地方', ('徳島県', '香川県', '愛媛県', '高知県')),
        ('九州地方', ('福岡県', '佐賀県', '長崎県', '熊本県', '大分県', '宮崎県', '鹿児島県')),
        ('沖縄地方', ('沖縄県',)),
    )
    for region, prefectures in region_groups:
        if area in prefectures:
            return region
    # No group matched: callers receive None (NaN after Series.map).
    return None

In [4]:
# Replace each prefecture in `area` with its region label, in both tables.
for frame in (train, test):
    frame['area'] = frame['area'].map(convert_area)

In [None]:
# def convert_age(age):
#     return int((age - age % 10) / 10)

In [None]:
# train['age'] = train['age'].map(convert_age)
# test['age'] = test['age'].map(convert_age)

In [None]:
# train.loc[train['num_child'] > 0, 'num_child'] = 1
# test.loc[test['num_child'] > 0, 'num_child'] = 1

In [None]:
# Display the first rows of `train` to inspect the preprocessing applied so far.
train.head()

In [None]:
# Label Encoding

# le = preprocessing.LabelEncoder()
# train['area'] = le.fit_transform(train['area'])
# test['area'] = le.transform(test['area'])

In [5]:
# Target Encoding
#
# Each categorical column is replaced by the mean target of its category.
# Test rows use means over the whole training set; training rows use
# out-of-fold means so that a row's own target never feeds its encoding.

cat_cols = ['area', 'sex', 'partner', 'education']

# The target is taken positionally: the last column of `train`.
target = train.iloc[:, -1]

# Fixed seed, so every column is encoded with the same 5 shuffled splits.
fold = KFold(n_splits=5, shuffle=True, random_state=0)

for c in cat_cols:
    pairs = pd.DataFrame({c: train[c], 'target': target})

    # Test set: per-category mean computed from all training rows.
    test[c] = test[c].map(pairs.groupby(c)['target'].mean())

    # Training set: start from NaN and fill each held-out fold with means
    # computed on the remaining folds. Categories absent from the fitting
    # folds stay NaN.
    oof_encoded = np.full(train.shape[0], np.nan)
    for fit_idx, apply_idx in fold.split(train):
        fold_mean = pairs.iloc[fit_idx].groupby(c)['target'].mean()
        oof_encoded[apply_idx] = train[c].iloc[apply_idx].map(fold_mean)

    train[c] = oof_encoded

In [6]:
# Display the first rows of `train` after the categorical columns were target-encoded.
train.head()

Unnamed: 0,id,position,age,area,sex,partner,num_child,education,service_length,study_time,commute,overtime,salary
0,0,1,44,362.144259,355.195699,386.400087,2,324.753092,24,2.0,1.6,9.2,428.074887
1,1,2,31,361.168994,366.656122,335.733034,0,310.001389,13,9.0,0.7,12.4,317.930517
2,2,2,36,357.100714,366.356815,335.353884,0,386.12622,14,4.0,0.4,16.9,357.350316
3,3,0,22,405.437042,355.195699,335.353884,0,310.003217,4,3.0,0.4,6.1,201.310911
4,4,0,25,358.22484,355.181718,335.733034,0,325.070476,5,3.0,0.2,4.9,178.067475


## 学習

In [7]:
# Positional split of the tables: the first column is skipped in both,
# and the last column of `train` is the prediction target.
y = train.iloc[:, -1]
X = train.iloc[:, 1:-1]
X_test = test.iloc[:, 1:]

# Sanity check: X and X_test must end up with the same number of columns.
tuple(part.shape for part in (X, y, X_test))

((21000, 11), (21000,), (9000, 11))

In [8]:
# Hold out 20% of the training rows for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0, test_size=0.2)

In [9]:
# Wrap the feature/label pairs in LightGBM Dataset objects.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(data=X_valid, label=y_valid)

In [10]:
# LightGBM settings: gradient-boosted trees, regression objective,
# mean absolute error as the evaluation metric.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    verbose=0,
)

In [None]:
# categorical_features = [
#     'area', 'sex', 'partner', 'education'
# ]

In [11]:
# Train for a fixed 150 boosting rounds, reporting the metric on the
# hold-out set each round. `categorical_feature` is left unset here; the
# categorical columns are already numeric after target encoding.
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[valid_data],
    num_boost_round=150,
)

[1]	valid_0's l1: 127.709
[2]	valid_0's l1: 116.556
[3]	valid_0's l1: 106.437
[4]	valid_0's l1: 97.4416
[5]	valid_0's l1: 89.7031
[6]	valid_0's l1: 82.6555
[7]	valid_0's l1: 76.3411
[8]	valid_0's l1: 70.6503
[9]	valid_0's l1: 65.6553
[10]	valid_0's l1: 60.8874
[11]	valid_0's l1: 56.694
[12]	valid_0's l1: 52.9874
[13]	valid_0's l1: 49.6382
[14]	valid_0's l1: 46.7259
[15]	valid_0's l1: 44.0856
[16]	valid_0's l1: 41.8093
[17]	valid_0's l1: 39.71
[18]	valid_0's l1: 37.8466
[19]	valid_0's l1: 36.2453
[20]	valid_0's l1: 34.8454
[21]	valid_0's l1: 33.6991
[22]	valid_0's l1: 32.6974
[23]	valid_0's l1: 31.729
[24]	valid_0's l1: 30.9183
[25]	valid_0's l1: 30.2215
[26]	valid_0's l1: 29.6007
[27]	valid_0's l1: 29.0995
[28]	valid_0's l1: 28.6081
[29]	valid_0's l1: 28.2115
[30]	valid_0's l1: 27.8063
[31]	valid_0's l1: 27.3738
[32]	valid_0's l1: 27.0556
[33]	valid_0's l1: 26.7482
[34]	valid_0's l1: 26.4185
[35]	valid_0's l1: 26.1922
[36]	valid_0's l1: 25.9801
[37]	valid_0's l1: 25.792
[38]	valid_0's 

In [12]:
# Predict the target for every row of the test feature matrix.
y_pred = model.predict(X_test)

## 提出ファイルの作成

In [13]:
# Build the submission table: a single prediction column `y`, with the
# positional row index written out under the header `id`.
# NOTE(review): this assumes the expected ids equal the row positions of the
# test file — confirm against the competition's submission format.
submit_df = pd.DataFrame({'y': y_pred})
submit_df.index.name = 'id'

# Use a forward slash as the separator. The previous 'submission\s...' literal
# relied on the invalid escape sequence '\s' (a warning today, an error in a
# future Python) and only resolved to a sub-directory on Windows; elsewhere it
# created a file literally named 'submission\submission_04_3.csv'.
# The 'submission' directory must already exist.
submit_df.to_csv('submission/submission_04_3.csv')

## 結果
- 25.90463