# 給与推定コンペ

## ライブラリのインポート

In [1]:
import numpy as np
import pandas as pd
import japanize_matplotlib

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import lightgbm as lgb

## データの読み込み

In [2]:
# Load the competition data: `train` carries the target column, `test` does not.
train = pd.read_csv(filepath_or_buffer='data/train_data.csv')
test = pd.read_csv(filepath_or_buffer='data/test_data.csv')

<div align="center">
    <img src="img/column_description.png" alt="各カラムの説明">
</div>

## 前処理

In [3]:
# Ordered (region label, member prefectures) pairs. The first group that
# contains the prefecture wins. Note that Tokyo and Osaka are deliberately
# grouped together under the label '首都圏'.
_AREA_GROUPS = (
    ('北海道', ('北海道',)),
    ('東北地方', ('青森県', '岩手県', '宮城県', '秋田県', '山形県', '福島県')),
    ('関東地方', ('茨城県', '栃木県', '群馬県', '埼玉県', '千葉県', '神奈川県')),
    ('首都圏', ('東京都', '大阪府')),
    ('中部地方', ('新潟県', '富山県', '石川県', '福井県', '長野県', '山梨県', '岐阜県', '静岡県', '愛知県')),
    ('近畿地方', ('三重県', '滋賀県', '京都府', '兵庫県', '奈良県', '和歌山県')),
    ('中国地方', ('鳥取県', '島根県', '岡山県', '広島県', '山口県')),
    ('四国地方', ('徳島県', '香川県', '愛媛県', '高知県')),
    ('九州地方', ('福岡県', '佐賀県', '長崎県', '熊本県', '大分県', '宮崎県', '鹿児島県')),
    ('沖縄地方', ('沖縄県',)),
)


def convert_area(area):
    """Map a prefecture name to its coarse region label.

    Args:
        area: Prefecture name such as '宮城県'.

    Returns:
        The region label (e.g. '東北地方'), or None when `area` is not one
        of the listed prefectures.
    """
    for region_label, prefectures in _AREA_GROUPS:
        if area in prefectures:
            return region_label
    # Unknown values fall through to None, as with the implicit return
    # of an exhausted if/elif chain.
    return None

In [4]:
# Replace the prefecture names in `area` with region labels in both frames.
# NOTE: this cell is not idempotent. Running it a second time feeds region
# labels back into convert_area, which returns None for most of them.
for frame in (train, test):
    frame['area'] = frame['area'].map(convert_area)

In [5]:
# Disabled experiment: bucket age into decades (e.g. 44 -> 4).
# def convert_age(age):
#     return int((age - age % 10) / 10)

In [6]:
# Disabled experiment: apply the (also disabled) convert_age bucketing to the
# `age` column of both frames.
# train['age'] = train['age'].map(convert_age)
# test['age'] = test['age'].map(convert_age)

In [7]:
# Disabled experiment: collapse `num_child` into a 0/1 flag (any child -> 1).
# train.loc[train['num_child'] > 0, 'num_child'] = 1
# test.loc[test['num_child'] > 0, 'num_child'] = 1

In [8]:
# Preview the first five rows to check the region mapping on `area`.
train.head(n=5)

Unnamed: 0,id,position,age,area,sex,partner,num_child,education,service_length,study_time,commute,overtime,salary
0,0,1,44,中部地方,2,1,1,1,24,2.0,1.6,9.2,428.074887
1,1,2,31,近畿地方,1,0,0,0,13,9.0,0.7,12.4,317.930517
2,2,2,36,中国地方,1,0,0,2,14,4.0,0.4,16.9,357.350316
3,3,0,22,首都圏,2,0,0,0,4,3.0,0.4,6.1,201.310911
4,4,0,25,九州地方,2,0,0,1,5,3.0,0.2,4.9,178.067475


In [9]:
# Label Encoding
# Turn the region labels in `area` into integer codes. The encoder is fitted
# on the training column only and the same mapping is then applied to both
# frames, so train and test share one code per region.
le = preprocessing.LabelEncoder()
le.fit(train['area'])
train['area'] = le.transform(train['area'])
test['area'] = le.transform(test['area'])

## 学習

In [10]:
# Split the frames into features and target by column NAME instead of by
# position. Positional slicing (iloc[:, 1:-1]) silently picks the wrong
# columns if the CSV layout ever changes, and the model predicts by column
# position, so a train/test mismatch would give wrong predictions without
# any error.
ID_COLUMN = 'id'
TARGET_COLUMN = 'salary'

X = train.drop(columns=[ID_COLUMN, TARGET_COLUMN])
y = train[TARGET_COLUMN]
# Select the test features through the training column list so both
# matrices have the same columns in the same order; a missing column
# raises a KeyError here.
X_test = test[X.columns]

X.shape, y.shape, X_test.shape

((21000, 11), (21000,), (9000, 11))

In [11]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
# Wrap the train/validation splits in LightGBM Dataset objects.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(data=X_valid, label=y_valid)

In [13]:
# LightGBM training parameters: gradient-boosted trees on a regression
# objective, evaluated with MAE (shown as "l1" in the training log).
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    verbose=0,
)

In [14]:
# Columns handed to LightGBM as categorical features instead of numeric ones.
categorical_features = ['area', 'sex', 'partner', 'education']

In [15]:
# Train the booster for a fixed 150 rounds, evaluating on the hold-out set.
# No early stopping is configured, so all 150 rounds are always trained.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=150,
    valid_sets=[valid_data],
    categorical_feature=categorical_features,
)

New categorical_feature is ['area', 'education', 'partner', 'sex']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's l1: 127.674
[2]	valid_0's l1: 116.617
[3]	valid_0's l1: 106.475
[4]	valid_0's l1: 97.5562
[5]	valid_0's l1: 89.574
[6]	valid_0's l1: 82.6246
[7]	valid_0's l1: 76.4327
[8]	valid_0's l1: 70.8402
[9]	valid_0's l1: 65.9393
[10]	valid_0's l1: 61.6793
[11]	valid_0's l1: 57.8982
[12]	valid_0's l1: 54.1776
[13]	valid_0's l1: 51.1724
[14]	valid_0's l1: 48.2332
[15]	valid_0's l1: 45.7678
[16]	valid_0's l1: 43.4506
[17]	valid_0's l1: 41.4503
[18]	valid_0's l1: 39.7452
[19]	valid_0's l1: 38.1965
[20]	valid_0's l1: 36.8886
[21]	valid_0's l1: 35.6968
[22]	valid_0's l1: 34.6884
[23]	valid_0's l1: 33.7735
[24]	valid_0's l1: 33.0368
[25]	valid_0's l1: 32.3944
[26]	valid_0's l1: 31.8068
[27]	valid_0's l1: 31.2875
[28]	valid_0's l1: 30.8599
[29]	valid_0's l1: 30.4729
[30]	valid_0's l1: 30.1082
[31]	valid_0's l1: 29.7914
[32]	valid_0's l1: 29.5039
[33]	valid_0's l1: 29.2553
[34]	valid_0's l1: 29.0339
[35]	valid_0's l1: 28.8475
[36]	valid_0's l1: 28.681
[37]	valid_0's l1: 28.5626
[38]	valid_0

In [16]:
# Predict the target for every row of the test feature matrix.
y_pred = model.predict(data=X_test)

## 提出ファイルの作成

In [17]:
# Write the predictions as a CSV with an `id` index column and a `y` column.
# NOTE(review): ids are the row positions 0..n-1 of y_pred; this assumes the
# test rows carry ids in that same order. Confirm against the test file.
submit_df = pd.DataFrame({'y': y_pred})
submit_df.index.name = 'id'
# Use a forward slash: '\s' in a normal string literal is an invalid escape
# sequence, and a backslash separates directories only on Windows (elsewhere
# it would become part of the file name). '/' works on every platform.
submit_df.to_csv('submission/submission_03_5.csv')

## 結果
- スコア: 25.90463