# 給与推定コンペ

## ライブラリのインポート

In [6]:
from pathlib import Path

import japanize_matplotlib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## データの読み込み

In [2]:
# Read the training and test tables from the local data/ directory.
train, test = map(pd.read_csv, ('data/train_data.csv', 'data/test_data.csv'))

<div align="center">
    <img src="img/column_description.png">
</div>

## 前処理

In [3]:
# Label-encode the `area` column into integer codes.
# `preprocessing` is imported in the notebook's top import cell so that every
# dependency is visible in one place.
# The encoder is fitted on the training frame only and reused for the test frame so
# both share one mapping; `transform` raises ValueError if the test data contains an
# `area` value that was not present during fitting.
le = preprocessing.LabelEncoder()
train['area'] = le.fit_transform(train['area'])
test['area'] = le.transform(test['area'])

## 学習

In [7]:
# Features are every column except the first and the last; the last training column
# is the target. The first column is dropped from both frames
# (presumably an id column -- confirm against the column description).
X, y = train.iloc[:, 1:-1], train.iloc[:, -1]
X_test = test.iloc[:, 1:]

# Show the shapes as the cell output to confirm train/test feature counts line up.
tuple(part.shape for part in (X, y, X_test))

((21000, 11), (21000,), (9000, 11))

In [8]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0, test_size=0.2)

In [9]:
# Wrap both splits as LightGBM Datasets.
# The validation set is tied to the training set through `reference=train_data` so it
# is built against the training set's feature binning, which is how the LightGBM
# documentation says validation data should be constructed.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

In [10]:
# LightGBM training parameters: gradient-boosted trees fitted with the `regression`
# objective and evaluated with mean absolute error (reported as `l1` in the log).
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    verbose=0,
)

In [11]:
# Columns that are handed to LightGBM as categorical features.
categorical_features = ['area', 'sex', 'partner', 'education']

In [12]:
# Fit the booster for 100 rounds; the validation set is scored after each round
# and the listed columns are treated as categorical.
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[valid_data],
    categorical_feature=categorical_features,
    num_boost_round=100,
)

New categorical_feature is ['area', 'education', 'partner', 'sex']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's l1: 127.765
[2]	valid_0's l1: 116.625
[3]	valid_0's l1: 106.467
[4]	valid_0's l1: 97.4349
[5]	valid_0's l1: 89.5079
[6]	valid_0's l1: 82.472
[7]	valid_0's l1: 76.3028
[8]	valid_0's l1: 70.7226
[9]	valid_0's l1: 65.808
[10]	valid_0's l1: 61.3115
[11]	valid_0's l1: 57.3379
[12]	valid_0's l1: 53.552
[13]	valid_0's l1: 50.2398
[14]	valid_0's l1: 47.2402
[15]	valid_0's l1: 44.708
[16]	valid_0's l1: 42.4023
[17]	valid_0's l1: 40.3243
[18]	valid_0's l1: 38.5203
[19]	valid_0's l1: 36.8859
[20]	valid_0's l1: 35.4384
[21]	valid_0's l1: 34.1748
[22]	valid_0's l1: 33.1067
[23]	valid_0's l1: 32.175
[24]	valid_0's l1: 31.393
[25]	valid_0's l1: 30.6549
[26]	valid_0's l1: 30.0202
[27]	valid_0's l1: 29.4472
[28]	valid_0's l1: 28.9712
[29]	valid_0's l1: 28.5521
[30]	valid_0's l1: 28.1131
[31]	valid_0's l1: 27.7228
[32]	valid_0's l1: 27.3083
[33]	valid_0's l1: 27.0185
[34]	valid_0's l1: 26.7961
[35]	valid_0's l1: 26.5672
[36]	valid_0's l1: 26.3757
[37]	valid_0's l1: 26.2463
[38]	valid_0's l

In [13]:
# Predict the target for every row of the test features with the trained booster.
y_pred = model.predict(data=X_test)

## 提出ファイルの作成

In [14]:
# Build the output path with pathlib so it is portable: a backslash separator inside
# a plain string literal is an invalid escape sequence and only names a directory on
# Windows.
submission_path = Path('submission') / 'submission_lgb.csv'
# `to_csv` does not create missing directories, so make sure the target exists.
submission_path.parent.mkdir(parents=True, exist_ok=True)

# Submission layout: the row index is written as the `id` column, predictions as `y`.
submit_df = pd.DataFrame({'y': y_pred})
submit_df.index.name = 'id'
submit_df.to_csv(submission_path)

## 結果
- 25.90463