In [215]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss

import warnings
warnings.filterwarnings('ignore')

In [245]:
path = "../input/"

train = pd.read_csv(path + 'train.csv')
train_x = train.drop(['quality'], axis=1)
train_y = train['quality']
test_x = pd.read_csv(path + 'test.csv')

In [208]:
train_x['hydrogen ion concentration'] = 10 ** -train_x['pH']
test_x['hydrogen ion concentration'] = 10 ** -test_x['pH']

In [234]:
train_x['molecule sulfur dioxide'] = train_x['free sulfur dioxide'] / (1 + 10 ** (train_x['pH'] - 1.81))
test_x['molecule sulfur dioxide'] = test_x['free sulfur dioxide'] / (1 + 10 ** (test_x['pH'] - 1.81))

In [235]:
label_encoding = lambda x: 0 if x < 0.6 else 1 if x > 2 else 2

train_x['molecule sulfur dioxide'] = train_x['molecule sulfur dioxide'].map(label_encoding)
test_x['molecule sulfur dioxide'] = test_x['molecule sulfur dioxide'].map(label_encoding)

In [246]:
train_x["fixed acidity percentage"] = train_x["fixed acidity"] + train_x["citric acid"]
test_x["fixed acidity percentage"] = test_x["fixed acidity"] + test_x["citric acid"]

In [247]:
# 学習データを学習データとバリデーションデータに分ける
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

In [248]:
# -----------------------------------
# lightgbmの実装
# -----------------------------------
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [249]:
# 特徴量と目的変数をlightgbmのデータ構造に変換する
lgb_train = lgb.Dataset(tr_x, tr_y)
lgb_eval = lgb.Dataset(va_x, va_y)

In [250]:
# ハイパーパラメータの設定
params = {'objective': 'regression', 'seed': 71, 'verbose': 0, 'metrics': 'mean_squared_error'}
num_round = 100

In [251]:
# 学習の実行
# カテゴリ変数をパラメータで指定している
# バリデーションデータもモデルに渡し、学習の進行とともにスコアがどう変わるかモニタリングする
model = lgb.train(params, lgb_train, num_boost_round=num_round,
                  valid_names=['train', 'valid'], valid_sets=[lgb_train, lgb_eval])

[1]	train's l2: 0.87641	valid's l2: 1.00553
[2]	train's l2: 0.815832	valid's l2: 0.966936
[3]	train's l2: 0.763038	valid's l2: 0.934815
[4]	train's l2: 0.716062	valid's l2: 0.904466
[5]	train's l2: 0.675482	valid's l2: 0.880013
[6]	train's l2: 0.639899	valid's l2: 0.861665
[7]	train's l2: 0.609341	valid's l2: 0.851518
[8]	train's l2: 0.579725	valid's l2: 0.841708
[9]	train's l2: 0.553545	valid's l2: 0.830211
[10]	train's l2: 0.529833	valid's l2: 0.829726
[11]	train's l2: 0.509649	valid's l2: 0.826277
[12]	train's l2: 0.48989	valid's l2: 0.825687
[13]	train's l2: 0.473523	valid's l2: 0.826513
[14]	train's l2: 0.457422	valid's l2: 0.825601
[15]	train's l2: 0.441599	valid's l2: 0.830867
[16]	train's l2: 0.427	valid's l2: 0.834944
[17]	train's l2: 0.414333	valid's l2: 0.838372
[18]	train's l2: 0.401018	valid's l2: 0.838215
[19]	train's l2: 0.388035	valid's l2: 0.833579
[20]	train's l2: 0.376145	valid's l2: 0.830016
[21]	train's l2: 0.36427	valid's l2: 0.824608
[22]	train's l2: 0.352857	val

In [252]:
# バリデーションデータでのスコアの確認
va_pred = model.predict(va_x)
score = np.sqrt(mean_squared_error(va_y, va_pred))
score

0.9488042621608949

In [243]:
# 予測
pred = model.predict(test_x)

In [244]:
va_pred

array([6.68551365, 4.98307402, 5.23989457, 6.21593111, 5.22706741,
       6.38446998, 5.43497012, 4.95554896, 4.89887902, 4.9388609 ,
       6.87203432, 5.57579633, 5.83054339, 4.82024988, 6.79124029,
       5.46850744, 4.70164606, 5.66155662, 5.55648158, 5.44109867,
       5.34250347, 6.07418581, 5.19095503, 5.95652169, 5.50054965,
       5.50022217, 5.51831017, 5.63629545, 4.84286756, 5.20581941,
       7.06548952, 7.06399437, 4.82474802, 7.36599072, 6.83167289,
       4.84902092, 5.5729125 , 6.46520759, 5.66103889, 5.30784705,
       7.09528805, 5.59294915, 5.03852727, 5.11806071, 5.14031679,
       5.07307029, 5.54544125, 4.79171112, 6.53160328, 5.97399105,
       5.27590232, 5.74692399, 5.45972296, 5.28925057, 5.74256702,
       4.90798914, 5.40589482, 5.60876892, 5.38905596, 6.24686151,
       4.94931711, 5.46877746, 6.47108097, 6.90236522, 4.99555939,
       5.06803815, 5.00220798, 4.86440318, 6.08796309, 5.37209037,
       5.9200869 , 5.42879978, 5.60593608, 4.97636701, 6.48083

In [86]:
va_pred.dtype

dtype('float64')

In [65]:
 # 読み込むデータが格納されたディレクトリのパス，必要に応じて変更の必要あり
path = "../submission/"

submission = pd.read_csv(path + 'submission.csv')
submission

Unnamed: 0,quality
0,5.0
1,5.0
2,5.0
3,5.0
4,5.0
5,5.0
6,5.0
7,5.0
8,5.0
9,5.0


In [66]:
submission['quality'] = pred
submission

Unnamed: 0,quality
0,5.318777
1,4.853884
2,5.784591
3,6.568259
4,6.372332
5,5.398701
6,5.329768
7,5.396297
8,6.037020
9,5.336273


In [68]:
submission.to_csv(path + 'production_submission.csv', index=False)

In [253]:
features = [f'feat_{i}' for i in range(1, 94)]

In [254]:
features

['feat_1',
 'feat_2',
 'feat_3',
 'feat_4',
 'feat_5',
 'feat_6',
 'feat_7',
 'feat_8',
 'feat_9',
 'feat_10',
 'feat_11',
 'feat_12',
 'feat_13',
 'feat_14',
 'feat_15',
 'feat_16',
 'feat_17',
 'feat_18',
 'feat_19',
 'feat_20',
 'feat_21',
 'feat_22',
 'feat_23',
 'feat_24',
 'feat_25',
 'feat_26',
 'feat_27',
 'feat_28',
 'feat_29',
 'feat_30',
 'feat_31',
 'feat_32',
 'feat_33',
 'feat_34',
 'feat_35',
 'feat_36',
 'feat_37',
 'feat_38',
 'feat_39',
 'feat_40',
 'feat_41',
 'feat_42',
 'feat_43',
 'feat_44',
 'feat_45',
 'feat_46',
 'feat_47',
 'feat_48',
 'feat_49',
 'feat_50',
 'feat_51',
 'feat_52',
 'feat_53',
 'feat_54',
 'feat_55',
 'feat_56',
 'feat_57',
 'feat_58',
 'feat_59',
 'feat_60',
 'feat_61',
 'feat_62',
 'feat_63',
 'feat_64',
 'feat_65',
 'feat_66',
 'feat_67',
 'feat_68',
 'feat_69',
 'feat_70',
 'feat_71',
 'feat_72',
 'feat_73',
 'feat_74',
 'feat_75',
 'feat_76',
 'feat_77',
 'feat_78',
 'feat_79',
 'feat_80',
 'feat_81',
 'feat_82',
 'feat_83',
 'feat_84',
 