In [1]:
import os
import sys

# Directory appended to the import path so the project-local `libs.*` imports
# in the following cells can be resolved (presumably the project's `src`
# directory -- confirm). The default is the original hard-coded location; set
# the ISID_PHM_SRC environment variable to run the notebook from another
# checkout without editing this cell.
SRC_DIR = os.environ.get('ISID_PHM_SRC', 'D://isid_phm/src')

# Re-running this cell in the same kernel must not pile up duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [2]:
from libs.Dataset import Dataset

# Load the dataset through Dataset.load_data with `reproduce` switched on and
# both `cutoff` and `write_pickel` switched off, then display its shape.
df = Dataset().load_data(
    reproduce=True,
    cutoff=False,
    write_pickel=False,
)
df.shape

(63727, 31)

In [3]:
# Load the raw data through Dataset.load_raw_data and display its shape.
# NOTE: this rebinds `df`, replacing the frame loaded in the previous cell.
df = Dataset().load_raw_data()
df.shape

(66576, 30)

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression

from libs.Dataset import Dataset
from libs.engine_summarize.EngineSumBase import EngineSumBase
from libs.engine_summarize.EngineSumTimeGrad import EngineSumTimeGrad
from libs.engine_summarize.EngineSumLastDur import EngineSumLastDur
from libs.engine_summarize.EngineSumBasics import EngineSumBasics
from libs.standarzation import standarzation_x, encode_y, decode_z
from libs.get_train_valid_test import get_train_valid_test
from libs.io.submit import submitform
from libs.io.label_valid import label_valid, valid_engine_random
from libs.io.train_cut_off import cutoff_like_test

In [5]:
# Experiment configuration. Each entry is unpacked into a module-level
# name (REGENARATE, TRAIN_CUT, ...) in the next cell; `model_params` is
# forwarded as keyword arguments to the estimator selected by `use_model`.
params = dict(
    regenarate=True,
    train_cutoff=True,
    num_resample_train=1,
    scaling=True,
    use_model='MLP',
    model_params=dict(
        hidden_layer_sizes=12,
        activation='tanh',
        random_state=3,
    ),
    submit=False,
)

In [6]:
# Unpack the experiment configuration into the module-level names that the
# pipeline cells read. The order of the targets matches the order of the keys.
(REGENARATE,
 TRAIN_CUT,
 NUM_RESAMPLE_TRAIN,
 SCALING,
 USE_MODEL,
 model_params,
 SUBMIT) = (
    params[key]
    for key in ('regenarate', 'train_cutoff', 'num_resample_train',
                'scaling', 'use_model', 'model_params', 'submit')
)

In [39]:
# Original version: the dataset is loaded through Dataset.load_data
df = Dataset().load_data(REGENARATE, TRAIN_CUT, NUM_RESAMPLE_TRAIN, False)

# Build the per-engine features; each summarizer receives the summary
# produced by the previous one
summarized_df = EngineSumBase().create_feature(df, REGENARATE)
summarized_df = EngineSumTimeGrad().create_feature(
    df, summarized_df, REGENARATE)
summarized_df = EngineSumLastDur().create_feature(
    df, summarized_df, REGENARATE)
summarized_df = EngineSumBasics().create_feature(
    df, summarized_df, REGENARATE)

# Split into train, valid and test; missing values are replaced with 0
train, valid, test = get_train_valid_test(summarized_df)
x_learn = train.drop(['dead_duration'], axis=1).fillna(0)
y_learn = train['dead_duration'].fillna(0)
x_valid = valid.drop(['dead_duration'], axis=1).fillna(0)
y_valid = valid['dead_duration'].fillna(0)
x_test = test.drop(['dead_duration'], axis=1).fillna(0)

# Target statistics come from the training split only and are reused when
# the predictions are decoded
y_mean, y_std = y_learn.mean(), y_learn.std()

# Normalize using the training split
if SCALING:
    x_learn, x_valid, x_test = standarzation_x(x_learn, x_valid, x_test)
    yz_learn = encode_y(y_learn, y_mean, y_std)
else:
    # Previously `yz_learn` was left undefined here, so the fit below either
    # raised NameError or silently reused a stale value from the kernel.
    yz_learn = y_learn

# Model training. A class lookup replaces eval() on expression strings and
# fixes the misspelled 'LinearRegressio' entry that made 'lm' unusable.
model_classes = {
    'lm': LinearRegression,
    'Lasso': Lasso,
    'Ridge': Ridge,
    'LGB': LGBMRegressor,
    'RF': RandomForestRegressor,
    'SVR': SVR,
    'MLP': MLPRegressor,
}

# An unknown USE_MODEL raises KeyError, as it did before
model = model_classes[USE_MODEL](**model_params)
model.fit(x_learn, yz_learn)

# Model evaluation
predict_z = pd.DataFrame(model.predict(x_valid), index=x_valid.index)
# Only undo the target encoding when it was actually applied
predict_y = decode_z(predict_z, y_mean, y_std) if SCALING else predict_z
valid_score = mean_absolute_error(predict_y, y_valid)
print('Valid score:', valid_score)

Valid score: 23.179872966052162


In [43]:
# Replacement version: the dataset is assembled here from the raw data
# instead of going through Dataset.load_data

# Load the raw data
all_df = Dataset().load_raw_data()

# Create the validation set: pick validation engines and label their rows
valid_engine = valid_engine_random(all_df, 30)
all_df = label_valid(all_df, valid_engine)

train = all_df[(all_df['is_train'] == 1) &
               (all_df['is_valid'] != 1)]
test = all_df[all_df['is_train'] == 0]
valid = all_df[all_df['is_valid'] == 1]

if TRAIN_CUT:
    # Turn the training flights into incomplete flight data, like the test data
    cut_train = cutoff_like_test(train, train, NUM_RESAMPLE_TRAIN)
    merged_df = pd.concat([cut_train, test], axis=0)
else:
    merged_df = pd.concat([train, test], axis=0)

# The validation data is cut off exactly once
# NOTE(review): here the second argument is `test`, while the train cut-off
# above passes `train` -- confirm this asymmetry is intended.
cut_valid = cutoff_like_test(valid, test, 1)
df = pd.concat([merged_df, cut_valid], axis=0)

# Build the per-engine features; each summarizer receives the summary
# produced by the previous one
summarized_df = EngineSumBase().create_feature(df, REGENARATE)
summarized_df = EngineSumTimeGrad().create_feature(
    df, summarized_df, REGENARATE)
summarized_df = EngineSumLastDur().create_feature(
    df, summarized_df, REGENARATE)
summarized_df = EngineSumBasics().create_feature(
    df, summarized_df, REGENARATE)

# Split into train, valid and test; missing values are replaced with 0
train, valid, test = get_train_valid_test(summarized_df)
x_learn = train.drop(['dead_duration'], axis=1).fillna(0)
y_learn = train['dead_duration'].fillna(0)
x_valid = valid.drop(['dead_duration'], axis=1).fillna(0)
y_valid = valid['dead_duration'].fillna(0)
x_test = test.drop(['dead_duration'], axis=1).fillna(0)

# Target statistics come from the training split only and are reused when
# the predictions are decoded
y_mean, y_std = y_learn.mean(), y_learn.std()

# Normalize using the training split
if SCALING:
    x_learn, x_valid, x_test = standarzation_x(x_learn, x_valid, x_test)
    yz_learn = encode_y(y_learn, y_mean, y_std)
else:
    # Previously `yz_learn` was left undefined here, so the fit below either
    # raised NameError or silently reused a stale value from the kernel.
    yz_learn = y_learn

# Model training. A class lookup replaces eval() on expression strings and
# fixes the misspelled 'LinearRegressio' entry that made 'lm' unusable.
model_classes = {
    'lm': LinearRegression,
    'Lasso': Lasso,
    'Ridge': Ridge,
    'LGB': LGBMRegressor,
    'RF': RandomForestRegressor,
    'SVR': SVR,
    'MLP': MLPRegressor,
}

# An unknown USE_MODEL raises KeyError, as it did before
model = model_classes[USE_MODEL](**model_params)
model.fit(x_learn, yz_learn)

# Model evaluation
predict_z = pd.DataFrame(model.predict(x_valid), index=x_valid.index)
# Only undo the target encoding when it was actually applied
predict_y = decode_z(predict_z, y_mean, y_std) if SCALING else predict_z
valid_score = mean_absolute_error(predict_y, y_valid)
print('Valid score:', valid_score)

Valid score: 23.179872966052162


#### Cross-validation (CV) version

In [8]:
# Load the raw data; the cross-validation loop labels validation engines on it
raw_df = Dataset().load_raw_data()

In [29]:
FOLD_NUM = 5

# Engine numbers that belong to the training data
train_eg = raw_df.loc[raw_df['is_train'] == 1, 'engine_no'].unique()

# Partition the training engines into FOLD_NUM groups (5-fold)
eg_split = np.array_split(train_eg, FOLD_NUM)

# NOTE: valid_evaluation is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be executed before this one.
fold_scores = []
for fold_no, valid_engines in enumerate(eg_split, start=1):
    # Label the engines of this fold as validation data, then train and score
    fold_df = label_valid(raw_df, valid_engines)
    fold_score = valid_evaluation(fold_df)
    print('%s/%s score: %s' % (fold_no, FOLD_NUM, fold_score))
    fold_scores.append(fold_score)

# One row per fold, indexed 0 .. FOLD_NUM-1
cv_df = pd.DataFrame({'score': fold_scores}, index=range(FOLD_NUM))

# The CV score is the average of the per-fold scores
cv_score = cv_df['score'].sum()/FOLD_NUM
print('CV score: %f' % cv_score)



1/5 score: 41.588814700891135




2/5 score: 35.248978069863675




3/5 score: 32.62707130535347
4/5 score: 25.701507632554772
5/5 score: 28.400236885006127
CV score: 32.713322




In [30]:
# Display the per-fold validation scores collected by the cross-validation loop
cv_df

Unnamed: 0,score
0,41.588815
1,35.248978
2,32.627071
3,25.701508
4,28.400237


In [28]:
def valid_evaluation(all_df):
    """Train the configured model and return its validation score.

    Rows of ``all_df`` are separated by their ``is_train`` / ``is_valid``
    flags, per-engine features are built, a model chosen by ``USE_MODEL`` is
    fitted on the training engines, and the mean absolute error on the
    validation engines is returned.

    The function reads the module-level settings ``TRAIN_CUT``,
    ``NUM_RESAMPLE_TRAIN``, ``REGENARATE``, ``SCALING``, ``USE_MODEL`` and
    ``model_params``.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Data containing at least the ``is_train`` and ``is_valid`` columns.

    Returns
    -------
    float
        Mean absolute error between the predictions and ``dead_duration``
        on the validation split.

    Raises
    ------
    KeyError
        If ``USE_MODEL`` is not one of the supported model keys.
    """
    train_rows = all_df[(all_df['is_train'] == 1) &
                        (all_df['is_valid'] != 1)]
    test_rows = all_df[all_df['is_train'] == 0]
    valid_rows = all_df[all_df['is_valid'] == 1]

    if TRAIN_CUT:
        # Turn the training flights into incomplete flight data, like the
        # test data
        cut_train = cutoff_like_test(
            train_rows, train_rows, NUM_RESAMPLE_TRAIN)
        merged_df = pd.concat([cut_train, test_rows], axis=0)
    else:
        merged_df = pd.concat([train_rows, test_rows], axis=0)

    # The validation data is cut off exactly once
    cut_valid = cutoff_like_test(valid_rows, test_rows, 1)
    df = pd.concat([merged_df, cut_valid], axis=0)

    # Build the per-engine features; each summarizer receives the summary
    # produced by the previous one
    summarized_df = EngineSumBase().create_feature(df, REGENARATE)
    summarized_df = EngineSumTimeGrad().create_feature(
        df, summarized_df, REGENARATE)
    summarized_df = EngineSumLastDur().create_feature(
        df, summarized_df, REGENARATE)
    summarized_df = EngineSumBasics().create_feature(
        df, summarized_df, REGENARATE)

    # Split into train, valid and test; missing values are replaced with 0
    train, valid, test = get_train_valid_test(summarized_df)
    x_learn = train.drop(['dead_duration'], axis=1).fillna(0)
    y_learn = train['dead_duration'].fillna(0)
    x_valid = valid.drop(['dead_duration'], axis=1).fillna(0)
    y_valid = valid['dead_duration'].fillna(0)
    x_test = test.drop(['dead_duration'], axis=1).fillna(0)

    # Target statistics come from the training split only and are reused
    # when the predictions are decoded
    y_mean, y_std = y_learn.mean(), y_learn.std()

    # Normalize using the training split
    if SCALING:
        x_learn, x_valid, x_test = standarzation_x(x_learn, x_valid, x_test)
        yz_learn = encode_y(y_learn, y_mean, y_std)
    else:
        # Previously `yz_learn` was never bound on this path, so the fit
        # below raised an error whenever SCALING was off.
        yz_learn = y_learn

    # Model training. A class lookup replaces eval() on expression strings
    # and fixes the misspelled 'LinearRegressio' entry that made 'lm'
    # unusable.
    model_classes = {
        'lm': LinearRegression,
        'Lasso': Lasso,
        'Ridge': Ridge,
        'LGB': LGBMRegressor,
        'RF': RandomForestRegressor,
        'SVR': SVR,
        'MLP': MLPRegressor,
    }

    model = model_classes[USE_MODEL](**model_params)
    model.fit(x_learn, yz_learn)

    # Model evaluation
    predict_z = pd.DataFrame(model.predict(x_valid), index=x_valid.index)
    # Only undo the target encoding when it was actually applied
    predict_y = decode_z(predict_z, y_mean, y_std) if SCALING else predict_z
    valid_score = mean_absolute_error(predict_y, y_valid)
    return valid_score