In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import ShuffleSplit
import lightgbm as lgb
import numpy as np
import pandas as pd

import os
import sys
sys.path.append('C:\\Users\\yu886\\OneDrive\\デスクトップ\\github\\Sony\\src')

from Processing.load_dataset import Load_dataset

from Engineering.dummy import engin
from Valid.validation import rmse

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

import warnings
warnings.simplefilter('ignore', category=UserWarning)
import matplotlib.pyplot as plt
from Processing.processing import Submission

from Engineering.dummy import engin_2
import xgboost as xgb

In [2]:
def add(df_train):
    df_train["co_avg"] = df_train["co_mid"] / df_train["co_cnt"]
    df_train["o3_avg"] = df_train["o3_mid"] / df_train["o3_cnt"]
    df_train["so2_avg"] = df_train["so2_mid"] / df_train["so2_cnt"]
    df_train["no2_avg"] = df_train["no2_mid"] / df_train["no2_cnt"]
    df_train["temperature_avg"] = df_train["temperature_mid"] / df_train["temperature_cnt"]
    df_train["humidity_avg"] = df_train["humidity_mid"] / df_train["humidity_cnt"]
    df_train["pressure_avg"] = df_train["pressure_mid"] / df_train["pressure_cnt"]
    df_train["ws_avg"] = df_train["ws_mid"] / df_train["ws_cnt"]
    df_train["dew_avg"] = df_train["dew_mid"] / df_train["dew_cnt"]
    return df_train

def add_2(df):
    df["discomfort"] = 0.81*df["temperature_mid"] + 0.01*df["humidity_mid"]*(0.99*df["temperature_mid"] - 14.3) + 46.3
    return df

def add_3(df):
    df["avg_count"] = (df_train["co_cnt"] + df_train["o3_cnt"] + df_train["so2_cnt"] + df_train["no2_cnt"] +
                       df_train["temperature_cnt"] + df_train["humidity_cnt"] + df_train["pressure_cnt"] + df_train["ws_cnt"] + df_train["dew_cnt"]) / 9
    return df

In [3]:
train, test = Load_dataset()

df_train = engin_2(train)
df_test = engin_2(test)

df_train = add(df_train)
df_test = add(df_test)

df_train = add_2(df_train)
df_test = add_2(df_test)

df_train = add_3(df_train)
df_test = add_3(df_test)

target = df_train["pm25_mid"]

df_train = df_train.drop(["id","pm25_mid"], axis=1)
df_test = df_test.drop(["id"], axis=1)

In [None]:
FOLD = 5
NUM_ROUND = 2000
VERBOSE_EVAL = -1

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate':0.01,
    'max_depth':6,
}

valid_scores = []
models = []
ss = ShuffleSplit(n_splits=FOLD, test_size=0.25, random_state=123)

for fold, (train_indices, valid_indices) in enumerate(ss.split(df_train)):
    X_train, X_valid = df_train.iloc[train_indices], df_train.iloc[valid_indices]
    y_train, y_valid = target.iloc[train_indices], target.iloc[valid_indices]
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_valid, label=y_valid)
    evals = [(dtrain, 'train'), (dtest, 'eval')]

    model = xgb.train(params,
                      dtrain,
                      num_boost_round=NUM_ROUND,
                      evals=evals,
                      early_stopping_rounds=20,
                      verbose_eval=200
                      )
    dm_valid = xgb.DMatrix(X_valid)
    y_valid_pred = model.predict(dm_valid)
    MSE = mean_squared_error(y_valid, y_valid_pred)
    r2 = r2_score(y_valid, y_valid_pred)
    MAE = mean_absolute_error(y_valid, y_valid_pred)
    
    print(f'fold {fold} MSE: {MSE} RMSE: {np.sqrt(MSE)} R^2: {r2} MAE: {MAE}')
    valid_scores.append(np.sqrt(MSE))
    models.append(model)

cv_score = np.mean(valid_scores)
print(f'CV score: {cv_score}')

In [14]:
xgb_test = xgb.DMatrix(df_test)

pred_1 = models[0].predict(xgb_test)
pred_2 = models[1].predict(xgb_test)
pred_3 = models[2].predict(xgb_test)
pred_4 = models[3].predict(xgb_test)
pred_5 = models[4].predict(xgb_test)

pred = np.stack([pred_1, pred_2, pred_3 ,pred_4, pred_5], axis=1)
pred = np.mean(pred, axis=1)

index = test["id"]
Submission(index, pred, name='xgb_count_avg_var')

In [15]:
pred

array([26.476385, 40.119514, 25.472378, ..., 67.5683  , 34.386482,
       40.19314 ], dtype=float32)

In [4]:
from sklearn.model_selection import KFold

publisher_train = train['City']
unique_publisher = train['City'].unique()

In [22]:
FOLD = 5
NUM_ROUND = 10000
VERBOSE_EVAL = -1

params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'verbose': -1,
    'learning_rate':0.01
}

valid_scores = []
models = []

kf = KFold(n_splits=FOLD, shuffle=True, random_state=123)
for fold, (tr_group_idx, va_group_idx) in enumerate(kf.split(unique_publisher)):
    tr_groups, va_groups = unique_publisher[tr_group_idx], unique_publisher[va_group_idx]
    
    is_tr = publisher_train.isin(tr_groups)
    is_va = publisher_train.isin(va_groups)
    # ここでCityの情報を消したい
    
    X_train, X_valid = df_train[is_tr], df_train[is_va]
    y_train, y_valid = target[is_tr], target[is_va]
    
    X_train = X_train.drop("City" ,axis=1)
    X_valid = X_valid.drop("City" ,axis=1)
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid)

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=NUM_ROUND,
        verbose_eval=VERBOSE_EVAL
    )

    y_valid_pred = model.predict(X_valid)
    MSE = mean_squared_error(y_valid, y_valid_pred)
    r2 = r2_score(y_valid, y_valid_pred)
    MAE = mean_absolute_error(y_valid, y_valid_pred)
    
    print(f'fold {fold} MSE: {MSE} RMSE: {np.sqrt(MSE)} R^2: {r2} MAE: {MAE}')
    valid_scores.append(np.sqrt(MSE))
    models.append(model)

cv_score = np.mean(valid_scores)
print(f'CV score: {cv_score}')

fold 0 MSE: 433.65964670210286 RMSE: 20.824496313286975 R^2: 0.6273265696251071 MAE: 15.144815952647726
fold 1 MSE: 490.4381543630323 RMSE: 22.145838307976337 R^2: 0.6555928430461975 MAE: 16.036445026494356
fold 2 MSE: 466.7595585954529 RMSE: 21.60461891807983 R^2: 0.6451460579103949 MAE: 15.529845632290147
fold 3 MSE: 516.1983216573208 RMSE: 22.719998275909283 R^2: 0.6583782319676745 MAE: 16.527023843550708
fold 4 MSE: 621.7459240717869 RMSE: 24.934833548106692 R^2: 0.6951277199706589 MAE: 16.70520602034777
CV score: 22.445957072671824


In [26]:
# df_test = df_test.drop('City', axis=1)
pred_1 = models[0].predict(df_test)
pred_2 = models[1].predict(df_test)
pred_3 = models[2].predict(df_test)
pred_4 = models[3].predict(df_test)
pred_5 = models[4].predict(df_test)

pred = np.stack([pred_1, pred_2, pred_3 ,pred_4, pred_5], axis=1)
pred = np.mean(pred, axis=1)

index = test["id"]
Submission(index, pred, name='lgb_group')
pred

array([25.77790191, 39.14268359, 24.63281223, ..., 67.68930128,
       34.0965532 , 37.50456546])