#combination of three models

In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

In [2]:
train_df = pd.read_pickle('train_df.pkl')
test_df = pd.read_pickle('test_df.pkl')

In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

test_df['primary_use'] = le.fit_transform(test_df['primary_use']).astype(np.int8)

#change meter_reading value for site_0 train data according to Sohier Dane from Kaggle Team
#https://www.kaggle.com/c/ashrae-energy-prediction/discussion/119261
train_df[train_df["site_id"]==0].meter_reading = train_df[train_df["site_id"]==0].meter_reading * 0.2931

test_df["month"] = test_df["timestamp"].apply(lambda x: int(x[5:7]))
test_df["hour"] = test_df["timestamp"].apply(lambda x: int(x[11:13]))

#handling missing values
train_df['floor_count'] = train_df['floor_count'].fillna(-999).astype(np.int16)
test_df['floor_count'] = test_df['floor_count'].fillna(-999).astype(np.int16)

train_df['year_built'] = train_df['year_built'].fillna(-999).astype(np.int16)
test_df['year_built'] = test_df['year_built'].fillna(-999).astype(np.int16)

train_df['age'] = train_df['age'].fillna(-999).astype(np.int16)
test_df['age'] = test_df['age'].fillna(-999).astype(np.int16)

train_df['cloud_coverage'] = train_df['cloud_coverage'].fillna(-999).astype(np.int16)
test_df['cloud_coverage'] = test_df['cloud_coverage'].fillna(-999).astype(np.int16) 

del train_df["timestamp"], test_df["timestamp"]
categoricals = ["site_id", "building_id", "primary_use",  "meter",  "month", "hour", "day_of_week"]
drop_cols = ["sea_level_pressure", "wind_speed","wind_direction"]

numericals = ["square_feet", "year_built", "air_temperature", "cloud_coverage",
              "dew_temperature", 'precip_depth_1_hr', 'floor_count', 'beaufort_scale']

feat_cols = categoricals + numericals
target = np.log1p(train_df["meter_reading"])

del train_df["meter_reading"] 

train_df = train_df.drop(drop_cols, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [4]:
# k-fold
folds = 5
seed = 2019
shuffle = False
kf = KFold(n_splits=folds, shuffle=shuffle, random_state=seed)

In [5]:
#train_X = train_df[feat_cols].iloc[train_index]
#val_X = train_df[feat_cols].iloc[val_index]
#train_y = target.iloc[train_index]
#val_y = target.iloc[val_index]

In [6]:
#lgbm
import lightgbm as lgb

def lgb_model(train_x,train_y,val_x,val_y,test,verbose):
    params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'rmse'},
            'subsample_freq': 1,
            'learning_rate': 0.3,
            'bagging_freq': 5,
            'num_leaves': 330,
            'feature_fraction': 0.9,
            'lambda_l1': 1,  
            'lambda_l2': 1
            }
    lgb_train = lgb.Dataset(train_X, train_y, categorical_feature=categoricals)
    lgb_eval = lgb.Dataset(val_X, val_y, categorical_feature=categoricals)
    model = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=(lgb_train, lgb_eval),
                early_stopping_rounds=50,
                verbose_eval = 50)
    best_idx = np.argmin(np.array(record['valid_0']['rmse']))

    val_pred = model.predict(val_x, num_iteration = model.best_iteration)
    test_pred = model.predict(test, num_iteration = model.best_iteration)
    
    return {'val':val_pred, 'test':test_pred, 'error':record['valid_0']['rmse'][best_idx], 'importance':model.feature_importance('gain')}

In [7]:
#XGBoosting
import libxgboost as xgb

def xgb_model(train_x,train_y,val_x,val_y,test,verbose):
    
    params = {'objective': 'reg:linear', 
              'eta': 0.01, 
              'max_depth': 6, 
              'subsample': 0.6, 
              'colsample_bytree': 0.7,  
              'eval_metric': 'rmse', 
              'seed': random_seed, 
              'silent': True,
    }
    
    record = dict()
    model = xgb.train(params
                      , xgb.DMatrix(train_x, train_y)
                      , 100000
                      , [(xgb.DMatrix(train_x, train_y), 'train'), (xgb.DMatrix(val_x, val_y), 'valid')]
                      , verbose_eval=verbose
                      , early_stopping_rounds=500
                      , callbacks = [xgb.callback.record_evaluation(record)])
    best_idx = np.argmin(np.array(record['valid']['rmse']))

    val_pred = model.predict(xgb.DMatrix(val_x), ntree_limit=model.best_ntree_limit)
    test_pred = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)

    return {'val':val_pred, 'test':test_pred, 'error':record['valid']['rmse'][best_idx], 'importance':[i for k, i in model.get_score().items()]}

ModuleNotFoundError: No module named 'libxgboost'

In [None]:
#Random Forest
#from sklearn.ensemble import forest
#def ran_for(train_x,train_y,val_x,val_y,test):
    
#model = RandomForestRegressor(n_estimators=60,
#                             random_state=0,n_jobs=-1)
#   model.fit(train_x,train_y)

In [None]:
result_dict = dict()
val_pred = np.zeros(train_df.shape[0])
test_pred = np.zeros(test_df.shape[0])
final_err = 0
verbose = False

for i, (trn, val) in enumerate(kf) :
    print(i+1, "fold.    RMSE")
    
    trn_x = train_df.loc[trn, :]
    trn_y = y[trn]
    val_x = train_df.loc[val, :]
    val_y = y[val]
    
    fold_val_pred = []
    fold_test_pred = []
    fold_err = []
    
    #""" xgboost
    start = datetime.now()
    result = xgb_model(trn_x, trn_y, val_x, val_y, test, verbose)
    fold_val_pred.append(result['val']*0.2)
    fold_test_pred.append(result['test']*0.2)
    fold_err.append(result['error'])
    print("xgb model.", "{0:.5f}".format(result['error']), '(' + str(int((datetime.now()-start).seconds/60)) + 'm)')
    #"""
    
    #""" lightgbm
    start = datetime.now()
    result = lgb_model(trn_x, trn_y, val_x, val_y, test, verbose)
    fold_val_pred.append(result['val']*0.4)
    fold_test_pred.append(result['test']*0.4)
    fold_err.append(result['error'])
    print("lgb model.", "{0:.5f}".format(result['error']), '(' + str(int((datetime.now()-start).seconds/60)) + 'm)')
    #"""
    
    # mix result of multiple models
    val_pred[val] += np.mean(np.array(fold_val_pred), axis = 0)
    #print(fold_test_pred)
    #print(fold_test_pred.shape)
    #print(fold_test_pred.columns)
    test_pred += np.mean(np.array(fold_test_pred), axis = 0) / k
    final_err += (sum(fold_err) / len(fold_err)) / k
    
    print("---------------------------")
    print("avg   err.", "{0:.5f}".format(sum(fold_err) / len(fold_err)))
    print("blend err.", "{0:.5f}".format(np.sqrt(np.mean((np.mean(np.array(fold_val_pred), axis = 0) - val_y)**2))))
    
    print('')
    
print("fianl avg   err.", final_err)
print("fianl blend err.", np.sqrt(np.mean((val_pred - y)**2)))