In [6]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, SelectKBest, RFECV, f_regression 
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import  SVR

X_test = pd.read_csv("X_test.csv")
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")

columns = X_train.columns[1:]

# convert to np arrays
X = X_train.iloc[:,1:].to_numpy()
y = y_train.iloc[:,1:].to_numpy().reshape(-1)
X_test = X_test.iloc[:,1:].to_numpy()


x_stack = np.vstack((X,X_test))
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=18)
x_stack = imputer.fit_transform(x_stack)
X = x_stack[:1212]
X_test = x_stack[1212:]

outliers = IsolationForest(n_estimators=1200, random_state=0).fit_predict(X)
#behaviour='new', max_samples=100, random_state = 1, contamination='auto'
X = X[np.where(outliers == 1)[0]].copy()
y = y[np.where(outliers == 1)[0]].copy()

selector = VarianceThreshold()
selector.fit(X)
to_delete = np.where(selector.variances_ == 0)[0]
X = np.delete(X, to_delete, axis=1)
X_test = np.delete(X_test, to_delete, axis=1)

selector = SelectKBest(f_regression, k=95)
selector.fit(X, y)
X = selector.transform(X)
X_test = selector.transform(X_test)
indices = selector.get_support(indices=True)



params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 1000,
    'learning_rate': 0.01,
    'feature_fraction': 0.95,
    'bagging_fraction': 0.9,
    'bagging_freq': 50,
    'verbose': 0
}
#params = best

import lightgbm as lgb
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=0)
lgb_train = lgb.Dataset(X_train, y_train)
#lgb_train = lgb.Dataset(X, y)
lgb_eval = lgb.Dataset(X_val, y_val)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval,
                early_stopping_rounds=1000
                )



You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's l1: 7.46391	valid_0's l2: 86.5908
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's l1: 7.41387	valid_0's l2: 85.5974
[3]	valid_0's l1: 7.36903	valid_0's l2: 84.6836
[4]	valid_0's l1: 7.3259	valid_0's l2: 83.815
[5]	valid_0's l1: 7.28372	valid_0's l2: 82.9871
[6]	valid_0's l1: 7.23984	valid_0's l2: 82.1285
[7]	valid_0's l1: 7.19223	valid_0's l2: 81.2177
[8]	valid_0's l1: 7.15634	valid_0's l2: 80.4811
[9]	valid_0's l1: 7.10973	valid_0's l2: 79.5953
[10]	valid_0's l1: 7.07622	valid_0's l2: 78.8934
[11]	valid_0's l1: 7.03537	valid_0's l2: 78.1384
[12]	valid_0's l1: 6.99263	valid_0's l2: 77.3337
[13]	valid_0's l1: 6.95413	valid_0's l2: 76.5843
[14]	valid_0's l1: 6.91993	valid_0's l2: 75.9383
[15]	valid_0's l1: 6.88748	valid_0's l2: 75.2398
[16]	valid_0's l1: 6.84986	valid_0's l2: 74.4906
[17]	valid_0's l1: 6.81483	valid_0's l2: 73.7665
[18]	valid_0's l1: 6.77537	valid_0's l2: 73.0264
[19]	val

## Hyperopt to tune hyperparameters ????

In [7]:
from sklearn.metrics import r2_score

y_val_pred = gbm.predict(X_val)
r_squared = r2_score(y_val, y_val_pred)
adjusted_r_squared = 1 - (1-r_squared)*(len(y)-1)/(len(y)-X.shape[1]-1)
print("R-squared : ", r_squared)
print("Adjusted :", adjusted_r_squared)

R-squared :  0.6710273546657239
Adjusted : 0.6429479285141011


In [8]:
y_pred = gbm.predict(X_test)
X_test = pd.read_csv("X_test.csv")
df_ids = pd.DataFrame(X_test['id'])
df_predictions = df_ids.join(pd.DataFrame(y_pred, columns=['y']))
df_predictions.to_csv('model_results/gbm_y_test.csv', index=False)

In [9]:
y_pred = gbm.predict(X)
X = pd.read_csv("X_train.csv")
df_ids = pd.DataFrame(X['id'])
df_predictions = df_ids.join(pd.DataFrame(y_pred, columns=['y']))
df_predictions.to_csv('model_results/gbm_y_train.csv', index=False)

In [None]:
from hyperopt import hp, tpe, Trials
from hyperopt.fmin import fmin
import lightgbm as lgb
import hyperopt
import lightgbm
import warnings
warnings.simplefilter("ignore")


random_state=42
n_iter = 1000

# trials will contain logging information

best_dict= {}
trials = Trials()
space = {
      'boosting_type': hp.choice('boosting_type',
                                ['gbdt','goss']),
      #dart
      'objective': hp.choice('objective',
                                ['regression','huber']),
      'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
      'colsample_bytree': hp.uniform('colsample_bytree', 0.1,0.9),
      #n_estimators': hp.quniform('n_estimators', 100, 3000, 1),
      'max_depth' : hp.quniform('max_depth', 2, 20, 1),
      'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
      'subsample':        hp.uniform('subsample', 0.5, 1),
      'reg_alpha':        hp.uniform('reg_alpha', 0.001, 1),
      'reg_lambda':        hp.uniform('reg_lambda', 0.001, 1),
      'num_leaves': hp.quniform('num_leaves', 10, 5000, 1),
      'n_rows': hp.quniform('n_rows', 1000, len(X_train), 1),
      'feature_fraction': hp.uniform('feature_fraction', 0.2, 1),
  }

def gb_mse_cv(params, random_state=random_state):

    print(params)
    params['task'] = 'train'
    params['metric'] = 'l2'
    params['verbose'] = -1
    params['verbose_eval'] = -1

    n_rows = int(params.pop('n_rows'))
    #params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])
    params['num_leaves'] = int(params['num_leaves'])

    train_data = lgb.Dataset(X_train[-n_rows:], label=y_train[-n_rows:],free_raw_data=False)
    valid_data = lgb.Dataset(X_test, label=y_test,free_raw_data=False)
    gbm = lgb.train(params,train_data,valid_sets=(valid_data,train_data),early_stopping_rounds=100,verbose_eval=False)
    score = gbm.best_score['valid_0']['l2']
    print(score)
    return score


best=fmin(fn=gb_mse_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
        )
best['boosting_type'] = ['gbdt','goss'][best['boosting_type']]
best['objective'] = ['regression','huber'][best['objective']]
best['max_depth'] = int(best['max_depth'])
best['n_estimators'] = int(best['n_estimators'])
best['num_leaves'] = int(best['num_leaves'])
best 

In [None]:
best

In [19]:
best = {'boosting_type': 'gbdt',
 'colsample_bytree': 0.5883400003755409,
 'learning_rate': 0.10708554906532587,
 'max_depth': 17,
 'min_child_weight': 4,
 'n_rows': 1045.0,
 'num_leaves': 91,
 'objective': 'regression',
 'reg_alpha': 0.03844202878778796,
 'reg_lambda': 0.10271853898205738,
 'subsample': 0.600453777629902}

In [None]:
'boosting_type': 'gbdt',
 'colsample_bytree': 0.489874680107508,
 'learning_rate': 0.0884474182366333,
 'max_depth': 7,
 'min_child_weight': 0,
 'n_rows': 1045.0,
 'num_leaves': 4606.0,
 'objective': 'regression',
 'reg_alpha': 0.4428519088658227,
 'reg_lambda': 0.22634346543652997,
 'subsample': 0.7789138943694276}