In [41]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold
from scipy import sparse
import warnings
import time
import sys
import os
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
pd.set_option('max_colwidth',100)

In [42]:
train = pd.read_csv("./donejobs.cleaned.csv")

In [43]:
df_added_features =\
        train['jobName'].str.split(
            ":",
            expand=True)[[1, 2]].rename(columns={1: "jobName_1",
                                                 2: "jobName_2"})
train = pd.concat([train, df_added_features], axis=1)

In [44]:
del train['jobName']
train.head()

Unnamed: 0,memReq,numProcessors,avgMem,fromHost,maxMem,projectName,userGroup,userName,cpuTime,queue,command,jobName_1,jobName_2
0,2.7373,1,1.1858,sj-wkchow,1.8959,EDIRND,icd_1,wkchow,124.374,qosH,bash /vols/feflowrd_t1b_001/wkchow/sdisk400/qos/rundir/qos_run/TI_ddr_phys_28nm/TI_ddr_phys_28nm...,humtest,TI_ddr_phys_28nm
1,4.5615,1,1.7877,sj-wkchow,3.2384,EDIRND,icd_1,wkchow,222.2,qosH,bash /vols/feflowrd_t1b_001/wkchow/sdisk400/qos/rundir/qos_run/MMMC_brcm_AVD/MMMC_brcm_AVD_640_1...,humtest,MMMC_brcm_AVD
2,2.0156,1,0.7991,sj-wkchow,1.3901,EDIRND,icd_1,wkchow,77.424,qosH,bash /vols/feflowrd_t1b_001/wkchow/sdisk400/qos/rundir/qos_run/GF_trides_mac_20lpm/GF_trides_mac...,humtest,GF_trides_mac_20lpm
3,3.1758,1,1.3983,sj-wkchow,2.0619,EDIRND,icd_1,wkchow,133.506,qosH,bash /vols/feflowrd_t1b_001/wkchow/sdisk400/qos/rundir/qos_run/FSL_e6500_core_asvaaa_28nm/FSL_e6...,humtest,FSL_e6500_core_asvaaa_28nm
4,1.4219,1,0.604,sj-wkchow,0.92,EDIRND,icd_1,wkchow,97.877,qosH,bash /vols/feflowrd_t1b_001/wkchow/sdisk400/qos/rundir/qos_run/lym0_20nm/lym0_20nm_640_149434878...,humtest,lym0_20nm


In [45]:
for f in ['numProcessors','fromHost','projectName','userGroup','userName','queue','command','jobName_1','jobName_2']:
    train[f] = train[f].map(dict(zip(train[f].unique(), range(0, train[f].nunique()))))
train.head()

Unnamed: 0,memReq,numProcessors,avgMem,fromHost,maxMem,projectName,userGroup,userName,cpuTime,queue,command,jobName_1,jobName_2
0,2.7373,0,1.1858,0,1.8959,0,0.0,0,124.374,0,0,0.0,0.0
1,4.5615,0,1.7877,0,3.2384,0,0.0,0,222.2,0,1,0.0,1.0
2,2.0156,0,0.7991,0,1.3901,0,0.0,0,77.424,0,2,0.0,2.0
3,3.1758,0,1.3983,0,2.0619,0,0.0,0,133.506,0,3,0.0,3.0
4,1.4219,0,0.604,0,0.92,0,0.0,0,97.877,0,4,0.0,4.0


In [46]:
train = train.fillna(-1)
df_train, df_test = train_test_split(train, test_size=0.19999, train_size=0.8)
df_predict = train.drop(df_train.index.union(df_test.index)).reset_index()

In [47]:
y_train = df_train['maxMem'].values
del df_train['maxMem']

In [49]:
features = [f for f in train.columns if f not in ['maxMem']]

X_train = df_train[features].values
X_test = df_test[features].values
# one hot
enc = OneHotEncoder(categories='auto')
for f in features:
    enc.fit(train[f].values.reshape(-1, 1))
    X_train = sparse.hstack((X_train, enc.transform(df_train[f].values.reshape(-1, 1))), 'csr')
    X_test = sparse.hstack((X_test, enc.transform(df_test[f].values.reshape(-1, 1))), 'csr')
print(X_train.shape)
print(X_test.shape)

(320270, 960923)
(80064, 960923)


In [51]:
param = {'num_leaves': 120,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 30,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'mse',
         "lambda_l1": 0.1,
         "verbosity": -1}

folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(df_train))
predictions_lgb = np.zeros(len(df_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    clf = lgb.train(param, 
                    trn_data, 
                    num_boost_round=10000, 
                    valid_sets = [trn_data, val_data], 
                    verbose_eval = 200, 
                    early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

print("LGB score: {}".format((mean_squared_error(oof_lgb, target))))


fold n°1
Training until validation scores don't improve for 200 rounds.
[200]	training's l2: 90.4562	training's lgbFeval: 45.2281	valid_1's l2: 108.69	valid_1's lgbFeval: 54.3452
[400]	training's l2: 59.7794	training's lgbFeval: 29.8897	valid_1's l2: 86.511	valid_1's lgbFeval: 43.2555
[600]	training's l2: 49.276	training's lgbFeval: 24.638	valid_1's l2: 80.3465	valid_1's lgbFeval: 40.1732
[800]	training's l2: 43.3023	training's lgbFeval: 21.6511	valid_1's l2: 77.7368	valid_1's lgbFeval: 38.8684
[1000]	training's l2: 39.1033	training's lgbFeval: 19.5517	valid_1's l2: 76.2729	valid_1's lgbFeval: 38.1364
[1200]	training's l2: 35.6702	training's lgbFeval: 17.8351	valid_1's l2: 75.0306	valid_1's lgbFeval: 37.5153
[1400]	training's l2: 32.9537	training's lgbFeval: 16.4768	valid_1's l2: 74.3181	valid_1's lgbFeval: 37.1591
[1600]	training's l2: 30.6471	training's lgbFeval: 15.3236	valid_1's l2: 73.6449	valid_1's lgbFeval: 36.8225
[1800]	training's l2: 28.6926	training's lgbFeval: 14.3463	valid

KeyboardInterrupt: 