In [1]:
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor

In [2]:
pd.options.display.max_rows = 5000

In [2]:
import os
os.listdir()

['dataset',
 '.gitignore',
 '.ipynb_checkpoints',
 '1000_features_xgb_1.42.ipynb',
 '.git',
 '1000_features_xgb_lgb_average_1.43.ipynb',
 'Untitled.ipynb',
 'submission.csv']

In [3]:
train_df = pd.read_csv('dataset/train.csv', index_col='ID')

In [4]:
test_df = pd.read_csv('dataset/test.csv', index_col='ID')

In [6]:
train_df.head()

Unnamed: 0_level_0,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,2200000.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,2000000.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4459 entries, 000d6aaf2 to ffeb15d25
Columns: 4992 entries, target to 9fc776466
dtypes: float64(1845), int64(3147)
memory usage: 169.9+ MB


In [7]:
train_feat = train_df.loc[:, train_df.columns != 'target']
target = train_df.loc[:, 'target']

In [8]:
## dropping columns with zero deviation
dropped_cols = []
for col in train_feat.columns:
    std = train_feat[col].describe().loc['std']
    if std == 0:
        dropped_cols.append(col)

In [9]:
train_feat = train_feat.drop(labels=dropped_cols, axis=1)

In [10]:
train_feat.shape

(4459, 4735)

In [11]:
## dropping duplicate columns
train_feat = train_feat.T.drop_duplicates().T

In [12]:
train_feat.shape

(4459, 4730)

In [13]:
## dropping column that have less than 1% non-zero values
non_zero_perc = []
low_non_zero = []
for col in train_feat.columns:
    perc = train_feat[col].astype(bool).sum(axis=0)/len(train_feat)*100
    non_zero_perc.append(perc)
#     print('column: {}  non zero values: {:.2f}%'.format(col, perc))
    if perc <= 1.0:
        low_non_zero.append(col)

In [14]:
non_zero_perc = np.array(non_zero_perc)

In [15]:
len(non_zero_perc[non_zero_perc <= 1.0])

2103

In [16]:
train_feat.drop(labels=low_non_zero, axis=1, inplace=True)

In [17]:
train_feat.shape

(4459, 2627)

In [18]:
target = np.log1p(target)

In [19]:
test_df = test_df[train_feat.columns]

In [20]:
test_df.shape

(49342, 2627)

In [21]:
train_feat.shape

(4459, 2627)

In [22]:
def add_statistics(data, test):
    # This is part of the trick I think, plus lightgbm has a special process for NaNs
#     data.replace(0, np.nan, inplace=True)
#     test.replace(0, np.nan, inplace=True)
    
    original_features = [f for f in data.columns if f not in ['target', 'ID']]
    for df in [data, test]:
        df['nb_nans'] = df[original_features].isnull().sum(axis=1)
        # All of the stats will be computed without the 0s 
        df['the_median'] = df[original_features].median(axis=1)
        df['the_mean'] = df[original_features].mean(axis=1)
        df['the_sum'] = df[original_features].sum(axis=1)
        df['the_std'] = df[original_features].std(axis=1)
        df['the_kur'] = df[original_features].kurtosis(axis=1)
        
    return data, test

In [23]:
train_feat, test_df = add_statistics(train_feat, test_df)

In [24]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import make_scorer

In [41]:
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.power(y_true - y_pred, 2)))

In [26]:
X_train, X_test, y_train, y_test = train_test_split(train_feat, target, test_size=0.2, shuffle=True)

In [27]:
def rmsle_cv(model, X, y):
    kf = KFold(5, shuffle=True, random_state=42).get_n_splits(X.values)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [28]:
model = XGBRegressor(colsample_bytree=0.055, colsample_bylevel=0.5,
                    gamma=1.5, learning_rate=0.02, max_depth=32, 
                    objective='reg:linear',booster='gbtree',
                    min_child_weight=57, n_estimators=1000, reg_alpha=0, 
                    reg_lambda = 0,eval_metric = 'rmse', subsample=0.7, 
                    silent=1, n_jobs = -1, early_stopping_rounds = 14,
                    random_state =7)

In [29]:
scores = rmsle_cv(model, X_train, y_train)
print(scores)

[1.39873623 1.39540891 1.39904806 1.47678582 1.38782765]


In [30]:
print('{} +- {}'.format(scores.mean(), 2*scores.std()))

1.4115613363517991 +- 0.06572359002378447


In [31]:
%%time
model.fit(X_train, y_train)

CPU times: user 18min 25s, sys: 603 ms, total: 18min 25s
Wall time: 5min 58s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
       colsample_bytree=0.055, early_stopping_rounds=14,
       eval_metric='rmse', gamma=1.5, learning_rate=0.02, max_delta_step=0,
       max_depth=32, min_child_weight=57, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=7,
       reg_alpha=0, reg_lambda=0, scale_pos_weight=1, seed=None, silent=1,
       subsample=0.7)

In [32]:
non_zero_importance_cols = pd.DataFrame({'feature': X_train.columns, 'importance': model.feature_importances_}).sort_values(
    by=['importance'], ascending=[False])
non_zero_importance_cols = non_zero_importance_cols[non_zero_importance_cols.importance > 0]

In [33]:
train_feat = train_feat[non_zero_importance_cols.feature.values]
test_df = test_df[non_zero_importance_cols.feature.values]

In [34]:
train_feat.shape

(4459, 1359)

In [35]:
magic_features = [ \
        'f190486d6', 'c47340d97', 'eeb9cd3aa', '66ace2992', 'e176a204a',
        '491b9ee45', '1db387535', 'c5a231d81', '0572565c2', '024c577b9',
        '15ace8c9f', '23310aa6f', '9fd594eec', '58e2e02e6', '91f701ba2',
        'adb64ff71', '2ec5b290f', '703885424', '26fc93eb7', '6619d81fc',
        '0ff32eb98', '70feb1494', '58e056e12', '1931ccfdd', '1702b5bf0',
        '58232a6fb', '963a49cdc', 'fc99f9426', '241f0f867', '5c6487af1',
        '62e59a501', 'f74e8f13d', 'fb49e4212', '190db8488', '324921c7b',
        'b43a7cfd5', '9306da53f', 'd6bb78916', 'fb0f5dbfe', '6eef030c1'
    ]

In [36]:
model_xgb = XGBRegressor(colsample_bytree=0.055, colsample_bylevel =0.5,
                    gamma=1.5, learning_rate=0.02, max_depth=32, 
                    objective='reg:linear',booster='gbtree',
                    min_child_weight=57, n_estimators=1000, reg_alpha=0, 
                    reg_lambda = 0,eval_metric = 'rmse', subsample=0.7, 
                    silent=1, n_jobs = -1, early_stopping_rounds = 14,
                    random_state =7, nthread = -1)

In [37]:
model_lgb = LGBMRegressor(
        objective='regression',
        num_leaves= 58,
        subsample= 0.6143,
        colsample_bytree= 0.6453,
        min_split_gain= np.power(10, -2.5988),
        reg_alpha= np.power(10, -2.2887),
        reg_lambda= np.power(10, 1.7570),
        min_child_weight= np.power(10, -0.1477),
        verbose= -1,
        seed= 3,
        boosting_type= 'gbdt',
        max_depth= -1,
        learning_rate= 0.05,
        metric= 'l2')

In [48]:
kf = KFold(n_splits=5, random_state=11, shuffle=True)
scores = []
for train_index, test_index in kf.split(train_feat):
    X_train, X_test = train_feat.iloc[train_index], train_feat.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    model_xgb.fit(X_train, y_train)
    model_lgb.fit(X_train, y_train)
    y_pred_xgb = model_xgb.predict(X_test)
    y_pred_lgb = model_lgb.predict(X_test)
    y_pred = (y_pred_xgb + y_pred_lgb)/2
    scores.append(rmsle(y_test, y_pred))

In [50]:
scores = np.array(scores)

In [51]:
f'{scores.mean()} +- {2*scores.std()}'

'1.3844977831108007 +- 0.052646289332316354'

In [47]:
f'{scores.mean()} +- {2*scores.std()}'

'1.3835389375246958 +- 0.07186779297049803'

In [49]:
model_xgb.fit(train_feat, target)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
       colsample_bytree=0.055, early_stopping_rounds=14,
       eval_metric='rmse', gamma=1.5, learning_rate=0.02, max_delta_step=0,
       max_depth=32, min_child_weight=57, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=-1, objective='reg:linear', random_state=7,
       reg_alpha=0, reg_lambda=0, scale_pos_weight=1, seed=None, silent=1,
       subsample=0.7)

In [51]:
model_lgb.fit(train_feat, target)

LGBMRegressor(boosting_type='gbdt', class_weight=None,
       colsample_bytree=0.6453, learning_rate=0.05, max_depth=-1,
       metric='l2', min_child_samples=20,
       min_child_weight=0.7117049722679207,
       min_split_gain=0.002518836627841738, n_estimators=100, n_jobs=-1,
       num_leaves=58, objective='regression', random_state=None,
       reg_alpha=0.005143988630287732, reg_lambda=57.14786366718669,
       seed=3, silent=True, subsample=0.6143, subsample_for_bin=200000,
       subsample_freq=0, verbose=-1)

In [53]:
y_pred_x = model_xgb.predict(test_df)
y_pred_l = model_lgb.predict(test_df)

In [67]:
y_pred_x = np.expm1(y_pred_x)
y_pred_l = np.expm1(y_pred_l)

In [68]:
ensemble = (y_pred_x + y_pred_l)/2

In [69]:
ensemble[:10]

array([2459822.92154659, 1907118.94677084, 2015402.57310257,
       5313431.46262712, 1485751.67793284, 2720220.22650466,
       3484125.8825333 , 2778330.75244127, 5440444.97474023,
       2792700.21784403])

In [70]:
sub = pd.DataFrame()
sub['ID'] = test_df.index
sub['target'] = ensemble
sub.to_csv('submission.csv',index=False)

In [71]:
sub.head()

Unnamed: 0,ID,target
0,000137c73,2459823.0
1,00021489f,1907119.0
2,0004d7953,2015403.0
3,00056a333,5313431.0
4,00056d8eb,1485752.0
