In [1]:
import os
import sys
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics, ensemble
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

In [2]:
data_dir = 'data/'
print("Data Files are: ", os.listdir(data_dir))
train = pd.read_csv(data_dir + 'train.csv')
test = pd.read_csv(data_dir + 'test.csv')
print("Train DataFrame Shape: ", train.shape)
print("Test DataFrame Shape: ", test.shape)
train.head()

Data Files are:  ['sample_submission.csv.zip', 'test.csv', 'test.csv.zip', 'train.csv', 'train.csv.zip']
Train DataFrame Shape:  (4459, 4993)
Test DataFrame Shape:  (49342, 4992)


Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [3]:
RMSLE_important = ['87ffda550', 'c928b4b74', 'ba4ceabc5', '1c71183bb', '9d5c7cb94',
                   '2e103d632', '2288333b4', 'f514fdb2e', 'e1d0e11b5', '0e1f6696a',
                   '63c094ba4', '1184df5c2', 'cbbc9c431', 'bc70cbc26', 'ced6a7e91',
                   '8e4d0fe45', 'e222309b0', 'e078302ef', '50e4f96cf', '0d51722ca']
lgbm_important = ['e222309b0', 'db3839ab0', 'edc84139a', 'a6b6bc34a', '0d51722ca',
                  'b30e932ba', 'fb387ea33', 'f6eba969e', '2288333b4', '342e7eb03',
                  '9306da53f', '26ab20ff9', '0c9462c08', '2e103d632', 'ced6a7e91',
                  '8e4d0fe45', '041c5d0c9', '87ffda550', '6c0e0801a', 'c928b4b74']
chosen_features = list(set(RMSLE_important + lgbm_important))

In [4]:
x_train = train[chosen_features]
y_train = np.log1p(train['target'])
x_test = test[chosen_features]

In [5]:
x_train.head()

Unnamed: 0,b30e932ba,6c0e0801a,cbbc9c431,50e4f96cf,2e103d632,ced6a7e91,2288333b4,87ffda550,e222309b0,ba4ceabc5,...,e078302ef,fb387ea33,342e7eb03,1184df5c2,f514fdb2e,63c094ba4,a6b6bc34a,0c9462c08,9306da53f,f6eba969e
0,0.0,0.0,0.0,0.0,700000.0,0.0,0.0,1300000.0,0.0,0.0,...,1600000.0,0.0,0.0,0.0,0.0,7100000.0,0.0,0.0,0.0,0.0
1,0.0,2225000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4000000.0,5400000.0,1180000.0,0.0


### Preprocess

In [6]:
# Add some stochastic features to dataframe
# calculate mean, sum, var, std, skew, kurtosis ignoring 0s
tmp_train = x_train[x_train != 0]
tmp_test = x_test[x_test != 0]

pd.options.mode.chained_assignment = None
x_train['mean'], x_test['mean'] = tmp_train[chosen_features].mean(axis=1), tmp_test[chosen_features].mean(axis=1)
x_train['sum'], x_test['sum'] = tmp_train[chosen_features].sum(axis=1), tmp_test[chosen_features].sum(axis=1)
x_train['var'], x_test['var'] = tmp_train[chosen_features].var(axis=1), tmp_test[chosen_features].var(axis=1)
x_train['std'], x_test['std'] = tmp_train[chosen_features].std(axis=1), tmp_test[chosen_features].std(axis=1)
x_train['skew'], x_test['skew'] = tmp_train[chosen_features].skew(axis=1), tmp_test[chosen_features].skew(axis=1)
x_train['kurtosis'], x_test['kurtosis'] = tmp_train[chosen_features].kurtosis(axis=1), tmp_test[chosen_features].kurtosis(axis=1)

del(tmp_train)
del(tmp_test)
x_train, x_test = x_train.fillna(0), x_test.fillna(0)
x_train.head()

Unnamed: 0,b30e932ba,6c0e0801a,cbbc9c431,50e4f96cf,2e103d632,ced6a7e91,2288333b4,87ffda550,e222309b0,ba4ceabc5,...,a6b6bc34a,0c9462c08,9306da53f,f6eba969e,mean,sum,var,std,skew,kurtosis
0,0.0,0.0,0.0,0.0,700000.0,0.0,0.0,1300000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2983333.0,17900000.0,9133667000000.0,3022196.0,0.915229,-1.807605
1,0.0,2225000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2679167.0,5358333.34,412534700000.0,642288.7,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4000000.0,4000000.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2000000.0,2000000.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4000000.0,5400000.0,1180000.0,0.0,3526667.0,10580000.0,4620133000000.0,2149450.0,-0.942897,0.0


### Train LightGBM Model

In [7]:
def lgb_model(x_train, x_test, y_train, y_test):
    params = {
        "objective" : "regression",
        "boosting" : "gbdt",
        "metric" : "rmse",
        "num_leaves" : 36,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        "bagging_frequency" : 6,
        "bagging_seed" : 50,
        "verbosity" : -1,
        "max_depth" : 7
    }
    train = lgb.Dataset(x_train, label=y_train)
    test = lgb.Dataset(x_test, label=y_test)
    model = lgb.train(params, train, 2000, valid_sets=[train, test], early_stopping_rounds=150, verbose_eval=200)
    return model

In [8]:
# Using KFold to train multiple models and average the result
num_kfold = 5
kf = model_selection.KFold(n_splits=num_kfold, shuffle=True, random_state=666)

target = pd.DataFrame(columns=['ID', 'target'])
target['ID'] = test['ID']
target['target'] = 0.0
for t_idx, v_idx in kf.split(x_train):
    t_x, v_x = x_train.loc[t_idx,:], x_train.loc[v_idx,:]
    t_y, v_y = y_train[t_idx], y_train[v_idx]
    model = lgb_model(t_x, v_x, t_y, v_y)
    target['target'] += np.expm1(model.predict(x_test))

target['target'] /= float(num_kfold)
target.to_csv('submission1.csv', index=False)

Training until validation scores don't improve for 150 rounds.
[200]	training's rmse: 1.43727	valid_1's rmse: 1.50749
[400]	training's rmse: 1.33529	valid_1's rmse: 1.47223
[600]	training's rmse: 1.2866	valid_1's rmse: 1.46902
Early stopping, best iteration is:
[532]	training's rmse: 1.29896	valid_1's rmse: 1.4683
Training until validation scores don't improve for 150 rounds.
[200]	training's rmse: 1.44484	valid_1's rmse: 1.49259
[400]	training's rmse: 1.34567	valid_1's rmse: 1.43977
[600]	training's rmse: 1.2969	valid_1's rmse: 1.42807
[800]	training's rmse: 1.26678	valid_1's rmse: 1.42374
Early stopping, best iteration is:
[803]	training's rmse: 1.26635	valid_1's rmse: 1.42357
Training until validation scores don't improve for 150 rounds.
[200]	training's rmse: 1.43579	valid_1's rmse: 1.52045
[400]	training's rmse: 1.33198	valid_1's rmse: 1.48073
[600]	training's rmse: 1.28327	valid_1's rmse: 1.47589
[800]	training's rmse: 1.25567	valid_1's rmse: 1.47636
Early stopping, best iteratio

### Train XGB Model

In [9]:
def xgb_model(x_train, x_test, y_train, y_test):
    params = {'objective': 'reg:linear',
              'eval_metric': 'rmse',
              'eta': 0.001,
              'max_depth': 8, 
              'subsample': 0.6,
              'colsample_bytree': 0.7,
              'alpha':0.001,
              'random_state': 42,
              'silent': True}
    train = xgb.DMatrix(x_train, y_train)
    test = xgb.DMatrix(x_test, y_test)
    watchlist = [(train, 'train'), (test, 'valid')]
    model = xgb.train(params, train, 8000, watchlist, maximize=False, early_stopping_rounds = 100, verbose_eval=1000)
    return model

In [10]:
# Using KFold to train multiple models and average the result
num_kfold = 5
kf = model_selection.KFold(n_splits=num_kfold, shuffle=True, random_state=666)

target = pd.DataFrame(columns=['ID', 'target'])
target['ID'] = test['ID']
target['target'] = 0.0
for t_idx, v_idx in kf.split(x_train):
    t_x, v_x = x_train.loc[t_idx,:], x_train.loc[v_idx,:]
    t_y, v_y = y_train[t_idx], y_train[v_idx]
    model = xgb_model(t_x, v_x, t_y, v_y)
    target['target'] += np.expm1(model.predict(xgb.DMatrix(x_test)))

target['target'] /= float(num_kfold)
target.to_csv('submission2.csv', index=False)

[0]	train-rmse:14.0784	valid-rmse:14.1135
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[1000]	train-rmse:5.37153	valid-rmse:5.39752
[2000]	train-rmse:2.35059	valid-rmse:2.43431
[3000]	train-rmse:1.45239	valid-rmse:1.63873
[4000]	train-rmse:1.22808	valid-rmse:1.492
[5000]	train-rmse:1.15109	valid-rmse:1.47053
Stopping. Best iteration:
[5620]	train-rmse:1.12041	valid-rmse:1.46884

[0]	train-rmse:14.0914	valid-rmse:14.0618
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[1000]	train-rmse:5.37882	valid-rmse:5.3622
[2000]	train-rmse:2.35682	valid-rmse:2.40058
[3000]	train-rmse:1.45963	valid-rmse:1.60354
[4000]	train-rmse:1.23325	valid-rmse:1.45523
[5000]	train-rmse:1.1548	valid-rmse:1.43299
Stopping. Best iteration:
[5680]	train-rmse:1.12085	valid-rmse:1.43043

[0]	train-rmse:14.0977	valid-rmse:

### Train CatBoost Model

In [11]:
def cat_model(x_train, x_test, y_train, y_test):
    model = CatBoostRegressor(iterations=1000,
                              learning_rate=0.02,
                              depth=8,
                              eval_metric='RMSE',
                              random_seed = 66,
                              bagging_temperature = 0.2,
                              od_type='Iter',
                              metric_period = 50,
                              od_wait=20)
    model.fit(x_train, y_train, eval_set=(x_test, y_test), use_best_model=True, verbose=True)
    return model

In [12]:
# Using KFold to train multiple models and average the result
num_kfold = 5
kf = model_selection.KFold(n_splits=num_kfold, shuffle=True, random_state=666)

target = pd.DataFrame(columns=['ID', 'target'])
target['ID'] = test['ID']
target['target'] = 0
for t_idx, v_idx in kf.split(x_train):
    t_x, v_x = x_train.loc[t_idx,:], x_train.loc[v_idx,:]
    t_y, v_y = y_train[t_idx], y_train[v_idx]
    model = lgb_model(t_x, v_x, t_y, v_y)
    target['target'] += np.expm1(model.predict(x_test))

target['target'] /= float(num_kfold)
target.to_csv('submission3.csv', index=False)

Training until validation scores don't improve for 150 rounds.
[200]	training's rmse: 1.43727	valid_1's rmse: 1.50749
[400]	training's rmse: 1.33529	valid_1's rmse: 1.47223
[600]	training's rmse: 1.2866	valid_1's rmse: 1.46902
Early stopping, best iteration is:
[532]	training's rmse: 1.29896	valid_1's rmse: 1.4683
Training until validation scores don't improve for 150 rounds.
[200]	training's rmse: 1.44484	valid_1's rmse: 1.49259
[400]	training's rmse: 1.34567	valid_1's rmse: 1.43977
[600]	training's rmse: 1.2969	valid_1's rmse: 1.42807
[800]	training's rmse: 1.26678	valid_1's rmse: 1.42374
Early stopping, best iteration is:
[803]	training's rmse: 1.26635	valid_1's rmse: 1.42357
Training until validation scores don't improve for 150 rounds.
[200]	training's rmse: 1.43579	valid_1's rmse: 1.52045
[400]	training's rmse: 1.33198	valid_1's rmse: 1.48073
[600]	training's rmse: 1.28327	valid_1's rmse: 1.47589
[800]	training's rmse: 1.25567	valid_1's rmse: 1.47636
Early stopping, best iteratio

### Average Models' Prediction

In [13]:
lgb_data = pd.read_csv('submission1.csv')
xgb_data = pd.read_csv('submission2.csv')
cat_data = pd.read_csv('submission3.csv')

target = pd.DataFrame(columns=['ID', 'target'])
target['ID'] = lgb_data['ID']
target['target'] = (lgb_data['target'] + xgb_data['target'] + cat_data['target']) / 3
target.to_csv('submission.csv', index=False)

In [14]:
target.head()

Unnamed: 0,ID,target
0,000137c73,1985677.0
1,00021489f,1985677.0
2,0004d7953,3277046.0
3,00056a333,7564810.0
4,00056d8eb,1985677.0


In [15]:
lgb_data.head()

Unnamed: 0,ID,target
0,000137c73,2004787.0
1,00021489f,2004787.0
2,0004d7953,3185045.0
3,00056a333,7880434.0
4,00056d8eb,2004787.0


In [16]:
xgb_data.head()

Unnamed: 0,ID,target
0,000137c73,1947455.725
1,00021489f,1947455.725
2,0004d7953,3461048.95
3,00056a333,6933561.8
4,00056d8eb,1947455.725


In [17]:
cat_data.head()

Unnamed: 0,ID,target
0,000137c73,2004787.0
1,00021489f,2004787.0
2,0004d7953,3185045.0
3,00056a333,7880434.0
4,00056d8eb,2004787.0
