In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/santander-value-prediction-challenge/sample_submission.csv
/kaggle/input/santander-value-prediction-challenge/test.csv
/kaggle/input/santander-value-prediction-challenge/train.csv


In [2]:
filepath="/kaggle/input/santander-value-prediction-challenge/"

## Import Libraries

In [3]:
%matplotlib inline

import gc
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

In [4]:
traindf = pd.read_csv(filepath+"train.csv")
testdf = pd.read_csv(filepath+"test.csv")
print(traindf.head())
print("-------------------------")
print(testdf.head())

          ID      target  48df886f9  0deb4b6a8  34b15f335  a8cb14b00  \
0  000d6aaf2  38000000.0        0.0          0        0.0          0   
1  000fbd867    600000.0        0.0          0        0.0          0   
2  0027d6b71  10000000.0        0.0          0        0.0          0   
3  0028cbf45   2000000.0        0.0          0        0.0          0   
4  002a68644  14400000.0        0.0          0        0.0          0   

   2f0771a37  30347e683  d08d1fbe3  6ee66e115  ...  3ecc09859  9281abeea  \
0          0          0          0          0  ...        0.0        0.0   
1          0          0          0          0  ...        0.0        0.0   
2          0          0          0          0  ...        0.0        0.0   
3          0          0          0          0  ...        0.0        0.0   
4          0          0          0          0  ...        0.0        0.0   

   8675bec0b  3a13ed79a  f677d4d13  71b203550  137efaa80  fb36b89d9  \
0        0.0          0          0     

In [5]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


In [6]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB


## Check for Missing values

In [7]:
traindf.columns[traindf.isnull().sum() != 0].size

0

In [8]:
testdf.columns[testdf.isnull().sum() != 0].size

0

## Check and remove constant features

In [9]:
colstoremove=[]
for col in traindf.columns:
    if col != 'ID' and col != 'target' :
        if traindf[col].std() == 0:
            colstoremove.append(col)

traindf.drop(colstoremove, axis=1, inplace=True)
testdf.drop(colstoremove, axis=1, inplace=True)

print("Total constant columns removed : ", len(colstoremove))

Total constant columns removed :  256


## Remove duplicate features

In [10]:
%%time

def duplicate_columns(df):
    groups = df.columns.to_series().groupby(df.dtypes).groups
    dups=[]
    
    i=1
    for t,v in groups.items():
        print("i=",i, "----->")
        cs = df[v].columns
        vs = df[v]
        lcs = len(cs)
        #print(vs)
        i += 1
        print("lcs=",lcs)    
        for i in range(lcs):
            ia = vs.iloc[:,i].values
            for j in range(i+1, lcs):
                ja = vs.iloc[:,j].values
                if np.array_equal(ia, ja):
                    dups.append(cs[i])
                    break
    return dups

dupcols = duplicate_columns(traindf)
print(dupcols)

i= 1 ----->
lcs= 2891
i= 2890 ----->
lcs= 1845
i= 1844 ----->
lcs= 1
['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']
CPU times: user 6min 49s, sys: 123 ms, total: 6min 49s
Wall time: 6min 49s


In [11]:
traindf.drop(dupcols, axis=1, inplace=True)
testdf.drop(dupcols, axis=1, inplace=True)
print("Removed duplicated columns: ",dupcols)

Removed duplicated columns:  ['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']


## Drop Sparse Data

In [12]:
def drop_sparse(train, test):
    flist = [x for x in train.columns if not x in ['ID', 'target']]
    for f in flist:
        if len(np.unique(train[f]))<2:
            train.drop(f, axis=1, inplace=True)
            test.drop(f, axis=1, inplace=True)
    return train, test

In [13]:
%%time
traindf , testdf = drop_sparse(traindf, testdf)

CPU times: user 681 ms, sys: 13 µs, total: 681 ms
Wall time: 680 ms


In [14]:
gc.collect()

print(traindf.shape)
print(testdf.shape)

(4459, 4732)
(49342, 4731)


## Prepare Train and Test data for Model

In [16]:
xtrain = traindf.drop(['ID', 'target'] ,  axis=1)
ytrain = np.log1p(traindf['target'].values)

xtest = testdf.drop(['ID'], axis=1)

print(xtrain.shape, ytrain.shape)
print(xtest.shape)

(4459, 4730) (4459,)
(49342, 4730)


In [17]:
##  split train data into train and validation

xtrain, xval, ytrain, yval = model_selection.train_test_split(xtrain, ytrain, test_size=0.2, random_state=42)

In [18]:
print(xtrain.shape, ytrain.shape)
print(xval.shape, yval.shape)

(3567, 4730) (3567,)
(892, 4730) (892,)


## LightGBM

In [20]:
def run_lgb(xtrain, ytrain, xval, yval, xtest):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.004,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed" : 42
    }
    
    lgtrain = lgb.Dataset(xtrain, label=ytrain)
    lgval = lgb.Dataset(xval, label=yval)
    evals_result={}
    model = lgb.train(params, lgtrain, 5000, valid_sets = [lgtrain, lgval], early_stopping_rounds=100, 
                     verbose_eval=150, evals_result=evals_result)
    
    pred_test_y = np.expm1(model.predict(xtest, num_iteration=model.best_iteration))
    
    return pred_test_y, model, evals_result

In [21]:
pred_test_y , model, evals_result = run_lgb(xtrain, ytrain, xval, yval, xtest)

print("LightGBM model training completed..")

Training until validation scores don't improve for 100 rounds
[150]	training's rmse: 1.5082	valid_1's rmse: 1.53919
[300]	training's rmse: 1.34436	valid_1's rmse: 1.46593
[450]	training's rmse: 1.23324	valid_1's rmse: 1.43393
[600]	training's rmse: 1.14931	valid_1's rmse: 1.41848
[750]	training's rmse: 1.08371	valid_1's rmse: 1.41315
[900]	training's rmse: 1.03011	valid_1's rmse: 1.41131
Early stopping, best iteration is:
[934]	training's rmse: 1.01913	valid_1's rmse: 1.41118
LightGBM model training completed..


In [23]:
## feature importance

print("Feature Importance : ")
gain = model.feature_importance('gain')
print("gain : ", gain)
featureimp = pd.DataFrame({'feature': model.feature_name(), 'split':model.feature_importance('split'), 
                          'gain': 100*gain/gain.sum()}).sort_values(by='gain', ascending=False)
print(featureimp[:50])

Feature Importance : 
gain :  [0. 0. 0. ... 0. 0. 0.]
        feature  split      gain
4130  f190486d6    752  8.880463
2375  58e2e02e6    702  5.344246
3465  eeb9cd3aa    677  4.358434
4020  15ace8c9f    542  3.131752
2614  9fd594eec    360  2.882999
8     20aa07010    414  2.151287
3571  58232a6fb    359  1.414442
834   6eef030c1    324  1.369470
1457  b43a7cfd5    406  1.252368
3661  491b9ee45    284  1.047073
2687  fb0f5dbfe    416  1.002580
1482  024c577b9    266  1.001906
4508  c47340d97    326  0.889082
2079  58e056e12    343  0.886709
3867  2288333b4    187  0.857178
4343  1702b5bf0    289  0.853304
566   66ace2992    274  0.833897
4185  f74e8f13d    362  0.820876
4028  5c6487af1    207  0.801488
3791  ed8ff54b5    170  0.792289
828   6786ea46d    179  0.776514
3722  d6bb78916    292  0.747566
3220  ced6a7e91    242  0.703042
3886  50e4f96cf    141  0.692474
863   fc99f9426    234  0.673585
34    87ffda550    172  0.628633
1378  6cf7866c1    164  0.627456
3983  45f6d00da    237

## XGBoost Model

In [24]:
def run_xgb(xtrain, ytrain, xval, yval, xtest):
    params={
        "objective" : "reg:linear",
        "eval_metric" : "rmse" ,
        "eta" : 0.001,
        "max_depth" : 10,
        "subsample" : 0.6,
        "colsample_bytree" : 0.6,
        "alpha" : 0.001,
        "random_state" : 42,
        "silent" : True
    }
    
    trdata = xgb.DMatrix(xtrain, ytrain)
    valdata = xgb.DMatrix(xval, yval)
    
    watchlist = [(trdata, 'train'), (valdata, 'valid')]
    
    model_xgb = xgb.train(params, trdata, 2000, watchlist, maximize=False, early_stopping_rounds=100, 
                         verbose_eval=100)
    
    dtest = xgb.DMatrix(xtest)
    
    xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
    
    return xgb_pred_y, model_xgb

In [25]:
xgb_pred_y, model_xgb = run_xgb(xtrain, ytrain, xval, yval, xtest)

[0]	train-rmse:14.08765	valid-rmse:14.07678
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:12.76867	valid-rmse:12.75652
[200]	train-rmse:11.57727	valid-rmse:11.56405
[300]	train-rmse:10.50042	valid-rmse:10.48620
[400]	train-rmse:9.52823	valid-rmse:9.51352
[500]	train-rmse:8.65078	valid-rmse:8.63669
[600]	train-rmse:7.85866	valid-rmse:7.84577
[700]	train-rmse:7.14407	valid-rmse:7.13261
[800]	train-rmse:6.49890	valid-rmse:6.49019
[900]	train-rmse:5.91717	valid-rmse:5.91164
[1000]	train-rmse:5.39212	valid-rmse:5.39099
[1100]	train-rmse:4.91932	valid-rmse:4.92403
[1200]	train-rmse:4.49326	valid-rmse:4.50480
[1300]	train-rmse:4.10994	valid-rmse:4.12945
[1400]	train-rmse:3.76495	valid-rmse:3.79268
[1500]	train-rmse:3.45464	valid-rmse:3.49240
[1600]	train-rmse:3.17614	valid-rmse:3.22519
[1700]	train-rmse:2.92613	valid-rmse:2.98700
[1800]	train-rmse:2.70266	valid-rmse:2.77625
[19

## Catboost

In [27]:
cb_model = CatBoostRegressor(iterations = 500,
                            learning_rate=0.05,
                            depth=10,
                            eval_metric='RMSE',
                            random_seed=42,
                            bagging_temperature=0.2,
                            od_type='Iter',
                            metric_period=50,
                            od_wait=20
                            )

In [29]:
cb_model.fit(xtrain, ytrain, eval_set=(xval, yval), use_best_model=True, verbose=50)



0:	learn: 1.7518683	test: 1.6878429	best: 1.6878429 (0)	total: 1.92s	remaining: 16m
50:	learn: 1.4789181	test: 1.5197196	best: 1.5197196 (50)	total: 1m 26s	remaining: 12m 37s
100:	learn: 1.3788775	test: 1.4780503	best: 1.4780503 (100)	total: 2m 50s	remaining: 11m 13s
150:	learn: 1.3203081	test: 1.4647232	best: 1.4647232 (150)	total: 4m 16s	remaining: 9m 52s
200:	learn: 1.2546056	test: 1.4505032	best: 1.4504536 (198)	total: 5m 41s	remaining: 8m 27s
250:	learn: 1.1817435	test: 1.4381513	best: 1.4381513 (250)	total: 7m 4s	remaining: 7m
300:	learn: 1.1208241	test: 1.4320748	best: 1.4314980 (297)	total: 8m 28s	remaining: 5m 36s
350:	learn: 1.0770301	test: 1.4298988	best: 1.4297384 (347)	total: 9m 52s	remaining: 4m 11s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.428228031
bestIteration = 376

Shrink model to first 377 iterations.


<catboost.core.CatBoostRegressor at 0x7f132eb35b90>

In [30]:
pred_test_cat = np.expm1(cb_model.predict(xtest))

## Combine Predictions

In [35]:
sub_lgb=pd.DataFrame()
sub_lgb['target']=pred_test_y

sub_xgb=pd.DataFrame()
sub_xgb['target']=xgb_pred_y

sub_cat = pd.DataFrame()
sub_cat['target'] = pred_test_cat

sub = pd.read_csv(filepath+'sample_submission.csv')
sub.head()

Unnamed: 0,ID,target
0,000137c73,5944923.0
1,00021489f,5944923.0
2,0004d7953,5944923.0
3,00056a333,5944923.0
4,00056d8eb,5944923.0


In [38]:
sub['target']=sub_lgb['target']*0.5 + sub_xgb['target']*0.3 + sub_cat['target']*0.2

In [39]:
sub.head()

Unnamed: 0,ID,target
0,000137c73,1309503.0
1,00021489f,1346438.0
2,0004d7953,1978322.0
3,00056a333,3799105.0
4,00056d8eb,1447208.0


In [40]:
sub.to_csv('/submission.csv', index=False)