In [49]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score



## Define a stacking estimator class ##

In [2]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    """Transformer that prepends a wrapped estimator's predictions to the features.

    Intended for use inside a pipeline: the wrapped estimator is fitted on the
    incoming data, and ``transform`` returns the validated input with the
    estimator's outputs added as extra leading columns.

    Parameters
    ----------
    estimator : object
        Any object exposing ``fit(X, y, **fit_params)`` and ``predict(X)``.
        If it is a ``ClassifierMixin`` that also has ``predict_proba``, the
        class probabilities are added as features as well.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped estimator on ``(X, y)`` and return ``self``."""
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        """Return ``X`` augmented with the wrapped estimator's outputs.

        Column layout of the result, left to right:
        ``[predict(X) | predict_proba(X) (classifiers only) | X]``.
        """
        X = check_array(X)
        model = self.estimator

        # Only classifiers that expose probabilities contribute proba columns.
        exposes_proba = isinstance(model, ClassifierMixin) and hasattr(model, 'predict_proba')
        if exposes_proba:
            augmented = np.hstack((model.predict_proba(X), X))
        else:
            augmented = np.copy(X)

        # predict() yields a 1-D vector; reshape it into a single column
        # so it can be stacked in front of the other features.
        prediction_column = np.reshape(model.predict(X), (-1, 1))
        return np.hstack((prediction_column, augmented))
    


## Read train and test dataset ##

In [25]:
# Raw competition files; paths are relative to the notebook's working directory.
train_csv_path = '../input/train.csv'
test_csv_path = '../input/test.csv'

train_org = pd.read_csv(train_csv_path)
test_org = pd.read_csv(test_csv_path)

In [26]:
# Preview the first rows of the raw training frame (ID, target y, then the X* features).
train_org.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


## Data preprocessing 1 ##

In [27]:
# Convert categorical features (columns X0:X8) to integer codes using LabelEncoder.
# LabelEncoder assigns each distinct value a code from 0 to n_classes - 1.
# Each encoder is fitted on the train and test values together so that both
# frames share one mapping and test-only categories can still be transformed.

train_label = train_org.copy()
test_label = test_org.copy()

categorical_cols = [col for col in train_label.columns
                    if train_label[col].dtypes == 'object']

for col in categorical_cols:
    train_values = list(train_label[col].values)
    test_values = list(test_label[col].values)
    encoder = LabelEncoder().fit(train_values + test_values)
    train_label[col] = encoder.transform(train_values)
    test_label[col] = encoder.transform(test_values)

train_label.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,37,23,20,0,3,27,9,14,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,37,21,22,4,3,31,11,14,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,24,24,38,2,3,30,9,23,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,24,21,38,5,3,30,11,4,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,24,23,38,5,3,14,3,13,...,0,0,0,0,0,0,0,0,0,0


## Data preprocessing 2 ##

In [54]:
# Convert categorical features (columns X0:X8) to dummy variables using one-hot encoding.
# Train and test are encoded together so both frames get the same dummy columns.

# Split point taken from the data instead of a hard-coded row count, so the
# cell keeps working if the training file changes.
n_train = len(train_org)

temp = pd.concat([train_org, test_org])
temp = pd.get_dummies(temp)

# .copy() detaches the training frame from `temp`, so later edits to it
# neither warn nor write through to the combined frame.
train_dummy = temp.iloc[:n_train].copy()

# The training set has a 'y' column and the test set does not, so the
# concatenation creates 'y' for the test rows filled with missing values.
# Drop that dummy column when rebuilding the test frame. A non-mutating drop
# is used because an in-place drop on an iloc slice modifies a view of `temp`.
test_dummy = temp.iloc[n_train:].drop('y', axis=1)

len(train_dummy.columns), len(test_dummy.columns)

(581, 580)

In [52]:
# Preview the first rows of the one-hot encoded training frame.
train_dummy.head()

Unnamed: 0,ID,X10,X100,X101,X102,X103,X104,X105,X106,X107,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,9,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Magic features ##

## Principal Component Analysis ##

In [59]:
# PCA on the label-encoded training frame with the target 'y' removed.
# NOTE(review): 'ID' is still among the inputs. Its values span a far wider
# range than the other columns shown in the preview, so the ~0.9999 ratio
# reported for the first component probably reflects the ID column alone.
# Confirm, and consider dropping 'ID' and/or scaling before PCA.
n_comp = 100
label_features = train_label.drop(["y"], axis=1)
pca = PCA(n_components=n_comp, random_state=0)
pca_results_train = pca.fit_transform(label_features)

In [60]:
# Share of total variance explained by each component (label-encoded features).
print (pca.explained_variance_ratio_)

[  9.99904441e-01   4.13130289e-05   2.19765863e-05   1.10193653e-05
   8.26702521e-06   7.62663194e-06   1.42480683e-06   6.67174103e-07
   3.88754081e-07   2.62818901e-07   2.16384043e-07   2.11922104e-07
   1.82366088e-07   1.49327561e-07   1.32521195e-07   1.15136912e-07
   9.27510747e-08   8.58563786e-08   8.01276864e-08   7.04005004e-08
   6.34985956e-08   5.78088419e-08   5.57237058e-08   5.23667240e-08
   4.59750995e-08   4.33226546e-08   3.87329541e-08   3.70671641e-08
   3.38158788e-08   3.30567259e-08   3.18505129e-08   2.95970906e-08
   2.81648193e-08   2.68042845e-08   2.53418096e-08   2.39548190e-08
   2.22288938e-08   1.98302079e-08   1.92309281e-08   1.84941518e-08
   1.75852712e-08   1.66850239e-08   1.59845384e-08   1.57456235e-08
   1.56197611e-08   1.51236570e-08   1.36981701e-08   1.33881360e-08
   1.28714611e-08   1.26702511e-08   1.23608573e-08   1.19063094e-08
   1.15979524e-08   1.12774658e-08   1.06840145e-08   1.02724862e-08
   9.65224332e-09   9.57709662e-09

In [55]:
# PCA on the one-hot encoded training frame with the target 'y' removed.
# This cell rebinds n_comp, pca and pca_results_train, replacing the values
# produced by the label-encoded PCA cell.
# NOTE(review): 'ID' is still among the inputs and probably dominates the
# first component here as well. Confirm before relying on these components.
n_comp = 100
dummy_features = train_dummy.drop(["y"], axis=1)
pca = PCA(n_components=n_comp, random_state=0)
pca_results_train = pca.fit_transform(dummy_features)
#pca_results_test = pca.transform(test)

In [58]:
# Share of total variance explained by each component (one-hot encoded features).
print (pca.explained_variance_ratio_)

[  9.99995165e-01   5.49264529e-07   3.78110298e-07   3.56692281e-07
   2.83501818e-07   2.39659353e-07   2.03258733e-07   1.60171139e-07
   1.36980434e-07   1.21803842e-07   1.04326616e-07   9.96076852e-08
   8.36448133e-08   7.29709057e-08   6.94866327e-08   6.70906353e-08
   6.28543082e-08   5.83899802e-08   5.24797052e-08   4.77214521e-08
   4.41103928e-08   4.27721812e-08   4.08553680e-08   3.94687398e-08
   3.74686362e-08   3.60459182e-08   3.37613568e-08   3.35192909e-08
   3.18376414e-08   3.07946877e-08   3.05054543e-08   2.79396844e-08
   2.66801880e-08   2.45814281e-08   2.33425062e-08   2.29827613e-08
   2.14132603e-08   2.09425133e-08   2.02907014e-08   1.98762468e-08
   1.92989466e-08   1.83700477e-08   1.83100208e-08   1.80186827e-08
   1.69906541e-08   1.68266114e-08   1.61996124e-08   1.55282489e-08
   1.51912792e-08   1.44586887e-08   1.41415892e-08   1.33378191e-08
   1.30928544e-08   1.29526570e-08   1.24308948e-08   1.22109410e-08
   1.18257216e-08   1.16250245e-08

## Cross Validation and Grid Search ##