In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score

## Define a stacking estimator class ##

In [2]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    """Transformer that prepends a wrapped estimator's predictions to X.

    The output of ``transform`` keeps every original feature column and adds,
    in front of them, the wrapped estimator's ``predict`` output (one column)
    and, for classifiers exposing ``predict_proba``, the class probabilities.
    """

    def __init__(self, estimator):
        # The wrapped estimator is stored as-is; it is fitted in ``fit``.
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped estimator on (X, y) and return this transformer."""
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        """Return X with the estimator's predictions stacked on the left.

        Column layout of the result:
          * classifier with ``predict_proba``: [predict | predict_proba | X]
          * any other estimator:               [predict | X]
        """
        X = check_array(X)
        model = self.estimator

        exposes_probabilities = (
            issubclass(model.__class__, ClassifierMixin)
            and hasattr(model, 'predict_proba')
        )
        if exposes_probabilities:
            # Class probabilities go directly in front of the raw features.
            passthrough = np.hstack((model.predict_proba(X), X))
        else:
            passthrough = np.copy(X)

        # predict() is reshaped into a single column so it can be hstack-ed.
        prediction_column = np.reshape(model.predict(X), (-1, 1))
        return np.hstack((prediction_column, passthrough))
    


## Read train and test dataset ##

In [3]:
# Load the raw train and test tables from the ../input directory.
train_org, test_org = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [4]:
# Preview the first rows of the raw training table.
train_org.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


## Data preprocessing 1 ##

In [5]:
# Label-encode the string-valued (object dtype) columns of train and test.
# LabelEncoder maps every distinct category to an integer in 0 .. n_classes - 1.
# Each encoder is fitted on the train and test values together so that a
# category occurring in only one of the two files still receives a code.

train_label, test_label = train_org.copy(), test_org.copy()
for c in train_label.columns:
    # Numeric columns (ID, y, binary flags) are left untouched.
    if train_label[c].dtypes != 'object':
        continue
    lbe = LabelEncoder().fit(list(train_label[c].values) + list(test_label[c].values))
    train_label[c] = lbe.transform(list(train_label[c].values))
    test_label[c] = lbe.transform(list(test_label[c].values))
train_label.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,37,23,20,0,3,27,9,14,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,37,21,22,4,3,31,11,14,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,24,24,38,2,3,30,9,23,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,24,21,38,5,3,30,11,4,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,24,23,38,5,3,14,3,13,...,0,0,0,0,0,0,0,0,0,0


## Data preprocessing 2 ##

In [6]:
# Convert categorical features (columns X0:X8) to dummy variables using one-hot encoding.
# Train and test are encoded together so both frames end up with the same dummy columns.

# Row position where the test rows start in the concatenated frame
# (derived from the data instead of hard-coding the training row count).
n_train = len(train_org)

temp = pd.concat([train_org, test_org])
temp = pd.get_dummies(temp)
train_dummy = temp.iloc[:n_train]

# The train data has a 'y' column and the test data does not, so the concatenation
# fills 'y' with NaN for every test row. Remove that dummy column when rebuilding
# the test set. A non in-place drop returns a new frame rather than mutating a
# slice of `temp`, which avoids pandas' SettingWithCopy hazard.
test_dummy = temp.iloc[n_train:].drop('y', axis=1)

len(train_dummy.columns), len(test_dummy.columns)

(581, 580)

In [7]:
# Preview the first rows of the one-hot encoded training table.
train_dummy.head()

Unnamed: 0,ID,X10,X100,X101,X102,X103,X104,X105,X106,X107,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,9,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Assessing feature importance with random forests ##

In [8]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [9]:
# Fit a random forest on the one-hot encoded training data and list the
# features ranked by the forest's feature_importances_.

# Feature matrix: every column except the row identifier and the target.
rf_features = train_dummy.drop(['ID','y'], axis=1)

# random_state is fixed so that the ranking is reproducible between runs.
rf = RandomForestRegressor(max_features=100, max_depth=1000, random_state=0)
rf.fit(rf_features, train_dummy['y'])

# (importance rounded to 4 decimals, feature name) pairs, most important first.
# print(...) with a single argument works on both Python 2 and Python 3.
print(sorted(
    ((round(score, 4), name)
     for score, name in zip(rf.feature_importances_, rf_features.columns)),
    reverse=True,
))

[(0.1455, 'X127'), (0.0846, 'X261'), (0.0592, 'X314'), (0.0342, 'X263'), (0.0288, 'X118'), (0.0271, 'X232'), (0.0234, 'X315'), (0.0229, 'X0_az'), (0.0221, 'X316'), (0.0138, 'X162'), (0.0127, 'X311'), (0.0126, 'X272'), (0.0124, 'X166'), (0.0086, 'X54'), (0.0082, 'X324'), (0.0081, 'X6_g'), (0.008, 'X2_ai'), (0.0077, 'X5_ag'), (0.0073, 'X178'), (0.0071, 'X119'), (0.0068, 'X3_c'), (0.0067, 'X273'), (0.0065, 'X250'), (0.0065, 'X115'), (0.006, 'X6_j'), (0.0056, 'X234'), (0.0055, 'X275'), (0.0054, 'X8_g'), (0.0052, 'X6_i'), (0.0051, 'X5_m'), (0.0051, 'X313'), (0.005, 'X158'), (0.0049, 'X348'), (0.0047, 'X5_ab'), (0.0046, 'X5_p'), (0.0046, 'X5_n'), (0.0045, 'X27'), (0.0044, 'X136'), (0.0042, 'X5_q'), (0.0041, 'X355'), (0.0041, 'X0_y'), (0.004, 'X8_t'), (0.0035, 'X5_r'), (0.0034, 'X6_h'), (0.0032, 'X5_v'), (0.0032, 'X5_af'), (0.0032, 'X1_r'), (0.0031, 'X6_a'), (0.0031, 'X354'), (0.003, 'X5_k'), (0.003, 'X374'), (0.0029, 'X6_d'), (0.0029, 'X5_ae'), (0.0029, 'X58'), (0.0029, 'X267'), (0.0028, 'X5

## Magic features ##

In [10]:
#

## Principal Component Analysis ##

In [11]:
# PCA on the label-encoded features.
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction of the variance (here 98%).
n_comp = 100   # not referenced by the PCA in this cell, which is driven by `cutoff`
cutoff = 0.98

label_train_features = train_label.drop(["ID", "y"], axis=1)
label_test_features = test_label.drop(["ID"], axis=1)

pca_label = PCA(n_components=cutoff, random_state=0)
# Fit on the training features only, then project the test features with the same components.
pca_label_train = pca_label.fit_transform(label_train_features)
pca_label_test = pca_label.transform(label_test_features)

In [12]:
# Report how many components the label-encoded PCA kept and the share of
# variance explained by each one.
# %-formatting yields the same single-line text on Python 2 and Python 3
# (print with two comma-separated arguments prints a tuple repr on Python 2).
print("Number of components: %d" % len(pca_label.explained_variance_ratio_))
# float() keeps the printed list as plain numbers regardless of the numpy scalar repr.
print([round(float(p), 6) for p in pca_label.explained_variance_ratio_])
# 12 components explain 98% of the variance.
# The first component alone explains about 41% of the variance.

('Number of components: ', 12)
[0.40869, 0.217585, 0.131201, 0.107835, 0.081652, 0.014093, 0.00661, 0.003847, 0.002603, 0.002144, 0.002099, 0.001804]


In [13]:
# PCA on the one-hot encoded features.
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction of the variance (here 98%).
n_comp = 100   # not referenced by the PCA in this cell, which is driven by `cutoff`
cutoff = 0.98

dummy_train_features = train_dummy.drop(["ID", "y"], axis=1)
dummy_test_features = test_dummy.drop(["ID"], axis=1)

pca_dummy = PCA(n_components=cutoff, random_state=0)
# Fit on the training features only, then project the test features with the same components.
pca_dummy_train = pca_dummy.fit_transform(dummy_train_features)
pca_dummy_test = pca_dummy.transform(dummy_test_features)

In [14]:
# Report how many components the one-hot encoded PCA kept and the share of
# variance explained by each one.
# %-formatting yields the same single-line text on Python 2 and Python 3
# (print with two comma-separated arguments prints a tuple repr on Python 2).
print("Number of components: %d" % len(pca_dummy.explained_variance_ratio_))
# float() keeps the printed list as plain numbers regardless of the numpy scalar repr.
print([round(float(p), 6) for p in pca_dummy.explained_variance_ratio_])
# 185 components explain 98% of the variance.
# The first component alone explains about 11% of the variance.

('Number of components: ', 185)
[0.113279, 0.077991, 0.073582, 0.058481, 0.049431, 0.041919, 0.0331, 0.028273, 0.025155, 0.021535, 0.020776, 0.017251, 0.015053, 0.014352, 0.013852, 0.012968, 0.012055, 0.010929, 0.009842, 0.009132, 0.008834, 0.008438, 0.008232, 0.007727, 0.007434, 0.006974, 0.006934, 0.006573, 0.006387, 0.006296, 0.005764, 0.005546, 0.005207, 0.004815, 0.004741, 0.004424, 0.004369, 0.004199, 0.0041, 0.004042, 0.003789, 0.003777, 0.003719, 0.00351, 0.003471, 0.003346, 0.003219, 0.003134, 0.003015, 0.002917, 0.002754, 0.002718, 0.002675, 0.002582, 0.002525, 0.002443, 0.002408, 0.002387, 0.002289, 0.002254, 0.002219, 0.002174, 0.002135, 0.002088, 0.002054, 0.002043, 0.00202, 0.001986, 0.001969, 0.001959, 0.001945, 0.001926, 0.001897, 0.00188, 0.001851, 0.001823, 0.001812, 0.001773, 0.001764, 0.001755, 0.001737, 0.001718, 0.001712, 0.001692, 0.00168, 0.001653, 0.001637, 0.001623, 0.001615, 0.00159, 0.001579, 0.001574, 0.001539, 0.001525, 0.001514, 0.001512, 0.001486, 0.0014

## Cross Validation and Grid Search ##

In [15]:
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# GridSearchCV now lives in sklearn.model_selection. Fall back to the old
# module only on installations that predate model_selection.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline


