In [24]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor


from sklearn.metrics import r2_score

In [25]:
import numpy as np

class StackingEstimator(BaseEstimator, TransformerMixin):
    """Transformer that wraps an estimator and exposes its predictions as features.

    ``fit`` trains the wrapped estimator; ``transform`` returns the input
    matrix with the estimator's outputs prepended as extra leading columns,
    so the object can be used as an intermediate step of a pipeline.

    Parameters
    ----------
    estimator : object
        Any object providing ``fit(X, y)`` and ``predict(X)``.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped estimator on ``(X, y)`` and return ``self``.

        Extra keyword arguments are forwarded unchanged to ``estimator.fit``.
        """
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        """Return ``X`` with the wrapped estimator's outputs prepended.

        Column layout of the result, left to right:
        prediction, class probabilities (classifiers exposing
        ``predict_proba`` only), then the validated input features.
        """
        X = check_array(X)
        wrapped = self.estimator

        # Build the column groups right-to-left, so each insert lands in front.
        column_groups = [X]

        # add class probabilities as a synthetic feature
        if isinstance(wrapped, ClassifierMixin) and hasattr(wrapped, 'predict_proba'):
            column_groups.insert(0, wrapped.predict_proba(X))

        # add class prediction as a synthetic feature (always the first column)
        column_groups.insert(0, np.reshape(wrapped.predict(X), (-1, 1)))

        return np.hstack(column_groups)

# Label Encoder

Simply put, `LabelEncoder` maps each distinct label value (for example, a category name) to an integer between 0 and n_classes - 1.

### Examples

LabelEncoder can be used to normalize labels.

```python
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6]) 
array([0, 0, 1, 2]...)
>>> le.inverse_transform([0, 0, 1, 2])
array([1, 1, 2, 6])
```

It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels.

```Python
>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"]) 
array([2, 2, 1]...)
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']
```

Besides, there is also the `OneHotEncoder` class (`sklearn.preprocessing.OneHotEncoder`), which encodes categorical integer features using a one-hot (a.k.a. one-of-K) scheme.
```Python
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe.fit([[1],[2],[3],[4]])
ohe.transform([[2],[3],[1],[4]]).toarray()
>>>[ [0,1,0,0] , [0,0,1,0] , [1,0,0,0] ,[0,0,0,1] ]
```

ref: sklearn.preprocessing.LabelEncoder — scikit-learn 0.18.2 documentation  
http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html


In [37]:
############################################
# Load data and encode non-value features  #
############################################
import pandas as pd
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Replace every string-valued (object dtype) column with integer codes.
# The encoder is fitted on the train and test values together so that a
# category present in only one of the two files still gets a code.
for col in train.columns:
    if train[col].dtype != 'object':
        continue  # numeric column: nothing to encode

    train_values = list(train[col].values)
    test_values = list(test[col].values)

    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)

    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

# Feature Extraction Using Dimension Reduction

Mainly using these methods below:

* TruncatedSVD (tSVD)
* Principal Component Analysis (PCA)
* Independent Component Analysis (ICA)
* Gaussian Random Projection (GRP)
* SparseRandomProjection (SRP)

In [39]:
#####################################
# Make dimension reduction          #
#####################################
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

#############################
#  Parameter initialization #
#############################
n_comp = 12         # number of components kept by every reducer
random_state = 420  # shared seed for all reducers

# Every reducer is fitted on the same matrix: all training columns except
# the target 'y'. The fitted reducer is then applied to the test frame.
train_features = train.drop(["y"], axis=1)

# --- tSVD: truncated singular value decomposition ---
tsvd = TruncatedSVD(n_components=n_comp, random_state=random_state)
tsvd_results_train = tsvd.fit_transform(train_features)
tsvd_results_test = tsvd.transform(test)

# --- PCA: principal component analysis ---
pca = PCA(n_components=n_comp, random_state=random_state)
pca2_results_train = pca.fit_transform(train_features)
pca2_results_test = pca.transform(test)

# --- ICA: independent component analysis ---
ica = FastICA(n_components=n_comp, random_state=random_state)
ica2_results_train = ica.fit_transform(train_features)
ica2_results_test = ica.transform(test)

# --- GRP: Gaussian random projection ---
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=random_state)
grp_results_train = grp.fit_transform(train_features)
grp_results_test = grp.transform(test)

# --- SRP: sparse random projection ---
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=random_state)
srp_results_train = srp.fit_transform(train_features)
srp_results_test = srp.transform(test)

Add the decomposition components to the original features.

In [46]:
# Save the original feature column names (everything except the target 'y')
# BEFORE the decomposition components are appended below.
# A list comprehension keeps the DataFrame's own column order; building the
# list from a set would give an order that can change between kernel sessions.
original_feats_col_name_list = [col for col in train.columns if col != 'y']

#################################################
# Append decomposition components to datasets   #
#################################################
# (column prefix, train components, test components); the tuple order fixes
# the order in which the new columns are added for each component index.
decomposition_results = (
    ('pca', pca2_results_train, pca2_results_test),
    ('ica', ica2_results_train, ica2_results_test),
    ('tsvd', tsvd_results_train, tsvd_results_test),
    ('grp', grp_results_train, grp_results_test),
    ('srp', srp_results_train, srp_results_test),
)

# `range` instead of `xrange`: `xrange` does not exist in Python 3.
for i in range(1, n_comp + 1):
    for prefix, train_components, test_components in decomposition_results:
        column_name = prefix + '_' + str(i)
        # component columns are named from 1, the arrays are indexed from 0
        train[column_name] = train_components[:, i - 1]
        test[column_name] = test_components[:, i - 1]

y_train = train['y'].values
y_train_mean = np.mean(y_train)
id_test = test['ID'].values

#################################################
# finaltrainset and finaltestset are the data   #
#     used only by the stacked model            #
#     (they do not contain the PCA, SVD, ...    #
#     component columns added above)            #
#################################################
finaltrainset = train[original_feats_col_name_list].values
finaltestset = test[original_feats_col_name_list].values

# Train XGBoost model

In [48]:
#######################
# Train the xgb model #
#######################
# Number of boosting rounds. This is an argument of xgb.train, not a booster
# parameter, so it is kept out of `xgb_params`.
num_boost_rounds = 1250

# Booster parameters only. The former 'n_trees', 'early_stopping_rounds' and
# 'num_boost_rounds' entries were removed: they are not booster parameters, so
# they had no effect on training. In particular no early stopping takes place,
# because xgb.train is not given an `evals` list to monitor.
# NOTE(review): recent XGBoost releases spell 'reg:linear' as
# 'reg:squarederror' and replace 'silent' with 'verbosity' - confirm against
# the installed version before changing these two entries.
xgb_params = {
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_train_mean,  # base prediction = mean(target)
    'silent': 0,
}

# Training matrix: every column of `train` except the target, labelled with y_train.
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

# train model (a copy of the parameter dict is passed, as before)
model = xgb.train(dict(xgb_params), dtrain,
                  num_boost_round=num_boost_rounds)

############################
# Predict the test data    #
#   based on trained model #
############################
y_pred = model.predict(dtest)

# Train stacked models

In [None]:
############################
# Train the stacked models #
############################

# Three-stage stack. Each StackingEstimator step fits its estimator and
# prepends that estimator's prediction to the feature matrix as a new column;
# the final LassoLarsCV produces the pipeline's prediction.
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(
        estimator=GradientBoostingRegressor(
            learning_rate=0.001,
            loss="huber",
            max_depth=3,
            max_features=0.55,
            min_samples_leaf=18,
            min_samples_split=14,
            subsample=0.7,
            # subsample < 1 and max_features < 1 make the fit stochastic;
            # seed it so that a re-run reproduces the same model.
            random_state=random_state,
        ),
    ),
    LassoLarsCV(),
)

stacked_pipeline.fit(finaltrainset, y_train)

#########################
# Predict the test data #
#########################

results = stacked_pipeline.predict(finaltestset)

# Result evaluation and result store

In [49]:
#################################
# R2 Score                      #
#     on the entire             #
#     train data when averaging #
#################################

# NOTE(review): the train score below blends the two models with weights
# 0.2855 / 0.7145, while the submission further down uses 0.25 / 0.75, so the
# printed R2 does not describe exactly the blend that is saved - confirm
# which pair of weights is intended.
print('R2 score on train data:')
train_blend = stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145
print(r2_score(y_train, train_blend))

#####################################
# Average the test predictions      #
#      of both models, then save    #
#      them to a csv file           #
#####################################

test_blend = y_pred*0.75 + results*0.25
sub = pd.DataFrame({'ID': id_test, 'y': test_blend})
sub.to_csv('stacked-models.csv', index=False)


# Any results you write to the current directory are saved as output.

R2 score on train data:
0.659581560761
