In [8]:
# PART ONE: Data Reading
import pandas as pd
import numpy as np
from scipy.stats import skew
import matplotlib.pyplot as plt
%matplotlib inline
def read_data(input_dir='../input'):
    """Load the raw train/test CSV files and stack their feature columns.

    Parameters
    ----------
    input_dir : str, optional
        Directory containing ``train.csv`` and ``test.csv``. Defaults to
        ``'../input'``, the location that used to be hard-coded here.

    Returns
    -------
    tuple of pd.DataFrame
        ``(train, test, all_data)``: the two frames exactly as read, plus the
        row-wise concatenation (train rows first, then test rows) of their
        columns from 'MSSubClass' through 'SaleCondition'.
    """
    import os  # local import: this cell does not import os at the top

    # step1: reading csv data
    train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(input_dir, 'test.csv'))

    # Label-based column slice, so both endpoints are included. The original
    # row labels of each frame are kept in the concatenated result.
    feature_span = slice('MSSubClass', 'SaleCondition')
    all_data = pd.concat((train.loc[:, feature_span],
                          test.loc[:, feature_span]))  # concat training&test data
    return train, test, all_data
def model_input(input_dir='../input'):
    """Load the cached, already preprocessed modelling inputs.

    Parameters
    ----------
    input_dir : str, optional
        Directory holding the three cached files. Defaults to ``'../input'``,
        the location that used to be hard-coded here.

    Returns
    -------
    tuple of pd.DataFrame
        ``(alldata_after_filling_missing_skew, y, test_label)`` read from
        ``alldata_after_filling_missing_skew.csv`` (with a header row),
        ``train_label_skew`` and ``test_id`` (both header-less, so their
        single column is labelled ``0``).
    """
    import os  # local import: this cell does not import os at the top

    alldata_after_filling_missing_skew = pd.read_csv(
        os.path.join(input_dir, 'alldata_after_filling_missing_skew.csv'))
    # The label and id files carry no header row.
    y = pd.read_csv(os.path.join(input_dir, 'train_label_skew'), header=None)
    test_label = pd.read_csv(os.path.join(input_dir, 'test_id'), header=None)
    return alldata_after_filling_missing_skew, y, test_label

In [9]:
# Free-text summary of the preprocessing steps (everything except dimensionality
# reduction). This bare string is the only expression in the cell, so the
# notebook echoes it back as the cell output.
# NOTE(review): presumably this describes how the cached `*_skew` input files
# loaded by model_input() were produced -- confirm.
'''
the preprocessing apart from dimensional reduction is :
@ missing data filling: 
    RandomForest filling for important features(highly relevant to SalePrice);
    another value filling for features having many missing value, or its missing value has some meaning.
    mean()/mode() filling for other features
@ data transform: get_dummies for categorical features
@ log transform for SalePrice and some skewed features
''' 
# Disabled baseline, kept for reference: fit an XGBoost regressor on the first
# 1460 rows, predict the remaining rows, apply np.expm1 to the predictions and
# write an Id/SalePrice CSV.
#alldata_after_filling_missing_skew = pd.read_csv('../input/alldata_after_filling_missing_skew.csv')
#y = pd.read_csv('../input/train_label_skew',header=None)
#import xgboost as xgb
#model_xgb = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1)
#model_xgb.fit(alldata_after_filling_missing_skew.iloc[:1460],y)
#pre_val = np.expm1(pd.DataFrame(model_xgb.predict(alldata_after_filling_missing_skew.iloc[1460:])))
#test_label = pd.read_csv('../input/test_id',header=None)
#result = pd.DataFrame()
#result['Id'] = test_label[0]
#result['SalePrice'] = pre_val[0]
#result.to_csv('../input/result_xgb_1123_nofeaselec_skew.csv',index=None)

'\nthe preprocessing apart from dimensional reduction is :\n@ missing data filling: \n    RandomForest filling for important features(highly relevant to SalePrice);\n    another value filling for features having many missing value, or its missing value has some meaning.\n    mean()/mode() filling for other features\n@ data transform: get_dummies for categorical features\n@ log transform for SalePrice and some skewed features\n'

In [4]:
# PART FOUR: Feature Decomposition

from sklearn.decomposition import PCA
# Note that the input df must not contain categorical features or missing values;
# do this after the preprocessing steps that fill missing values and transform features.
def pca_reduc(df, num_fea_toleave='mle'):
    """Reduce ``df`` with PCA and report the explained variance ratios.

    Parameters
    ----------
    df : pd.DataFrame or array-like
        Numeric input handed to ``PCA.fit_transform``.
    num_fea_toleave : int, float or str, optional
        Forwarded unchanged as ``PCA(n_components=...)``. Defaults to 'mle'.

    Returns
    -------
    pd.DataFrame
        The transformed data wrapped in a new DataFrame built from the
        ``fit_transform`` result.
    """
    pca = PCA(n_components=num_fea_toleave)
    after_pca = pca.fit_transform(df)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('Percentage of variance explained by each of the selected components:')
    print(pca.explained_variance_ratio_)
    return pd.DataFrame(after_pca)
#all_data = dummy_all(all_data)
#all_data.fillna(all_data.mean(),inplace=True)
#all_data = pca_reduc(all_data,30)
#all_data.info(verbose=True, max_cols=1000)

from sklearn.decomposition import KernelPCA
# Kernel PCA ==> non-linear dimensionality reduction through the use of kernels
# Somewhat like kernel in SVM
# kernel = “linear” | “poly” | “rbf” | “sigmoid” | “cosine” | “precomputed”
def kernelpca_reduc(df, kernel='linear',num_fea_toleave=50):
    """Reduce ``df`` with Kernel PCA and print the component eigenvalues.

    Parameters
    ----------
    df : pd.DataFrame or array-like
        Numeric input handed to ``KernelPCA.fit_transform``.
    kernel : str, optional
        Kernel name forwarded to ``KernelPCA``. Defaults to 'linear'.
    num_fea_toleave : int, optional
        Forwarded as ``KernelPCA(n_components=...)``. Defaults to 50.

    Returns
    -------
    pd.DataFrame
        The transformed data wrapped in a new DataFrame.
    """
    kpca = KernelPCA(n_components=num_fea_toleave, kernel=kernel, n_jobs=-1)
    after_kpca = kpca.fit_transform(df)

    # Newer scikit-learn exposes the eigenvalues as `eigenvalues_`; older
    # releases only have `lambdas_`. Prefer the new name and fall back.
    eigenvalues = getattr(kpca, 'eigenvalues_', None)
    if eigenvalues is None:
        eigenvalues = kpca.lambdas_

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('the selected features Eigenvalues in decreasing order:')
    print(eigenvalues)
    return pd.DataFrame(after_kpca)
#all_data = dummy_all(all_data)
#all_data.fillna(all_data.mean(),inplace=True)
#all_data = kernelpca_reduc(all_data,kernel='rbf',num_fea_toleave=50)
#print all_data.shape

from sklearn.decomposition import TruncatedSVD
# Dimensionality reduction using truncated SVD
def truncatedSVD_reduc(df,num_fea_toleave=50):
    """Reduce ``df`` with truncated SVD and report the explained variance ratios.

    Parameters
    ----------
    df : pd.DataFrame or array-like
        Numeric input handed to ``TruncatedSVD.fit_transform``.
    num_fea_toleave : int, optional
        Forwarded as ``TruncatedSVD(n_components=...)``. Defaults to 50.

    Returns
    -------
    pd.DataFrame
        The transformed data wrapped in a new DataFrame.
    """
    # provide a random_state to get stable output
    svd = TruncatedSVD(n_components=num_fea_toleave, n_iter=7, random_state=42)
    after_trans = svd.fit_transform(df)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('Percentage of variance explained by each of the selected components:')
    print(svd.explained_variance_ratio_)
    return pd.DataFrame(after_trans)
#all_data = dummy_all(all_data)
#all_data.fillna(all_data.mean(),inplace=True)
#all_data = truncatedSVD_reduc(all_data,num_fea_toleave=50)
#print all_data.shape

In [5]:
# PART FIVE: Feature Selection

from sklearn.feature_selection import RFECV
# RFECV: Feature ranking with recursive feature elimination and cross-validated selection of the best number of features.
def fea_sel_rfecv(train_x,train_y,test_x,estimator):
    """Select features with RFECV and apply the same selection to the test set.

    Fits ``RFECV`` (scored with 'neg_mean_squared_error') on the training
    data, prints the chosen number of features and plots the cross-validation
    score against the number of features.

    Parameters
    ----------
    train_x, train_y :
        Training features and target passed to ``RFECV.fit_transform``.
    test_x :
        Test features, reduced with the fitted selector.
    estimator :
        Supervised estimator handed to ``RFECV``.

    Returns
    -------
    tuple of pd.DataFrame
        ``(selected_train, selected_test)``.
    """
    rfecv = RFECV(estimator=estimator, scoring='neg_mean_squared_error', n_jobs=-1)
    after_d = rfecv.fit_transform(train_x, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Newer scikit-learn reports per-step scores in `cv_results_`; older
    # releases only have `grid_scores_`. Prefer the new attribute and fall back.
    if hasattr(rfecv, 'cv_results_'):
        cv_scores = rfecv.cv_results_['mean_test_score']
    else:
        cv_scores = rfecv.grid_scores_

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score(neg_mean_squared_error)")
    plt.plot(range(1, len(cv_scores) + 1), cv_scores)
    return pd.DataFrame(after_d), pd.DataFrame(rfecv.transform(test_x))
#alldata_nomissing = pd.read_csv('../input/alldata_after_filling_missing.csv')
#from sklearn import svm
#clf = svm.LinearSVR()
#after_d,after_d_test = (fea_sel_rfecv(alldata_nomissing.iloc[:1460],train['SalePrice'],alldata_nomissing.iloc[1460:],clf))
#print after_d.shape
#print after_d_test.shape

from sklearn.feature_selection import SelectFromModel
# As the name 'SelectFromModel' suggests, this method uses a fitted model's results to select features ('Wrapper' style).
# estimator: a supervised model with a fit() method
def fea_sel_tree(train_x,train_y,estimator):
    """Select features using the importances of a fitted estimator.

    Parameters
    ----------
    train_x :
        Training features.
    train_y :
        Training target the estimator is fitted against.
    estimator :
        Supervised model with a ``fit()`` method that exposes
        ``feature_importances_`` after fitting.

    Returns
    -------
    pd.DataFrame
        ``train_x`` reduced by ``SelectFromModel`` to the selected features.
    """
    # Fit against the target. The previous code passed train_x as the target
    # too, so the importances did not reflect train_y at all.
    estimator = estimator.fit(train_x, train_y)

    importances = sorted(estimator.feature_importances_, reverse=True)
    # One single-argument print keeps the label and list on the same line on
    # both Python 2 and Python 3.
    print('feature importances in this model %s' % (importances,))

    # prefit=True: reuse the estimator fitted above instead of refitting it.
    model = SelectFromModel(estimator, prefit=True)
    after_sel = model.transform(train_x)
    return pd.DataFrame(after_sel)
#train = dummy_all(train)
#train.fillna(train.mean(),inplace=True)
#from sklearn.ensemble import RandomForestRegressor
#clf = RandomForestRegressor(random_state=0,n_estimators=50)
#print fea_sel_tree(train.iloc[:,1:-1],train['SalePrice'],clf).shape

In [18]:
from sklearn.model_selection import cross_val_score
# Load the cached inputs as module-level globals; rmse_cv below reads
# `alldata_after_filling_missing_skew` and `y` from this global scope.
alldata_after_filling_missing_skew,y,test_label = model_input()
def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` under 10-fold cross-validation.

    Reads the module-level ``alldata_after_filling_missing_skew`` (its first
    1460 rows are used as the training features) and ``y`` (the target).

    Parameters
    ----------
    model :
        Estimator passed to ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    # Use the 'neg_mean_squared_error' scorer, matching fea_sel_rfecv. It
    # yields negated MSE values, hence the sign flip before the square root.
    # The bare 'mean_squared_error' name is not accepted by current scikit-learn.
    neg_mse = cross_val_score(model,
                              alldata_after_filling_missing_skew.iloc[:1460],
                              y,
                              scoring="neg_mean_squared_error",
                              cv=10)
    rmse = np.sqrt(-neg_mse)
    return rmse

In [10]:
# Grid-search a two-step pipeline: dimensionality reduction followed by an
# XGBoost regressor, fitted on the first 1460 rows of the cached features.
alldata_after_filling_missing_skew, y, test_label = model_input()
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV

# Each Pipeline step is a (name, estimator) pair; the name is the prefix used
# to address that step's parameters in the grid below (e.g. 'reduce_dim__...').
pipe = Pipeline(steps=[
    ('reduce_dim', PCA()),
    ('regression', xgb.XGBRegressor()),
])

# Candidate numbers of components to keep.
N_FEATURES_OPTIONS = list(range(30, 250, 50))
# Candidate numbers of boosting rounds.
# A wider sweep that was tried: range(300, 500, 20).
N_ESTIMATOR_OPTIONS = list(range(300, 400, 100))

# Swapping the whole 'reduce_dim' step lets the search compare reducers.
param_grid = [
    dict(
        reduce_dim=[PCA(iterated_power=7), NMF()],
        reduce_dim__n_components=N_FEATURES_OPTIONS,
        regression__n_estimators=N_ESTIMATOR_OPTIONS,
    )
]

'''
param_grid=[
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'regression__n_estimators': N_ESTIMATOR_OPTIONS
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'regression_n_estimators': N_ESTIMATOR_OPTIONS
    }
]
'''
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, n_jobs=-1)
grid.fit(alldata_after_filling_missing_skew.iloc[:1460], y)
# One mean cross-validated test score per parameter combination.
mean_scores = np.array(grid.cv_results_['mean_test_score'])

In [14]:
# Show the searched component counts next to the mean CV scores.
# Single-argument print(...) calls behave the same on Python 2 and 3.
print(N_FEATURES_OPTIONS)
print(mean_scores)

[30, 80, 130, 180, 230]
[ 0.81127826  0.81891798  0.81821421  0.81933439  0.82010839  0.59906961
  0.74412923  0.74475715  0.75498163  0.78805261]


In [None]:
# Actually, we can apply some operations to the existing features to derive more candidate features to select from.
# It has been shown that features computed from other features may also work.
# For example, alpha = num_buy/num_click does mean something in a shopping website's analysis.