## Import Libs

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk 
import seaborn as sns
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.base import TransformerMixin
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
import xgboost as xgb
from sklearn import metrics
from scipy.stats import skew
import keras
from datetime import datetime

%matplotlib inline

## Helper Functions

In [None]:
def convertNA(value):
    try:
        if np.isnan(value):
            return "None"
        else:
            return value
    except:
        return value

In [None]:
def makeSubmission(predictions,testDF):
    submission = pd.DataFrame()
    submission["Id"] = testDF["Id"]
    submission["SalePrice"] = predictions
    submission.to_csv("../assets/submission_stacked_"+ datetime.now().strftime("%Y%m%d-%H%M%S") +".csv", index=False)

In [None]:
def rmsle(y_predict,y_actual):
    return np.sqrt(np.mean(np.power(np.log1p(y_predict) - np.log1p(y_actual), 2)))

## Read, Clean, Create Dummies

In [None]:
train  = pd.read_csv("../assets/train.csv")
test = pd.read_csv("../assets/test.csv")

all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))

categories = [u"Alley","Fence","PoolQC","GarageCond","GarageQual","GarageFinish","MiscFeature","GarageType","FireplaceQu","BsmtFinType2","BsmtFinType1","BsmtCond","BsmtQual"]
for column in categories:
    all_data[column] = all_data[column].apply(convertNA)

all_data.fillna(all_data.mean(),inplace=True)

all_data = pd.get_dummies(all_data)

X = all_data.iloc[:1460,:]
X_test = all_data.iloc[1460:,:]
y = train["SalePrice"]

## Create Functions for Each Model Type

### Random Forest

In [None]:
def runRFModel(X,y,X_test):
    cv = KFold(len(y),shuffle=False)
    rf = RandomForestRegressor(random_state=5)
    rfScore = cross_val_score(rf,X,y,cv=cv, n_jobs=-1)
    model = rf.fit(X,y)

    predictions =  model.predict(X)
    print "Score: ", metrics.r2_score(y,predictions)
    print "RMSLE: ", rmsle(y,predictions)

    return predictions,model.predict(X_test)
    

### XGBoost

In [None]:
def modelfit(alg, X, X_test, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X.values, label=y.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='rmse', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(X, y,eval_metric='rmse')
        
    #Predict training set:
    dtrain_predictions = alg.predict(X)
    dtest_predictions = alg.predict(X_test)
    
    #dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    #Print model report:
    #print "\nModel Report"
    #print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['SalePrice'].values.astype(float), dtrain_predictions.astype(float))
    #print "AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['SalePrice'], dtrain_predprob)
    #print "Score: ", metrics.r2_score(y,dtrain_predictions)
    #print "RMSLE: ", rmsle(y,dtrain_predictions)
    #feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    #feat_imp.plot(kind='bar', title='Feature Importances')
    #plt.ylabel('Feature Importance Score')


In [None]:
def runXGB(X,y,X_test):
    xgb1 = xgb.XGBRegressor(
     learning_rate =0.01,
     n_estimators=1000,
     max_depth=6,
     min_child_weight=3,
     gamma=0,
     subsample=0.65,
     colsample_bytree=0.65,
     objective= 'reg:linear',
     nthread=4,
     scale_pos_weight=1,
     reg_alpha= 1e-05,
     seed=27)
    model = modelfit(xgb1, X, X_test, y)

    predictions =  xgb1.predict(X)
    print "Score: ", metrics.r2_score(y,predictions)
    print "RMSLE: ", rmsle(y,predictions)

    return predictions, xgb1.predict(X_test)

### Linear Regression (Lasso)

In [None]:
def runLinReg(X,y,X_test):
    #log transform the target:
    #y = np.log1p(y)

    #log transform skewed numeric features:
    numeric_feats = X.dtypes[X.dtypes != "object"].index

    skewed_feats = X[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    
    skewed_feats_test = X_test[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats_test = skewed_feats_test[skewed_feats_test > 0.75]
    skewed_feats_test = skewed_feats_test.index

    X[skewed_feats] = np.log1p(X[skewed_feats])
    X_test[skewed_feats_test] = np.log1p(X_test[skewed_feats_test])
    
    model_lasso = LassoCV(cv=1000,alphas = [100,10,1, 0.1, 0.001, 0.0005], max_iter = 50000, verbose = 1, n_jobs=-1).fit(X, y)
    
    predictions =  model_lasso.predict(X)
    print "Score: ", metrics.r2_score(y,predictions)
    print "RMSLE: ", rmsle(y,predictions)

    return predictions, model_lasso.predict(X_test)

## Build the models and make predictions

In [None]:
rf_predictions,rf_predictions_test = runRFModel(X,y,X_test)

In [None]:
xgb_predictions,xgb_predictions_test = runXGB(X,y,X_test)

In [None]:
lr_predictions,lr_predictions_test = runLinReg(X,y,X_test)

## Save Predictions

In [None]:
level1 = pd.DataFrame()
level1["rf"] = rf_predictions
level1["xgb"] = xgb_predictions
level1["lr"] = lr_predictions
level1["y"] = y
level1.head()

In [None]:
level1_test = pd.DataFrame()
level1_test["rf"] = rf_predictions_test
level1_test["xgb"] = xgb_predictions_test
level1_test["lr"] = lr_predictions_test


In [1]:
level1.to_csv("../assets/Level1.csv",index=False)
level1_test.to_csv("../assets/Level1_test.csv",index=False)

NameError: name 'level1' is not defined

In [6]:
neural = pd.read_csv("../assets/Level1.csv")
neural.head(5)

Unnamed: 0,rf,xgb,lr,y
0,208400.0,204554.875,211728.158201,208500
1,173000.0,176239.09375,197396.677529,181500
2,217000.0,214539.015625,215014.330002,223500
3,142400.0,153358.0625,185803.006791,140000
4,265309.0,272659.0,310234.037357,250000


In [7]:
neural_test = pd.read_csv("../assets/Level1_test.csv")
neural_test.head(5)

Unnamed: 0,rf,xgb,lr
0,126480.0,127040.242188,50689.422559
1,151420.0,156986.4375,87161.067545
2,183290.0,183298.046875,112640.158636
3,184283.2,190673.109375,122685.092072
4,197240.0,189672.546875,175280.182603


In [8]:
net_X = neural.values[:,0:3]
net_Y = neural.values[:,3]

## Construct Neural Net

In [9]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# define base mode
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(3, input_dim=3, init='normal', activation='relu'))
    #model.add(Dense(2, input_dim=2, init='normal', activation='relu'))
    model.add(Dense(1, init='normal'))
    # Compile model
    model.compile(loss='msle', optimizer='adam')
    return model

In [10]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=50, batch_size=5, verbose=1)

In [12]:
kfold = KFold(n=len(net_X), n_folds=10, random_state=seed)

## Train Neural Net

In [14]:
results = cross_val_score(estimator, net_X, net_Y, cv=3, n_jobs=1, verbose=1)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.7min finished


## Train Standardized Neural Net

In [None]:
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, nb_epoch=150, batch_size=5, verbose=1)))
pipeline = Pipeline(estimators)
kfold = KFold(n=len(net_X), n_folds=10, random_state=seed)
results = cross_val_score(pipeline, net_X, net_Y, cv=kfold,n_jobs=1)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

## Measure And Predict

In [62]:
model = pipeline.fit_transform(net_X,net_Y)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

AttributeError: 'History' object has no attribute 'transform'

## NN Evaluation

In [64]:
model.score(net_X,net_Y)



50.56294288373973

In [51]:
flat_X = model.predict(net_X).flatten()



In [65]:
rmsle(flat_X,net_Y)

7.1107624687890949

In [53]:
pd.DataFrame({"y_hat":flat_X,"y_actual":net_Y})

Unnamed: 0,y_actual,y_hat
0,208500.0,253919776.0
1,181500.0,222679504.0
2,223500.0,262643744.0
3,140000.0,196645584.0
4,250000.0,345683584.0
5,143000.0,185529344.0
6,307000.0,355179680.0
7,200000.0,255665296.0
8,129900.0,176668256.0
9,118000.0,143800896.0


In [54]:
nnpredictions = model.predict(neural_test.values)



(1459,)

## Make Submission File

In [59]:
makeSubmission(nnpredictions,test)