In [66]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk 
import sqlalchemy
import seaborn as sns
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.base import TransformerMixin
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import train_test_split, cross_val_score, KFold

%matplotlib inline

In [67]:
def convertNA(alleys):
    try:
        if np.isnan(alleys):
            #print "Yes"
            return "None"
        else:
            #print "No"
            return alleys
    except:
        #print "No"
        return alleys

In [68]:
train  = pd.read_csv("../assets/train.csv")
test = pd.read_csv("../assets/test.csv")

del train["Id"]
testids = test["Id"]
del test["Id"]

fixColumns = ["Alley","Fence","PoolQC","GarageCond","GarageQual","GarageFinish","MiscFeature","GarageType","FireplaceQu","BsmtFinType2","BsmtFinType1","BsmtCond","BsmtQual"]
for column in fixColumns:
    train[column] = train[column].apply(convertNA)
for column in fixColumns:
    test[column] = test[column].apply(convertNA)

#train.dropna(inplace=True)
#test.dropna(inplace=True)

train.fillna(train.median(),inplace=True)
test.fillna(test.median(),inplace=True)

y = train["SalePrice"]
del train["SalePrice"]
X = train
X_test = test


In [69]:
class MultiColumnLabelEncoder(TransformerMixin):
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = pd.DataFrame(X,columns=self.columns)
        #print output.head()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                if(output[colname].dtype == "object"):
                    output[colname] = LabelEncoder().fit_transform(col)
                else:
                    output[colname] = col
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [70]:
mcle = MultiColumnLabelEncoder()
#ss = StandardScaler()
lr = LinearRegression()

#pipe = Pipeline([('mcle',mcle),('ss', ss)])

In [71]:
output = mcle.fit_transform(X)
#output
#output = ss.fit_transform(output)
model = lr.fit(output,y)
predictions = model.predict(X)
model.score(X,y)

0.85276108667325767

In [72]:
ouput = mcle.fit_transform(X_test)

In [73]:
dtr = DecisionTreeRegressor()
model = dtr.fit(X,y)
predictions = model.predict(ouput)

In [74]:
submission = pd.DataFrame()

In [75]:
submission["Id"] = testids
submission["SalePrice"] = predictions

In [76]:
#submission.to_csv("../assets/submission.csv", index=False)

In [77]:
cv = KFold(len(y),shuffle=False)

dt = DecisionTreeRegressor(random_state=5)
dtScore = cross_val_score(dt,X,y, cv=cv, n_jobs=-1)
dtScore

array([ 0.75770396,  0.76075465,  0.50867169])

In [78]:
rf = RandomForestRegressor(random_state=5)
rfScore = cross_val_score(rf,X,y,cv=cv, n_jobs=-1)
rfScore

array([ 0.87069465,  0.83087723,  0.78898997])

In [79]:
ab = AdaBoostRegressor(random_state=5)
abScore = cross_val_score(ab,X,y,cv=cv,n_jobs=-1)
abScore

array([ 0.8489448 ,  0.80014323,  0.73189558])

In [80]:
gb = GradientBoostingRegressor(random_state=5)
gbScore = cross_val_score(ab,X,y,cv=cv,n_jobs=-1)
gbScore

array([ 0.8489448 ,  0.80014323,  0.73189558])

In [81]:
print "Decision Tree Score: %f" % dtScore.mean()
print "Random Forest Score: %f" % rfScore.mean()
print "AdaBoost Score: %f" % abScore.mean()
print "Gradient Boost Score: %f" % gbScore.mean()

Decision Tree Score: 0.675710
Random Forest Score: 0.830187
AdaBoost Score: 0.793661
Gradient Boost Score: 0.793661


### fillna median
1. Decision Tree Score: 0.675710
2. Random Forest Score: 0.830187
3. AdaBoost Score: 0.793661
4. Gradient Boost Score: 0.793661

### fillna mean
1. Decision Tree Score: 0.704135
2. Random Forest Score: 0.840518
3. AdaBoost Score: 0.795396
4. Gradient Boost Score: 0.795396

In [82]:
model = rf.fit(X,y)
predictions = model.predict(ouput)

In [83]:
submission["Id"] = testids
submission["SalePrice"] = predictions

In [84]:
#submission.to_csv("../assets/submission.csv", index=False)

In [101]:
PARAMETERS = {'max_depth':[1,2,3,4,5,6], 'max_features':range(4,20), 'max_leaf_nodes':range(9,20), 'min_samples_leaf':[1,2,3,4],'min_samples_split':range(4,20)}

In [102]:
SCORING = 'mean_squared_error'

In [None]:
model = DecisionTreeRegressor()
clf = GridSearchCV(rf, PARAMETERS, scoring=SCORING, verbose=True, n_jobs=-1)
clf.fit(X,y)

print clf.best_estimator_
print clf.best_score_
print np.sqrt(-clf.best_score_)

Fitting 3 folds for each of 67584 candidates, totalling 202752 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done 2476 tasks      | elapsed:   49.2s


In [96]:
rf = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features=4, max_leaf_nodes=9, min_samples_leaf=1,
           min_samples_split=4, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=5,
           verbose=0, warm_start=False)

In [98]:
model = rf.fit(X,y)
print model.score(X,y)
predictions = model.predict(ouput)
submission.to_csv("../assets/submission.csv", index=False)

0.731207727182
