In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import pylab
import seaborn as sn

from scipy.stats import skew
from scipy.stats.stats import pearsonr

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

all_train = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))
# Log the output variable
train['SalePrice'] = np.log1p(train['SalePrice'])

# Find all skewed features that is > 0.75 and log them
numeric_features = all_train.dtypes[all_train.dtypes != "object"].index
skewed_features = all_train[numeric_features].apply(lambda x: skew(x.dropna()))
skewed_features = skewed_features[skewed_features > 0.75].index
all_train[skewed_features] = np.log1p(all_train[skewed_features])

# Convert categorical features using dummies
all_train = pd.get_dummies(all_train)

# Fill in missing values with mean
all_train = all_train.fillna(all_train.mean())

# Replace infinity values with 0
all_train = all_train.replace([np.inf, -np.inf], 0)

# Split train/test set back to the way it was.
x_train = all_train[:train.shape[0]]
x_test = all_train[train.shape[0]:]
y_train = train['SalePrice']



In [11]:
from sklearn.ensemble import ExtraTreesRegressor

model_et = ExtraTreesRegressor(n_jobs=16, \
                n_estimators=100, \
                max_features=0.5, \
                max_depth=12, \
                min_samples_leaf=2)
et_score = np.sqrt(-cross_val_score(model_et, x_train, y_train, cv=5, scoring="neg_mean_squared_error"))
print et_score.mean()

# model_et.fit(x_train, y_train)
# et_pred = model_et.predict(x_test)

# Create a et dataframe to see the prediction.
# df_et_pred = pd.DataFrame(np.exp(et_pred), index=test["Id"], columns=["SalePrice"])
# print(df_et_pred.head())

0.142449452984


In [9]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, KFold

# Grid search to find the best n_neighbors for KNN.
parameters = {'n_neighbors':[8, 9, 14, 15, 20, 22]}
neigh = KNeighborsRegressor(weights='distance')
neigh_grid = GridSearchCV(neigh, parameters, cv=5, verbose=0, n_jobs=2, scoring='neg_mean_squared_error')
neigh_grid.fit(x_train, y_train)

score = neigh_grid.cv_results_
print "\nmean_test_score: ", np.sqrt(-score['mean_test_score'])
print "\nParams: ", score['params']

best_param = neigh_grid.best_params_
print "\nBest Parameter is: ", best_param
print "\nBest Score is: ", neigh_grid.best_score_, np.sqrt(-neigh_grid.best_score_)

# Make prediction on the whole test set.
# Use this as input for wrapper (LinearRegression) prediction
neigh_pred = neigh_grid.predict(x_test)


mean_test_score:  [ 0.25711406  0.25741291  0.25509643  0.25519212  0.25592618  0.25588829]

Params:  ({'n_neighbors': 8}, {'n_neighbors': 9}, {'n_neighbors': 14}, {'n_neighbors': 15}, {'n_neighbors': 20}, {'n_neighbors': 22})

Best Parameter is:  {'n_neighbors': 14}

Best Score is:  -0.0650741864116 0.255096425713


In [10]:
# KNN with KFold. To be used for stacking.
folds = KFold(n_splits=3)
x_cv = np.array(x_train)
y_cv = np.array(y_train)

for k, (train_index, validation_index) in enumerate(folds.split(x_train)):
#     print "\nK: ", k, "\nValidation:", validation_index, "\nTRAIN:", train_index
    x_cv_train, x_cv_val = x_cv[train_index], x_cv[validation_index]
    y_cv_train, y_cv_val = y_cv[train_index], y_cv[validation_index]
    
    knn = KNeighborsRegressor(weights='distance', n_neighbors=14)
    knn.fit(x_cv_train, y_cv_train)
    np.savetxt('model/knn_pred_fold_{}.txt'.format(k), np.exp(knn.predict(x_cv_val)))
    np.savetxt('model/knn_test_fold_{}.txt'.format(k), y_cv_val)