# House Price Forecasting in Ames, IA
As part of a Kaggle competition, we are going to implement a regression analysis to forecast house prices in Ames, IA.  

## Setting up enviroment
Below we are going to install the necessary packages and download the data needed to build our model

In [8]:
### Import Packages
import numpy as np    #working efficiently with numbers
import pandas as pd    #efficient data structure package
from sklearn.model_selection import ShuffleSplit    #mix our data before splitting into training/testing
from sklearn.metrics import r2_score    #our performance metric for each permutation of GridSearchCV
from sklearn.model_selection import train_test_split    #splitting our data into training/testing sets
from sklearn.preprocessing import Imputer    #assign numeric values to missing values
from sklearn.tree import DecisionTreeRegressor    #building regression from a decision tree
from sklearn.metrics import make_scorer    #make a performance metric
from sklearn.model_selection import GridSearchCV    #assign a dictionary of values that act as a permutation to test

In [9]:
### Import Data
data_train = pd.read_csv('data/train.csv')

### Splitting Features and Target Data
prices = data_train['SalePrice']
features = data_train.drop('SalePrice', axis=1)

In [10]:
### Making Dummy Variable
#Removing categorical data for regression analysis
features = pd.get_dummies(features, dummy_na = True)

### Removing NaN Values from DataFrame
fill_NaN = Imputer(missing_values='NaN', strategy='mean', axis=1)

# Create new Data rame with NaN values replaced with the column mean
features2 = pd.DataFrame(fill_NaN.fit_transform(features))
features2.columns = features.columns
features2.index = features.index

In [11]:
### Splitting into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(features2,
                                                    prices,
                                                    test_size=0.2,
                                                    random_state=0)
# Check for Null/NaN values
# print(X_train.isnull().values.any())
# print(y_train.isnull().values.any())

### Creating Cross-Validation Sets
cv_sets = ShuffleSplit(n_splits=10,
                       test_size=0.2,
                       random_state=0)

In [12]:
### Scoring Function
def performance_metric(y_true, y_predict):
    score = r2_score(y_true, y_predict)
    return score

In [15]:
### Decision Tree Regression
tree = DecisionTreeRegressor(random_state=0)
params = {'max_depth': range(1, 11)}
scoring_fnc = make_scorer(performance_metric)

### GridSearch for Optimal Model
tree_grid = GridSearchCV(estimator=tree,
                    param_grid=params,
                    scoring=scoring_fnc,
                    cv=cv_sets)
tree_grid.fit(X_train, y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=0, test_size=0.2, train_size=None),
       error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=0,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(performance_metric), verbose=0)

In [14]:
### Making Predictions
tree_pred_train = tree_grid.predict(X_train)
tree_perf_train = performance_metric(y_train, tree_pred_train)
print("The r2-score on the training data was", round(tree_perf_train, 4))
tree_pred_test = tree_grid.predict(X_test)
tree_perf_test = performance_metric(y_test, tree_pred_test)
print("The r2-score on the testing data was", round(tree_perf_test, 4))

('The r2-score on the training data was', 0.9076)
('The r2-score on the testing data was', 0.8241)
