In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.cross_validation import ShuffleSplit

# Import supplementary visualizations code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

# Load the Boston housing dataset; 'MEDV' is the target, every other column is a feature
data = pd.read_csv('housing.csv')
prices = data['MEDV']
features = data.drop('MEDV', axis=1)

# Success: data.shape is (rows, columns) of the full frame, target column included.
# Call-style print with a single argument behaves the same on Python 2 and Python 3.
print("Boston housing dataset has {} data points with {} variables each.".format(*data.shape))

Boston housing dataset has 489 data points with 4 variables each.


In [2]:
# Preview the first rows of the loaded frame to sanity-check its columns
data.head()

Unnamed: 0,RM,LSTAT,PTRATIO,MEDV
0,6.575,4.98,15.3,504000.0
1,6.421,9.14,17.8,453600.0
2,7.185,4.03,17.8,728700.0
3,6.998,2.94,18.7,701400.0
4,7.147,5.33,18.7,760200.0


In [9]:
# Smallest 'MEDV' value in the dataset (quick check of the target's lower bound)
np.min(data['MEDV'].values)

105000.0

In [11]:
from sklearn.metrics import r2_score

In [13]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """Score predictions against the true values using R^2.

    Delegates to ``r2_score`` with ``y_true`` and ``y_predict`` and
    returns whatever that call produces.
    """
    return r2_score(y_true, y_predict)

In [14]:
# Calculate the performance of this model on a small hand-made example
score = performance_metric([3, -0.5, 2, 7, 4.2], [2.5, 0.0, 2.1, 7.8, 5.3])
# Call-style print with a single argument behaves the same on Python 2 and Python 3
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

Model has a coefficient of determination, R^2, of 0.923.


In [16]:
# Feature matrix: every column except the 'MEDV' target; then preview its first rows
X=data.drop(['MEDV'], axis=1)
X.head()

Unnamed: 0,RM,LSTAT,PTRATIO
0,6.575,4.98,15.3
1,6.421,9.14,17.8
2,7.185,4.03,17.8
3,6.998,2.94,18.7
4,7.147,5.33,18.7


In [17]:
from sklearn.cross_validation import train_test_split

# Separate the label column from the feature columns
y = data['MEDV']
X = data.drop('MEDV', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)



In [20]:
# Confirm the train/test partition sizes as (rows, columns)
print(X_train.shape)
print(X_test.shape)

(391, 3)
(98, 3)


In [32]:
# Candidate tree depths: the integers 1 through 10 inclusive
max_depths_choices = np.arange(1, 11)
max_depths_choices

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [37]:
#Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import explained_variance_score, make_scorer
from sklearn.grid_search import GridSearchCV

def fit_model(X, y, verbose=0, random_state=0):
    """Grid-search the 'max_depth' of a decision tree regressor on [X, y].

    Parameters
    ----------
    X : feature table. Only ``X.shape[0]`` is read here; the object itself
        is passed through to ``GridSearchCV.fit``.
    y : target values, passed through to ``GridSearchCV.fit``.
    verbose : int, forwarded to ``GridSearchCV`` to control progress logging.
    random_state : seed given to both the shuffle-split folds and the tree,
        so repeated calls on the same data select the same model.

    Returns
    -------
    tuple ``(grid, best_estimator)``: the fitted ``GridSearchCV`` object and
    its ``best_estimator_`` attribute.
    """

    # Create cross-validation sets from the supplied data:
    # 10 shuffled iterations, each holding out 20% of the rows
    cv_sets = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20, random_state=random_state)

    # Create a decision tree regressor object. It is seeded because an unseeded
    # tree may break ties between equally good splits differently from run to
    # run, which makes the selected 'max_depth' non-reproducible.
    regressor = DecisionTreeRegressor(random_state=random_state)

    # Candidate values for 'max_depth': 1 through 10 inclusive
    params = {'max_depth': np.arange(1, 11)}

    # Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric)

    # Create the grid search object
    grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc,
                        cv=cv_sets, verbose=verbose)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the fitted search object together with the optimal model
    return grid, grid.best_estimator_

In [43]:
# Run the grid search on the training split with per-fit progress logging
grid, reg = fit_model(X_train, y_train, verbose=2)

# Produce the value for 'max_depth'
# Call-style print with a single argument behaves the same on Python 2 and Python 3
print("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] max_depth=1 .....................................................
[CV] ............................................ max_depth=1 -   0.0s
[CV] max_depth=1 .....................................................
[CV] ............................................ max_depth=1 -   0.0s
[CV] max_depth=1 .....................................................
[CV] ............................................ max_depth=1 -   0.0s
[CV] max_depth=1 .....................................................
[CV] ............................................ max_depth=1 -   0.0s
[CV] max_depth=1 .....................................................
[CV] ............................................ max_depth=1 -   0.0s
[CV] max_depth=1 .....................................................
[CV] ............................................ max_depth=1 -   0.0s
[CV] max_depth=1 .....................................................
[CV] .........

[Parallel(n_jobs=1)]: Done  40 tasks       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


In [35]:
# Show the per-candidate cross-validation results stored on the fitted grid search
grid.grid_scores_

[mean: 0.39213, std: 0.09209, params: {'max_depth': 1},
 mean: 0.63487, std: 0.08254, params: {'max_depth': 2},
 mean: 0.74577, std: 0.06170, params: {'max_depth': 3},
 mean: 0.77496, std: 0.06050, params: {'max_depth': 4},
 mean: 0.76891, std: 0.05518, params: {'max_depth': 5},
 mean: 0.75026, std: 0.05216, params: {'max_depth': 6},
 mean: 0.74604, std: 0.05584, params: {'max_depth': 7},
 mean: 0.72987, std: 0.06332, params: {'max_depth': 8},
 mean: 0.74248, std: 0.06741, params: {'max_depth': 9},
 mean: 0.71556, std: 0.06247, params: {'max_depth': 10}]

In [45]:
# Repeat the search on the full dataset (X, y) rather than the training split.
# NOTE(review): this rebinds `grid` and `reg`, replacing the objects an earlier
# cell fitted on X_train/y_train — confirm later cells expect the full-data fit.
grid, reg = fit_model(X, y)
grid.grid_scores_

[mean: 0.39603, std: 0.04104, params: {'max_depth': 1},
 mean: 0.66550, std: 0.03567, params: {'max_depth': 2},
 mean: 0.76373, std: 0.03430, params: {'max_depth': 3},
 mean: 0.79657, std: 0.02893, params: {'max_depth': 4},
 mean: 0.77304, std: 0.04639, params: {'max_depth': 5},
 mean: 0.73983, std: 0.05829, params: {'max_depth': 6},
 mean: 0.72228, std: 0.05612, params: {'max_depth': 7},
 mean: 0.70849, std: 0.05298, params: {'max_depth': 8},
 mean: 0.67368, std: 0.06869, params: {'max_depth': 9},
 mean: 0.66803, std: 0.06505, params: {'max_depth': 10}]

In [50]:
# Summary statistics of the target column 'MEDV'
minimum_price = np.min(data.MEDV)
maximum_price = np.max(data.MEDV)
mean_price = np.mean(data.MEDV)
median_price = np.median(data.MEDV)
# np.std uses ddof=0 unless told otherwise, i.e. the population standard deviation
std_price = np.std(data.MEDV)

# Show the calculated statistics.
# Call-style print with a single argument behaves the same on Python 2 and Python 3.
print("Statistics for Boston housing dataset:\n")
print("Minimum price: ${:,.2f}".format(minimum_price))
print("Maximum price: ${:,.2f}".format(maximum_price))
print("Mean price: ${:,.2f}".format(mean_price))
print("Median price ${:,.2f}".format(median_price))
print("Standard deviation of prices: ${:,.2f}".format(std_price))


# Additional exploration: average feature values of the houses priced exactly at
# the minimum, maximum and median. The rows are selected with the statistics
# computed above rather than hard-coded prices, so the selection stays correct
# if the data changes.
print("average features of houses with minimum value")
print(data[data.MEDV == minimum_price].mean())

print("average features of houses with maximum value")
print(data[data.MEDV == maximum_price].mean())

# NOTE: with an even number of rows the median can fall between two prices, in
# which case no row matches and the averages below come out as NaN.
print("average features of houses with median value")
print(data[data.MEDV == median_price].mean())

Statistics for Boston housing dataset:

Minimum price: $105,000.00
Maximum price: $1,024,800.00
Mean price: $454,342.94
Median price $438,900.00
Standard deviation of prices: $165,171.13
average features of houses with minimum value
RM              5.568
LSTAT          26.785
PTRATIO        20.200
MEDV       105000.000
dtype: float64
average features of houses with maximum value
RM               8.398
LSTAT            5.910
PTRATIO         13.000
MEDV       1024800.000
dtype: float64
average features of houses with median value
RM              5.8805
LSTAT           9.0200
PTRATIO        17.6500
MEDV       438900.0000
dtype: float64


In [48]:
# Re-display the first rows of the full frame (features plus 'MEDV')
data.head()

Unnamed: 0,RM,LSTAT,PTRATIO,MEDV
0,6.575,4.98,15.3,504000.0
1,6.421,9.14,17.8,453600.0
2,7.185,4.03,17.8,728700.0
3,6.998,2.94,18.7,701400.0
4,7.147,5.33,18.7,760200.0


In [51]:
# (rows, columns) of the full frame
data.shape

(489, 4)

**Answer:**

* **RM**: An increase in the value of 'RM' would lead to an increase in the 'MEDV' value. House prices tend to be higher when there are more rooms in a house.

* **LSTAT**: An increase in the value of 'LSTAT' would lead to a decrease in the 'MEDV' value. An area with a poorer population tends to have lower house prices: people sometimes perceive poorer areas to be associated with less social stability and a higher crime rate. These perceptions tend to have a negative impact on housing demand in such areas, especially from people with higher incomes who can afford to pay a higher price.

* **PTRATIO**: An increase in the value of 'PTRATIO' would lead to a decrease in the 'MEDV' value. A higher 'PTRATIO' value indicates that, on average, a teacher has to cater for more students in a school. There is a general perception that the quality of education suffers when a teacher has to teach a larger group of students. For this reason, the 'PTRATIO' feature could be perceived as an indicator of school quality in an area: a higher 'PTRATIO' value indicates lower school quality, while a lower 'PTRATIO' value indicates higher school quality. Therefore, areas with a higher 'PTRATIO' tend to have less housing demand and, in turn, cheaper house prices, as indicated by the 'MEDV' value.