## Starter for NYC Housing Prices Competition UTKML 2017

In [239]:
import numpy as np
import pandas as pd

import sklearn
# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Model building
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [240]:
def drop_columns(df, columns):
    """
    Drops unwanted columns.
    
    Parameters:
    * df [dataframe, Pandas object]
    
    * columns [list]
        - columns that you want to drop from df.
    """
    for col in columns:
        df.drop([col], axis=1, inplace=True)


def get_seasons(df):
    """
    Creates a season feature.
    
    Parameters
    ----------
    * df [dataframe, Pandas object]:
    """
    df['date'] = pd.to_datetime(df['date'])
    
    season = []
    for date in df['date']:
        if date.month in [3, 4, 5]:
            season.append('spring')
        elif date.month in [6, 7, 8]:
            season.append('summer')
        elif date.month in [9, 10, 11]:
            season.append('fall')
        else:
            season.append('winter')
    
    df['season'] = season
            

def to_onehot(df, features):
    """
    One hot encoding 
    
    Parameters
    ----------
    * df [dataframe, Pandas]:
    
    * features [list]:
        - df columns you want to one-hot encode.
    """
    le = LabelEncoder()
    for feature in features:
        df[feature] = le.fit_transform(df[feature]) 

    return df

# Creating season feature
get_seasons(train)
get_seasons(test)

# Uncomment the next to print() statements
# to see the how one-hot encoding 
# changes the feature values.

#print(train['season'][0:10])

# One-hot encoding features
features_to_encode = ['season', 'yr_built', 'yr_renovated']
train = to_onehot(train, features_to_encode)
test = to_onehot(test, features_to_encode)

#print(train['season'][0:10])

# Drop unwanted features
cols = ['id', 'date']
drop_columns(train, cols)
drop_columns(test, cols)

### Creating a Validation Set

In [241]:
# Train, validation split
X = train.drop(['price'], axis=1).values
y = train['price'].values

# Create a separate validation set to get a sense for our generalization.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10)

### Random Forest Regressor

In [242]:
reg_rf = RandomForestRegressor()

reg_rf.fit(X_train, y_train)
preds_rf = reg_rf.predict(X_val)

mae_rf = mean_absolute_error(y_pred=preds_rf, y_true=y_val)
print('Random forest MAE = {0:.3f}'.format(mae_rf))

Random forest MAE = 73420.863


### Gradient Boosted Regressor

In [243]:
reg_gbm = GradientBoostingRegressor()

reg_gbm.fit(X_train, y_train)
preds_gbm = reg_gbm.predict(X_val)

mae_gbm = mean_absolute_error(y_pred=preds_gbm, y_true=y_val)
print('Gradient boosted regressor MAE = {0:.3f}'.format(mae_gbm))

Gradient boosted regressor MAE = 74236.599
