In [3]:
# Import Libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from scipy.stats import skew

import warnings
warnings.filterwarnings('ignore') 


# sk learn import 
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,AdaBoostRegressor
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
pd.set_option('display.max_columns', None) # display all columns



In [4]:
#Import Data
df_train = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\Allstate\train.csv")
df_test = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\Allstate\test.csv")
print ('data loaded')
print (str(len(df_train))+" rows for training set")
print (str(len(df_test))+" rows for test set")

data loaded
188318 rows for training set
125546 rows for test set


### Define Median Absolute Deviation Function

In [5]:
def is_outlier(points, thresh = 3.5):
    if len(points.shape) == 1:
        points = points[:,None]
    median = np.median(points, axis=0)
    diff = np.sum((points - median)**2, axis=-1)
    diff = np.sqrt(diff)
    med_abs_deviation = np.median(diff)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh

### Remove Skew from SalesPrice data as required by the competition
Select the last column as target

In [6]:
target = df_train[df_train.columns.values[-1]]
target_log = (target)

### Merge Train and Test to evaluate ranges and missing values excluding the last column
This was done primarily to ensure that Categorical data in the training and testing data sets were consistent.

In [7]:
df_train = df_train[df_train.columns.values[:-1]]
df = df_train.append(df_test, ignore_index = True)

### Find all categorical data

In [8]:
cats = []
for col in df.columns.values:
    if df[col].dtype == 'object':
        cats.append(col)

### Create separte datasets for Continuous vs Categorical

In [9]:
df_cont = df.drop(cats, axis=1)
df_cat = df[cats]

### Handle Missing Data for continuous data
1. If any column contains more than 50 entries of missing data, drop the column
2. If any column contains fewer that 50 entries of missing data, replace those missing values with the median for that column (the median imputation used on missing values is very crude. For example, Area features with missing values may be this way because the property does not have that feature (e.g. a pool) so it would make more sense to set this to zero. )
3. Remove outliers using Median Absolute Deviation
4. Calculate skewness for each variable and if greater than 0.75 transform it
5. Apply the sklearn.Normalizer to each column

In [10]:
for col in df_cont.columns.values:
    if np.sum(df_cont[col].isnull()) > 50:
        #print("Removing Column: {}".format(col))
        df_cont = df_cont.drop(col, axis = 1)
    elif np.sum(df_cont[col].isnull()) > 0:
        #print("Replacing with Median: {}".format(col))
        median = df_cont[col].median()
        idx = np.where(df_cont[col].isnull())[0]
        df_cont[col].iloc[idx] = median
        
        
        outliers = np.where(is_outlier(df_cont[col]))
        df_cont[col].iloc[outliers] = median
        
               
        if skew(df_cont[col]) > 0.75:
            #print("Skewness Detected: {}".format(col))
            df_cont[col] = np.log(df_cont[col])
            df_cont[col] = df_cont[col].apply(lambda x: 0 if x == -np.inf else x)
        
        df_cont[col] = Normalizer().fit_transform(df_cont[col].reshape(1,-1))[0]
        

### Handle Missing Data for Categorical Data
1. If any column contains more than 50 entries of missing data, drop the column
2. If any column contains fewer that 50 entries of missing data, replace those values with the 'MIA'
3. Apply the sklearn.LabelEncoder
4. For each categorical variable determine the number of unique values and for each, create a new column that is binary

In [11]:
for col in df_cat.columns.values:
    if np.sum(df_cat[col].isnull()) > 50:
        df_cat = df_cat.drop(col, axis = 1)
        continue
    elif np.sum(df_cat[col].isnull()) > 0:
        df_cat[col] = df_cat[col].fillna('MIA')
        
    df_cat[col] = LabelEncoder().fit_transform(df_cat[col])
    
    num_cols = df_cat[col].max()
    for i in range(num_cols):
        col_name = col + '_' + str(i)
        df_cat[col_name] = df_cat[col].apply(lambda x: 1 if x == i else 0)
        
    df_cat = df_cat.drop(col, axis = 1)
    
   

### Merge Numeric and Categorical Datasets and Create Training and Testing Data

In [12]:
df_new = df_cont.join(df_cat)

df_train = df_new.iloc[:len(df_train) - 1]
df_train = df_train.join(target_log)

df_test = df_new.iloc[len(df_train) + 1:]

X_train = df_train[df_train.columns.values[1:-1]]
y_train = df_train[df_train.columns.values[-1]]

X_test = df_test[df_test.columns.values[1:]]

#### Print the length for checking

In [11]:
print (str(len(y_train))+" rows for training set")
print (str(len(X_train))+" rows for test set")

1459 rows for training set
1459 rows for test set


## Simple Linear Regression

In [12]:
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

print ('R-squared: %.4f' % model.score(X_train, y_train))

R-squared: 0.9382


### Evaluate Algorithms

In [13]:
seed = 7
processors=1
num_folds=3
num_instances=len(X_train)
# Define error measure for official scoring : RMSE
scorer = make_scorer(mean_squared_error, greater_is_better = False)

kfold = KFold(n=num_instances, n_folds=num_folds, random_state=seed)

### Algorithms spot-check

In [37]:
# Prepare some basic models
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
models = []
#Linear Regression without regularization
models.append(('LR', linear_model.LinearRegression()))
#Linear Regression with Ridge regularization (L2 penalty)
models.append(('Ridge', linear_model.RidgeCV()))
#Linear Regression with Lasso regularization 
models.append(('LassoCV', linear_model.LassoCV()))
#Linear Regression with ElasticNet regularization (L1 and L2 penalty)
models.append(('ElasticNetCV', linear_model.ElasticNetCV()))
#Lasso Lars
models.append(('LassoLars', linear_model.LassoLarsCV()))
#Stochastic Gradient Descent
models.append(('SGD', linear_model.SGDRegressor()))
#Bayesian Ridge Regression
models.append(('Bayesian Ridge Regression', linear_model.BayesianRidge()))
# Robustly fit linear model with RANSAC algorithm
models.append(('RANSAC', linear_model.RANSACRegressor(linear_model.LinearRegression())))

#DecisionTreeRegressor
models.append(('DTR', DecisionTreeRegressor(max_depth=4)))

#AdaBoostRegressor
models.append(('ABR', AdaBoostRegressor(DecisionTreeRegressor(max_depth=4))))

#Kneighbors Regressor
models.append(('K-nn', KNeighborsRegressor()))

#Extra Forest Regressor
models.append(('ETR', ExtraTreesRegressor(n_estimators=10)))
#Random Forest Regressor
models.append(('RFR', RandomForestRegressor()))

#Gradient Boosting Regressor
models.append(('GBR', GradientBoostingRegressor()))

#XGB Regressor
models.append(('XGBR', xgb.XGBRegressor()))


# Evaluate each model in turn
results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scorer, n_jobs=processors)
    results.append(cv_results)
    names.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))

LR: (-0.020) +/- (0.004)
Ridge: (-0.022) +/- (0.003)
LassoCV: (-0.044) +/- (0.008)
ElasticNetCV: (-0.044) +/- (0.008)
LassoLars: (-0.018) +/- (0.003)
SGD: (-103618737296652107791776291051536384.000) +/- (141140303960248657950612947344556032.000)
Bayesian Ridge Regression: (-0.022) +/- (0.003)
RANSAC: (-0.026) +/- (0.003)
DTR: (-0.045) +/- (0.002)
ABR: (-0.026) +/- (0.002)
K-nn: (-0.067) +/- (0.004)
ETR: (-0.024) +/- (0.003)
RFR: (-0.024) +/- (0.002)
GBR: (-0.018) +/- (0.003)
XGBR: (-0.018) +/- (0.003)


##### Random forest Regressor is providing the best performance

### Hyperparameter tuning

#### Grid Search Parameter Tuning

Grid search is an approach to parameter tuning that will methodically build and evaluate a model for each combination of algorithm parameters specified in a grid.

In [40]:
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
lr_grid = GridSearchCV(
    estimator = linear_model.Ridge(),
    param_grid = dict(alpha=alphas), 
    cv = kfold, 
    scoring = scorer, 
    n_jobs = processors)

lr_grid.fit(X_train, y_train)

print(lr_grid.best_score_)
print(lr_grid.best_params_)

-0.0197614612153116
{'alpha': 0.0001}


#### Random Search Parameter Tuning

Random search is an approach to parameter tuning that will sample algorithm parameters from a random distribution (i.e. uniform) for a fixed number of iterations. A model is constructed and evaluated for each combination of parameters chosen.

In [23]:
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats import uniform as sp_rand
param_grid = {'alpha': sp_rand()}
lr_grid = RandomizedSearchCV(
    estimator = linear_model.Ridge(),
    param_distributions=param_grid, 
    cv = kfold, 
    scoring = scorer, 
    n_jobs = processors)

lr_grid.fit(X_train, y_train)

print(lr_grid.best_score_)
print(lr_grid.best_params_)

-0.021673261855508927
{'alpha': 0.068889665481824847}


In [27]:
#Random Forest
rf_grid = GridSearchCV(
    estimator = RandomForestRegressor(warm_start=True, random_state=seed),
    param_grid = {
        'n_estimators': [100, 200],
        'criterion': ['mse'],
        'max_features': [18, 20],
        'max_depth': [8, 10],
        'bootstrap': [True]
    }, 
    cv = kfold, 
    scoring = scorer, 
    n_jobs = processors)

rf_grid.fit(X_train, y_train)

print(rf_grid.best_score_)
print(rf_grid.best_params_)

-0.021859409143386545
{'criterion': 'mse', 'max_depth': 10, 'max_features': 18, 'n_estimators': 200, 'bootstrap': True}


#### XGBoost while calculating best metaparameters

In [18]:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python

parameters_for_testing = {
    'colsample_bytree':[0.4],
    'gamma':[0.03],
    'min_child_weight':[1.5],
    'learning_rate':[0.07],
    'max_depth':[3],
    'n_estimators':[10000],
    'reg_alpha':[0.75],
    'reg_lambda':[0.45],
    'subsample':[0.95]
   
}

train_x= X_train
train_y= y_train
#train_dataset.drop("SalePrice", axis=1, inplace=True)
                    
#xgb_model = xgboost.XGBRegressor()
#gsearch1 = GridSearchCV(estimator = xgb_model, param_grid = parameters_for_testing, n_jobs=4,iid=False, cv=5)
#gsearch1.fit(train_x,train_y)
#gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

final_parameters = {
    'colsample_bytree':[0.4],
    'gamma':[0.03],
    'min_child_weight':[1.5],
    'learning_rate':[0.07],
    'max_depth':[3],
    'n_estimators':[1000],
    'reg_alpha':[0.75],
    'reg_lambda':[0.45],
    'subsample':[0.95]
   
}

best_xgb_model = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0.030,                 
                 learning_rate=0.07,
                 max_depth=5,
                 min_child_weight=1.5,
                 n_estimators=1000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.95)
best_xgb_model.fit(train_x,train_y)




XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.4,
       gamma=0.03, learning_rate=0.07, max_delta_step=0, max_depth=5,
       min_child_weight=1.5, missing=None, n_estimators=1000, nthread=-1,
       objective='reg:linear', reg_alpha=0.75, reg_lambda=0.45,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.95)

In [41]:
model_lassoLars = linear_model.LassoLarsCV(cv=20)
model_lassoLars.fit(X_train, y_train)

LassoLarsCV(copy_X=True, cv=20, eps=2.2204460492503131e-16,
      fit_intercept=True, max_iter=500, max_n_alphas=1000, n_jobs=1,
      normalize=True, positive=False, precompute='auto', verbose=False)

In [33]:
model_xgb = xgb.XGBRegressor(n_estimators=1000, max_depth=5, learning_rate=0.07) #the params were tuned using xgb.cv
model_xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.07, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [43]:
model_gbr = GradientBoostingRegressor(learning_rate=0.1, n_estimators=60,max_depth=9, min_samples_split=1200, min_samples_leaf=60, subsample=0.8, random_state=10)
model_gbr.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=9, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=60, min_samples_split=1200,
             min_weight_fraction_leaf=0.0, n_estimators=60, presort='auto',
             random_state=10, subsample=0.8, verbose=0, warm_start=False)

In [44]:
xgb_preds = np.expm1(model_xgb.predict(X_test))
lassoLars_preds = np.expm1(model_ridge.predict(X_test))
gbr_preds = np.expm1(model_gbr.predict(X_test))

#http://mlwave.com/kaggle-ensembling-guide/

The weights in the average (0.7, 0.3) are hyperparameters - I think I used a validation set to see what the best cutoff is. Basically this means I am weighting the preds from the lasso somewhat more heavily than the xgboost preds.

In [57]:
preds = 0.9*lassoLars_preds + 0.3* xgb_preds

### For Submission

In [58]:
solution = pd.DataFrame({"id":df_test.Id, "SalePrice":preds}, columns=['id', 'SalePrice'])
solution.to_csv("submission.csv", index = False)