In [2]:
# Import Libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Imputer

from scipy.stats import skew

import warnings
warnings.filterwarnings('ignore') 


In [3]:
#Import Data
df_train = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\House_Prices\train.csv")
df_test = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\House_Prices\test.csv")
print ('data loaded')
print (str(len(df_train))+" rows for training set")
print (str(len(df_test))+" rows for test set")

data loaded
1460 rows for training set
1459 rows for test set


### Define Median Absolute Deviation Function

In [4]:
def is_outlier(points, thresh = 3.5):
    if len(points.shape) == 1:
        points = points[:,None]
    median = np.median(points, axis=0)
    diff = np.sum((points - median)**2, axis=-1)
    diff = np.sqrt(diff)
    med_abs_deviation = np.median(diff)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh

### Remove Skew from SalesPrice data as required by the competition
Select the last column as target

In [5]:
target = df_train[df_train.columns.values[-1]]
target_log = np.log(target)

### Merge Train and Test to evaluate ranges and missing values excluding the last column
This was done primarily to ensure that Categorical data in the training and testing data sets were consistent.

In [6]:
df_train = df_train[df_train.columns.values[:-1]]
df = df_train.append(df_test, ignore_index = True)

### Find all categorical data

In [7]:
cats = []
for col in df.columns.values:
    if df[col].dtype == 'object':
        cats.append(col)

### Create separte datasets for Continuous vs Categorical

In [8]:
df_cont = df.drop(cats, axis=1)
df_cat = df[cats]

### Handle Missing Data for continuous data
1. If any column contains more than 50 entries of missing data, drop the column
2. If any column contains fewer that 50 entries of missing data, replace those missing values with the median for that column (the median imputation used on missing values is very crude. For example, Area features with missing values may be this way because the property does not have that feature (e.g. a pool) so it would make more sense to set this to zero. )
3. Remove outliers using Median Absolute Deviation
4. Calculate skewness for each variable and if greater than 0.75 transform it
5. Apply the sklearn.Normalizer to each column

In [9]:
for col in df_cont.columns.values:
    if np.sum(df_cont[col].isnull()) > 50:
        #print("Removing Column: {}".format(col))
        df_cont = df_cont.drop(col, axis = 1)
    elif np.sum(df_cont[col].isnull()) > 0:
        #print("Replacing with Median: {}".format(col))
        median = df_cont[col].median()
        idx = np.where(df_cont[col].isnull())[0]
        df_cont[col].iloc[idx] = median
        
        
        outliers = np.where(is_outlier(df_cont[col]))
        df_cont[col].iloc[outliers] = median
        
               
        if skew(df_cont[col]) > 0.75:
            #print("Skewness Detected: {}".format(col))
            df_cont[col] = np.log(df_cont[col])
            df_cont[col] = df_cont[col].apply(lambda x: 0 if x == -np.inf else x)
        
        df_cont[col] = Normalizer().fit_transform(df_cont[col].reshape(1,-1))[0]
        

### Handle Missing Data for Categorical Data
1. If any column contains more than 50 entries of missing data, drop the column
2. If any column contains fewer that 50 entries of missing data, replace those values with the 'MIA'
3. Apply the sklearn.LabelEncoder
4. For each categorical variable determine the number of unique values and for each, create a new column that is binary

In [10]:
for col in df_cat.columns.values:
    if np.sum(df_cat[col].isnull()) > 50:
        df_cat = df_cat.drop(col, axis = 1)
        continue
    elif np.sum(df_cat[col].isnull()) > 0:
        df_cat[col] = df_cat[col].fillna('MIA')
        
    df_cat[col] = LabelEncoder().fit_transform(df_cat[col])
    
    num_cols = df_cat[col].max()
    for i in range(num_cols):
        col_name = col + '_' + str(i)
        df_cat[col_name] = df_cat[col].apply(lambda x: 1 if x == i else 0)
        
    df_cat = df_cat.drop(col, axis = 1)

### Merge Numeric and Categorical Datasets and Create Training and Testing Data

In [11]:
df_new = df_cont.join(df_cat)

df_train = df_new.iloc[:len(df_train) - 1]
df_train = df_train.join(target_log)

df_test = df_new.iloc[len(df_train) + 1:]

X_train = df_train[df_train.columns.values[1:-1]]
y_train = df_train[df_train.columns.values[-1]]

X_test = df_test[df_test.columns.values[1:]]

#### Print the length for checking

In [12]:
print (str(len(y_train))+" rows for training set")
print (str(len(X_train))+" rows for test set")

1459 rows for training set
1459 rows for test set


### Random Forest Regressor

In [13]:
from sklearn.ensemble import RandomForestRegressor
clf_random = RandomForestRegressor(n_estimators=500, n_jobs=-1)

clf_random.fit(X_train, y_train)
y_pred = np.expm1(clf_random.predict(X_test))


### For Submission

In [14]:
solution = pd.DataFrame({"id":df_test.Id, "SalePrice":y_pred}, columns=['id', 'SalePrice'])
solution.to_csv("random_regressor.csv", index = False)