In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import heapq
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
#Load training, testing, and validation data
# Fixed seed for the shuffle below: without it the row order (and therefore
# anything derived from row position) differs on every run.
SHUFFLE_SEED = 42

data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
# frac=1 returns every row in shuffled order; reset_index relabels them 0..n-1.
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
print(data.shape)

In [3]:
#Split into train and validation
N_TRAIN = 1101

# Rows [0, N_TRAIN) are used for training and rows [N_TRAIN, end) for validation.
# The slice end is exclusive, so no "-1" is needed; writing `0:N_TRAIN-1` and
# `N_TRAIN:-1` silently discarded row N_TRAIN-1 and the very last row.
# .copy() gives each frame its own data, because both are modified later on.
train_data = data.iloc[:N_TRAIN].copy()
valid_data = data.iloc[N_TRAIN:].copy()
train_data.head(15)

In [5]:
#Data Cleaning
# Thresholds: a column / row is dropped when it has MORE nulls than this.
MAX_NULLS_PER_COLUMN = 250
MAX_NULLS_PER_ROW = 5

#Drop columns with too many null values
# Null counts are measured on the training frame only; the same columns are
# then removed from all three frames so they keep matching feature sets.
column_null_counts = train_data.isnull().sum()
sparse_cols = column_null_counts.index[column_null_counts > MAX_NULLS_PER_COLUMN].tolist()
train_data = train_data.drop(columns=sparse_cols)
valid_data = valid_data.drop(columns=sparse_cols, errors='ignore')
test_data = test_data.drop(columns=sparse_cols, errors='ignore')

#Drop rows with too many null values
# Select the rows by index LABEL (DataFrame.drop is label-based), instead of
# collecting row positions and hoping they coincide with the labels.
row_null_counts = train_data.isnull().sum(axis=1)
drop_row_list = row_null_counts.index[row_null_counts > MAX_NULLS_PER_ROW].tolist()
print(drop_row_list)
# reset_index removes the gaps left by the dropped rows, so that positions and
# labels of the training frame agree again afterwards.
train_data = train_data.drop(index=drop_row_list).reset_index(drop=True)


#Reason to drop this column comes from later in EDA
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train_data = train_data.drop(columns='GarageYrBlt', errors='ignore')
valid_data = valid_data.drop(columns='GarageYrBlt', errors='ignore')
test_data = test_data.drop(columns='GarageYrBlt', errors='ignore')

In [6]:
# Partition the feature columns of the training frame into numeric and
# categorical lists; 'Id' and 'SalePrice' belong to neither list.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_cols = train_data.select_dtypes(include=numerics).columns.tolist()
for excluded in ('Id', 'SalePrice'):
    numeric_cols.remove(excluded)
print(numeric_cols)
# Everything that is not numeric (and not Id / SalePrice) is categorical.
categorical_cols = [
    name
    for name in train_data.columns
    if name not in numeric_cols and name not in ('Id', 'SalePrice')
]

In [7]:
#Fill NULL values
# Report how many nulls each numeric training column still has.
for col_name in numeric_cols:
    print(col_name, ': ', train_data[col_name].isnull().sum())

# Missing values in these columns are replaced by 0.0 in all three frames.
ZERO_FILL_COLS = ['LotFrontage', 'MasVnrArea']
for frame in (train_data, valid_data, test_data):
    # Assign the filled columns back to the frame. The chained form
    # `frame[col].fillna(..., inplace=True)` operates on a temporary Series and
    # does not update the frame under pandas Copy-on-Write.
    frame[ZERO_FILL_COLS] = frame[ZERO_FILL_COLS].fillna(0.0)

In [None]:
#Create slideshow of plots
#For each numerical column, do a scatter plot
print(train_data.shape)
# One figure per numeric feature against the target. There is no input() pause
# between figures: it blocks (or raises) when the notebook is run
# non-interactively, and inline figures can simply be scrolled through.
# 'Id' needs no special case here because it was removed from numeric_cols
# when that list was built.
for col_name in numeric_cols:
    fig, ax = plt.subplots()
    ax.scatter(train_data[col_name], train_data['SalePrice'])
    ax.set_xlabel(col_name)
    ax.set_ylabel('SalePrice')
    ax.set_title('SalePrice vs ' + col_name)
    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)

In [8]:
#Implement first set of notes
YES_NO = {True: 'Y', False: 'N'}

#Categoricalize new columns
new_categoricals = ['MSSubClass','OverallQual','OverallCond','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath',
                   'BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars','MoSold','YrSold']
for col_name in new_categoricals:
    # Move the column from the numeric list to the categorical list (once).
    if col_name in numeric_cols:
        numeric_cols.remove(col_name)
        categorical_cols.append(col_name)
    train_data[col_name] = train_data[col_name].astype(int)
    for frame in (valid_data, test_data):
        # Fill and cast in one assigned expression. The chained
        # `frame[col].fillna(..., inplace=True)` does not update the frame under
        # pandas Copy-on-Write, after which astype(int) fails on remaining NaN.
        frame[col_name] = frame[col_name].fillna(0.0).astype(int)

#Discretize PoolArea column
# Guarded so that re-running the cell does not compare the already converted
# 'Y'/'N' strings with 0 or remove 'PoolArea' from numeric_cols a second time.
if 'PoolArea' in numeric_cols:
    for frame in (train_data, valid_data, test_data):
        # 'Y' when the area is > 0, otherwise (including NaN) 'N'.
        frame['PoolArea'] = frame['PoolArea'].gt(0).map(YES_NO)
    numeric_cols.remove('PoolArea')
    categorical_cols.append('PoolArea')

#Add new categorical columns to see if things are zero
for col_name in numeric_cols:
    flag_col = col_name + '_is_zero'
    for frame in (train_data, valid_data, test_data):
        # 'Y' when the value is below 0.001, otherwise (including NaN) 'N'.
        frame[flag_col] = frame[col_name].lt(0.001).map(YES_NO)
    # Avoid duplicate entries when the cell is executed more than once.
    if flag_col not in categorical_cols:
        categorical_cols.append(flag_col)

In [None]:
#For each categorical column, do a bar chart
# Bar height is the mean SalePrice of the training rows in each category.
# groupby skips rows whose category is NaN, so a missing value never ends up
# as a bar label (mixing NaN with string labels can make plt.bar fail).
# No input() pause: it blocks or raises when the notebook runs non-interactively.
for col_name in categorical_cols:
    mean_price = train_data.groupby(col_name)['SalePrice'].mean()
    fig, ax = plt.subplots()
    # Labels are converted to text so integer-coded categories are drawn as
    # evenly spaced categories rather than positions on a numeric axis.
    ax.bar(mean_price.index.astype(str), mean_price.values)
    ax.set_xlabel(col_name)
    ax.set_ylabel('Mean SalePrice')
    plt.show()
    plt.close(fig)

print(train_data['Street'].unique())

**Notes from Slideshow**
--Make sure to always int cast before categoricalizing

1. Categoricalize MSSubClass
2. Categoricalize OverallQual
3. Categoricalize OverallCond
4. TotalBsmtSF, 1stFlrSF, 2ndFlrSF, and GrLivArea have the highest correlations with SalePrice that I have seen
5. Categoricalize BsmtFullBath, BsmtHalfBath, FullBath, HalfBath
6. Categoricalize BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageCars
7. Categoricalize PoolArea as 0, >0
8. Categoricalize MoSold, YrSold

In [9]:
#Look at the correlations among all the numerical columns
# Pairwise correlation matrix of the numeric features plus the target, with the
# rows ordered by their correlation with SalePrice (ascending).
train_data_corr = (
    train_data[numeric_cols + ['SalePrice']]
    .corr()
    .sort_values(by='SalePrice')
)
train_data_corr.head(50)

In [10]:
#Random forest regressor needs numerical columns, so categories are integer-coded
def numericalize(col_name, df):
    """Return column ``col_name`` of ``df`` encoded as integer category codes.

    The distinct non-null values are sorted and numbered 0, 1, 2, ... in that
    order, so the result is deterministic (it no longer depends on set
    iteration order). Missing values are encoded as -1.

    The codes are derived from ``df`` alone: two frames encode a value the same
    way only if they contain the same set of distinct values.

    Parameters
    ----------
    col_name : label of the column to encode.
    df : pandas.DataFrame containing that column.

    Returns
    -------
    pandas.Series of int, indexed like ``df[col_name]``.
    """
    column = df[col_name]
    # Sort by (type name, value) so values of different types are never
    # compared with each other directly.
    categories = sorted(
        column.dropna().unique(),
        key=lambda value: (type(value).__name__, value),
    )
    codes = {value: code for code, value in enumerate(categories)}
    # map() looks every value up independently, so overlapping keys and codes
    # (e.g. 1 -> 0, 2 -> 1) cannot interfere; unmapped (missing) values become NaN.
    return column.map(codes).fillna(-1).astype(int)

# Add an integer-coded '<col>_num' twin of every categorical column to all
# three frames.
categorical_cols_num = []
for col in categorical_cols:
    num_col = col + '_num'
    categorical_cols_num.append(num_col)
    # Encode the three frames in ONE numericalize call. Calling it once per
    # frame builds an independent value->code mapping for each frame, so the
    # same category could receive different codes in train, validation and test.
    stacked = pd.concat(
        [train_data[[col]], valid_data[[col]], test_data[[col]]],
        keys=['train', 'valid', 'test'],
    )
    encoded = numericalize(col, stacked)
    # Selecting one outer key returns that frame's rows under their original
    # index labels, so the assignment aligns row by row.
    train_data[num_col] = encoded.loc['train']
    valid_data[num_col] = encoded.loc['valid']
    test_data[num_col] = encoded.loc['test']

In [11]:
#Implement scaling features, first convert to float
#Helps for some models, not sure about XGBRegressor
# Standardise every numeric column: subtract the mean and divide by the
# standard deviation. Both statistics are taken from the training frame and
# then applied unchanged to the validation and test frames.
for col_name in numeric_cols:
    frames = (train_data, valid_data, test_data)
    for frame in frames:
        frame[col_name] = frame[col_name].astype(float)
    # Statistics are computed before the training column itself is rescaled.
    mean_value = train_data[col_name].mean()
    std_value = train_data[col_name].std()
    for frame in frames:
        frame[col_name] = (frame[col_name] - mean_value) / std_value

In [20]:
def model_with_no_outliers(model, cols_list, data=None, n_std=2):
    """Refit ``model`` after removing the rows it currently predicts worst.

    A row counts as an outlier when the absolute difference between its
    'SalePrice' and the model's current prediction exceeds ``n_std`` times the
    standard deviation of 'SalePrice'.

    Parameters
    ----------
    model : already fitted estimator exposing ``predict(X)`` and ``fit(X, y)``.
    cols_list : list of feature column names passed to the model.
    data : DataFrame holding ``cols_list`` and 'SalePrice'. Defaults to the
        notebook-level ``train_data``.
    n_std : outlier threshold, in standard deviations of 'SalePrice'.

    Returns
    -------
    Whatever ``model.fit`` returns after fitting on the remaining rows.
    """
    if data is None:
        data = train_data

    threshold = n_std * data['SalePrice'].std()
    off_amounts = (data['SalePrice'] - model.predict(data[cols_list])).abs()

    # Select rows with a boolean mask. Passing row POSITIONS to the label-based
    # DataFrame.drop removes the wrong rows (or raises KeyError) whenever the
    # index is not exactly 0..n-1, e.g. after rows have been dropped.
    keep = ~(off_amounts > threshold)
    data_no_outliers = data.loc[keep].reset_index(drop=True)
    model = model.fit(data_no_outliers[cols_list], data_no_outliers['SalePrice'])
    return model

In [27]:
#Test different models
# Each model is fitted on the training frame and scored on the validation
# frame, then refitted without outliers and scored again.
target_train = train_data['SalePrice']
target_valid = valid_data['SalePrice']
combined_cols = numeric_cols + categorical_cols_num

# Linear regression on the numeric features only.
model1 = LinearRegression()
model1.fit(train_data[numeric_cols], target_train)
print(model1.score(valid_data[numeric_cols], target_valid))
model1 = model_with_no_outliers(model1, numeric_cols)
print(model1.score(valid_data[numeric_cols], target_valid))

# Random forest on the integer-coded categorical features.
model2 = RandomForestRegressor(n_estimators=80, max_depth=8, random_state=10)
model2.fit(train_data[categorical_cols_num], target_train)
print(model2.score(valid_data[categorical_cols_num], target_valid))
model2 = model_with_no_outliers(model2, categorical_cols_num)
print(model2.score(valid_data[categorical_cols_num], target_valid))

# Gradient boosting on the integer-coded categorical features.
model3 = xgb.XGBRegressor(max_depth=3)
model3.fit(train_data[categorical_cols_num], target_train)
print(model3.score(valid_data[categorical_cols_num], target_valid))
model3 = model_with_no_outliers(model3, categorical_cols_num)
print(model3.score(valid_data[categorical_cols_num], target_valid))

# Gradient boosting on numeric + coded categorical features; only the score
# after outlier removal is printed for this one.
model4 = xgb.XGBRegressor(max_depth=5)
model4.fit(train_data[combined_cols], target_train)
model4 = model_with_no_outliers(model4, combined_cols)
print(model4.score(valid_data[combined_cols], target_valid))
#model5 = VotingRegressor(estimators = [('2', model2), ('4', model4)], weights = [0.6, 0.4])



In [29]:
# Predict SalePrice for the test frame with the selected model and export it.
best_model = model4
test_data['SalePrice'] = best_model.predict(test_data[numeric_cols + categorical_cols_num])
ans = test_data[['Id', 'SalePrice']]
# index=False: by default to_csv also writes the DataFrame's row index as an
# extra unnamed first column; the file is presumably meant to contain only
# Id and SalePrice (confirm against the expected submission format).
ans.to_csv('tuned_output.csv', index=False)