In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import sklearn.linear_model as linear_model
from sklearn.linear_model import LinearRegression
from matplotlib.ticker import FormatStrFormatter
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('./train.csv')
submit = pd.read_csv('./test.csv')

### Feature Selection + Data Cleaning

My initial assumption for starting the modeling process was to start with a few features and work up in complexity until I found a balance that yielded the best score.  This started by selecting a couple features that had a strong correlation to SalePrice.  Later, I added more and more features to the model which ended up returning my best score, or lowest RMSE.

In [37]:
df.corrwith(df['SalePrice']).sort_values(ascending = False)
#General idea of which columns correlated strongly with SalePrice

##features = ['Year Built','Year Remod/Add','Gr Liv Area'
         ##,'Total Bsmt SF','Garage Area','1st Flr SF', 'Full Bath','Fireplaces','TotRms AbvGrd']

##sns.heatmap(df[features].corr())

#Model 1 correlations between predictors

SalePrice          1.000000
Overall Qual       0.800207
Gr Liv Area        0.697038
Garage Area        0.649897
Garage Cars        0.647781
Total Bsmt SF      0.629303
1st Flr SF         0.618486
Year Built         0.571849
Year Remod/Add     0.550370
Full Bath          0.537969
TotRms AbvGrd      0.504014
Mas Vnr Area       0.503579
Fireplaces         0.471093
BsmtFin SF 1       0.423856
Lot Frontage       0.341842
Open Porch SF      0.333476
Wood Deck SF       0.326490
Lot Area           0.296566
Bsmt Full Bath     0.283332
Half Bath          0.283001
2nd Flr SF         0.248452
Bsmt Unf SF        0.190861
Bedroom AbvGr      0.137067
Screen Porch       0.134581
3Ssn Porch         0.048732
Mo Sold            0.032735
Pool Area          0.023106
BsmtFin SF 2       0.016432
Misc Val          -0.007375
Yr Sold           -0.015203
Low Qual Fin SF   -0.041594
Bsmt Half Bath    -0.045290
Id                -0.051398
MS SubClass       -0.087335
Overall Cond      -0.097019
Kitchen AbvGr     -0

Here we some code directly form my model one where in I selected a variety of numerical features that correlated with SalePriece

### Data Cleaning

Data cleaning had presented itself as the most important aspect for producing a good score.  There were several values that lacked proper data (presumably the standard for data scientists) that I needed to fill or replace to run in my model.  I found out later that this was not the right practice and when discussing with class mates I found that the proper method to handle these missing values was to either drop them entirely or replace them with a more telling value such as mean or median.  That being said, I will defend my actions for choosing zeros and none for the data.

I found that many of the circumstances in which data was missing was likely due to a feature not being on a house.  I often confirmed this by finding the matching values of other identifiers.  Example being, garage-finish and garage sqft.  I found that if a house had no information regarding their garage/basement of other feature I would presume there was nothing there at all.  Thus, filling categorical values with None (later dummying them) and numericals with 0 would yield the proper data.

In [4]:
df['Garage Finish'].fillna('None', inplace = True)
df['Garage Qual'].fillna('None', inplace = True)
df['Garage Finish'].fillna('None', inplace = True)
df['Garage Type'].fillna('None', inplace = True)
df['Garage Cond'].fillna('None', inplace = True)
df['Bsmt Exposure'].fillna('None', inplace = True)
df['BsmtFin Type 2'].fillna('None', inplace = True)
df['BsmtFin Type 1'].fillna('None', inplace = True)
df['Bsmt Cond'].fillna('None', inplace = True)
df['Bsmt Qual'].fillna('None', inplace = True)
df['Mas Vnr Type'].fillna('None', inplace = True)
df['Garage Yr Blt'].fillna('None', inplace = True)


df['Garage Cars'].fillna(0, inplace=True)
df['Garage Area'].fillna(0, inplace=True)
df['Total Bsmt SF'].fillna(0, inplace = True)
df['BsmtFin SF 1'].fillna(0, inplace = True)
df['Bsmt Full Bath'].fillna(0, inplace = True)
df['Bsmt Unf SF'].fillna(0, inplace = True)
df['Bsmt Half Bath'].fillna(0, inplace = True)
df['Mas Vnr Area'].fillna(0, inplace = True)
df['BsmtFin SF 2'].fillna(0, inplace = True)

df.drop(['Pool QC','Misc Feature','Alley','Fence','Fireplace Qu'], axis = 1, inplace = True)

submit['Mas Vnr Area'].fillna(0, inplace = True)

#Dropping and cleaning a variety of test predictors,  many of these were used during my modeling stage.

After cleaning I determined my features.  The first step to this was splitting them into numerical and categorical values.  This was important because I often ran transformative functions on the numerical data before my categorical ones.  Later, I would dummy out my categorical columns to produce binary values corresponding to the individual categories.

In [5]:
numeric = df._get_numeric_data()
num_cols = df._get_numeric_data().drop(['Id','PID'], axis = 1).columns
num_cols.shape

categorical = df.select_dtypes(exclude=["number","bool_"]).columns

numeric_sub = submit._get_numeric_data
num_cols_sub = submit._get_numeric_data().columns

#Numeric and Categorical columns for a general test including nearly all predictors

In [6]:
features= num_cols_sub

In [7]:
dummies = pd.get_dummies(df[categorical])

#Dummies created from categorical features

In [8]:
#dummies.corrwith(df['SalePrice']).sort_values(ascending = False)
#Corr_test = dummies.drop(features,axis = 1, inplace = True)

In [9]:
dummies_features = ['Foundation_PConc','BsmtFin Type 1_GLQ','Neighborhood_NridgHt',
         'Exter Qual_Gd','Bsmt Exposure_Gd','Sale Type_New','Garage Type_Attchd','Exterior 1st_VinylSd',
         'Exterior 2nd_VinylSd','Mas Vnr Type_Stone','Kitchen Qual_Gd','Paved Drive_Y','Central Air_Y','Garage Cond_TA',
         'Roof Style_Hip','Neighborhood_NoRidge','Mas Vnr Type_BrkFace','Neighborhood_StoneBr','Electrical_SBrkr',
                'MS Zoning_RM','Bsmt Exposure_No','Lot Shape_Reg','Heating QC_TA','Foundation_CBlock',
                'Garage Type_Detchd','Mas Vnr Type_None','Garage Finish_Unf','Bsmt Qual_TA','Kitchen Qual_TA','Exter Qual_TA']

In [35]:
df_corr = df.corrwith(df['SalePrice'])

#Determining highest positive and negative correlations

In [11]:
#sorted_df_corr = df_corr.sort_values(ascending = False)

#sorted_df_corr

#Sorted Corrs

In [12]:
#GridSearch from previous test

#grid_params = {
 #   ]
#}
#gs = GridSearchCV(
  #  LinearRegression(),
 #   grid_params,)

#gs_results = gs.fit(X_train,y_train)

Determining my X and Y values

In [13]:
X = df[features]
X.drop(['Garage Yr Blt'], axis = 1, inplace = True)
X_submit = submit[features]
X_submit.drop(['Garage Yr Blt'], axis = 1, inplace = True)
y = df['SalePrice']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


I included many of the steps I took during my testing phases.  While many of these worked in theory they often did not produce a score better than the simplest linear model.  

In [14]:
#Polyfeatures

#from sklearn.preprocessing import PolynomialFeatures
#pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
#pf = pf.fit(X)
#X = pf.transform(X)

#X = pd.DataFrame(X)

In [15]:
#Polyfeatures

#pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
#pf = pf.fit(X_submit)
#X_submit = pf.transform(X_submit)

#X_submit = pd.DataFrame(X_submit)

In [16]:
dummies = pd.get_dummies(df,columns =categorical,drop_first = True)
submit_dummies = pd.get_dummies(submit,columns =categorical, drop_first = True)


In [17]:
#Features from previous tests


#X[['Overall Qual_2','Overall Qual_3',
             # 'Overall Qual_4','Overall Qual_5','Overall Qual_6',
              #'Overall Qual_7','Overall Qual_8','Overall Qual_9','Overall Qual_10']] = df[['Overall Qual_2','Overall Qual_3',
              #'Overall Qual_4','Overall Qual_5','Overall Qual_6',
              #'Overall Qual_7','Overall Qual_8','Overall Qual_9','Overall Qual_10']]

In [18]:
#Features from pevious tests


#X_submit[['Overall Qual_2','Overall Qual_3',
              #'Overall Qual_4','Overall Qual_5','Overall Qual_6',
              #'Overall Qual_7','Overall Qual_8','Overall Qual_9','Overall Qual_10']] = submit[['Overall Qual_2','Overall Qual_3',
              #'Overall Qual_4','Overall Qual_5','Overall Qual_6',
              #'Overall Qual_7','Overall Qual_8','Overall Qual_9','Overall Qual_10']]

In [19]:
submit_dummies_final = submit_dummies[dummies_features]
dummies_final = dummies[dummies_features]

X.shape

(2051, 37)

In [20]:
X_submit=pd.concat([X_submit,submit_dummies_final],axis = 1)
X=pd.concat([X,dummies_final],axis = 1)

I concatenate the two data types into one X, on both the training and test data.  Thisway we can fit, score and predict on our test set.  For validation purposes I train test split, splitting my test set into 2 an allowing me to cross validate my model.

In [21]:
X = X.drop('Lot Frontage', axis = 1)
X_submit = X_submit.drop('Lot Frontage', axis = 1)

In [22]:
#ss = StandardScaler()

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [24]:
#X_train_scaled = ss.fit_transform(X_train)
#X_test_scaled = ss.transform(X_test)

In [25]:
X_train.dtypes

Id                        int64
PID                       int64
MS SubClass               int64
Lot Area                  int64
Overall Qual              int64
Overall Cond              int64
Year Built                int64
Year Remod/Add            int64
Mas Vnr Area            float64
BsmtFin SF 1            float64
BsmtFin SF 2            float64
Bsmt Unf SF             float64
Total Bsmt SF           float64
1st Flr SF                int64
2nd Flr SF                int64
Low Qual Fin SF           int64
Gr Liv Area               int64
Bsmt Full Bath          float64
Bsmt Half Bath          float64
Full Bath                 int64
Half Bath                 int64
Bedroom AbvGr             int64
Kitchen AbvGr             int64
TotRms AbvGrd             int64
Fireplaces                int64
Garage Cars             float64
Garage Area             float64
Wood Deck SF              int64
Open Porch SF             int64
Enclosed Porch            int64
                         ...   
Foundati

### Modeling and Scoring

I fit my model with my train and test sets to score them accordingly.  After I found a satisfactory score between my train and test sets I would go and fit my model with the entire dataset.  I found that fitting my model with only the X_train it would yield a worse prediction score.  I presumed this was due to only using 50% of the potential data for predictions.  

In [40]:
lm = linear_model.LinearRegression()

model = lm.fit(X_train,y_train)

predictions  =  model.predict(X_submit)
score        =  model.score(X_train,y_train)
betacoef     =  model.coef_

In [41]:
score

0.887215446212806

In [43]:
lm = linear_model.LinearRegression()

model = lm.fit(X_train,y_train)

predictions  =  model.predict(X_submit)
score        =  model.score(X_test,y_test)
betacoef     =  model.coef_

In [44]:
score

0.8377974650651789

### Submission

Applying my predictions to the test set and creating my Submission.

In [46]:
#X_submit_scaled=ss.fit_transform(X_submit)

submit['SalePrice'] = predictions



submission10 = submit[['Id','SalePrice']]

submission10.to_csv("Submission10.csv")

#Submission process