ref: this notebook was forked from https://www.kaggle.com/himaoka/house-simple-svr-support-vector-regression

In [1]:
import warnings
from sklearn.exceptions import DataConversionWarning
# Silence the warning categories that would otherwise clutter the cell outputs
for _ignored_category in (DataConversionWarning, DeprecationWarning, FutureWarning):
    warnings.filterwarnings(action='ignore', category=_ignored_category)

* [Load Data and Libraries](#load)
* [Check Data](#check-data)
* [Data Pre-Processing](#pre-processing)
* [Training and Prediction](#training-prediction)

# Load Data and Libraries <a id="load"></a>

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler

# Raise pandas' display limits so wide and long frames are not truncated
for _option, _limit in (('display.max_columns', 100), ('display.max_rows', 500)):
    pd.set_option(_option, _limit)


In [8]:
# Load the competition CSV files
train = pd.read_csv("./input/train.csv")
compe = pd.read_csv("./input/test.csv")
sample_sub = pd.read_csv("./input/sample_submission.csv")

# Stack train and test rows so every pre-processing step is applied to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. The original row labels are kept (no re-indexing), and
# sort=True yields the alphabetical column order shown in the outputs below.
data = pd.concat([train, compe], sort=True)

# Check Data <a id="check-data"></a>

There are 81 columns in the data.

In [9]:
# Number of columns, followed by the column labels themselves
print(data.shape[1])
data.columns

81


Index(['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr',
       'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath',
       'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2',
       'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'Exterior1st',
       'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces', 'Foundation',
       'FullBath', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond',
       'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'GrLivArea',
       'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle', 'Id', 'KitchenAbvGr',
       'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotConfig',
       'LotFrontage', 'LotShape', 'LowQualFinSF', 'MSSubClass', 'MSZoning',
       'MasVnrArea', 'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold',
       'Neighborhood', 'OpenPorchSF', 'OverallCond', 'OverallQual',
       'PavedDrive', 'PoolArea', 'Po

Check what kind of data each column contains

In [10]:
# Data example.
# A fixed random_state makes the displayed rows reproducible across
# Restart & Run All; without it every run shows a different sample.
data.sample(n=20, random_state=0)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,Condition1,Condition2,Electrical,EnclosedPorch,ExterCond,ExterQual,Exterior1st,Exterior2nd,Fence,FireplaceQu,Fireplaces,Foundation,FullBath,Functional,GarageArea,GarageCars,GarageCond,GarageFinish,GarageQual,GarageType,GarageYrBlt,GrLivArea,HalfBath,Heating,HeatingQC,HouseStyle,Id,KitchenAbvGr,KitchenQual,LandContour,LandSlope,LotArea,LotConfig,LotFrontage,LotShape,LowQualFinSF,MSSubClass,MSZoning,MasVnrArea,MasVnrType,MiscFeature,MiscVal,MoSold,Neighborhood,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,PoolQC,RoofMatl,RoofStyle,SaleCondition,SalePrice,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
340,832,629,0,,4,1Fam,TA,No,0.0,0.0,Unf,Unf,0.0,0.0,TA,832.0,Y,Artery,Norm,FuseF,0,TA,TA,MetalSd,MetalSd,,,0,CBlock,2,Typ,384.0,1.0,TA,Unf,TA,Detchd,1949.0,1461,0,GasA,TA,1.5Fin,1801,1,TA,Lvl,Gtl,10800,Inside,60.0,Reg,0,50,RL,0.0,,,0,8,NAmes,204,7,4,Y,0,,CompShg,Gable,Normal,,WD,0,Pave,6,832.0,AllPub,0,1949,1996,2009
785,1342,0,0,,3,1Fam,TA,No,251.0,630.0,Rec,LwQ,0.0,0.0,TA,491.0,Y,Feedr,Norm,SBrkr,0,TA,TA,BrkFace,Wd Sdng,,Gd,1,CBlock,2,Typ,457.0,2.0,TA,Unf,TA,Attchd,1967.0,1342,0,GasA,TA,1Story,786,1,TA,Lvl,Gtl,9790,Inside,,Reg,0,20,RL,0.0,,,0,9,NWAmes,0,5,6,Y,0,,CompShg,Gable,Normal,161500.0,WD,197,Pave,7,1372.0,AllPub,0,1967,1967,2009
1285,1008,0,0,,2,1Fam,TA,No,658.0,0.0,GLQ,Unf,1.0,0.0,TA,350.0,Y,Norm,Norm,FuseA,0,TA,TA,MetalSd,MetalSd,MnPrv,,0,CBlock,1,Typ,280.0,1.0,TA,RFn,TA,Attchd,1952.0,1008,0,GasA,Ex,1Story,2746,1,TA,Lvl,Gtl,12778,Inside,66.0,Reg,0,20,RL,0.0,,,0,1,NAmes,154,6,5,Y,0,,CompShg,Gable,Normal,,WD,0,Pave,4,1008.0,AllPub,0,1952,2003,2006
114,1436,884,0,,3,1Fam,TA,No,774.0,150.0,ALQ,LwQ,1.0,0.0,TA,104.0,Y,Norm,Norm,SBrkr,0,TA,TA,Wd Sdng,Wd Sdng,MnPrv,TA,1,CBlock,2,Typ,180.0,1.0,TA,Unf,TA,Detchd,1945.0,2320,1,GasA,Ex,2Story,115,1,Gd,Lvl,Mod,7259,Inside,61.0,IR1,0,70,RL,0.0,,,0,7,Crawfor,0,8,6,Y,0,,CompShg,Gambrel,Normal,259500.0,WD,0,Pave,9,1028.0,AllPub,224,1945,2002,2007
502,483,504,0,,2,Twnhs,TA,No,483.0,0.0,LwQ,Unf,0.0,0.0,TA,0.0,Y,Norm,Norm,SBrkr,0,TA,TA,HdBoard,HdBoard,,,0,CBlock,1,Typ,352.0,1.0,TA,Unf,TA,Detchd,1975.0,987,1,GasA,Gd,2Story,1963,1,TA,Lvl,Gtl,1890,Inside,21.0,Reg,0,160,RM,422.0,BrkFace,,0,4,BrDale,0,6,6,Y,0,,CompShg,Gable,Normal,,WD,0,Pave,5,483.0,AllPub,411,1972,1972,2008
686,874,887,0,,3,1Fam,TA,No,0.0,0.0,Unf,Unf,0.0,0.0,Gd,874.0,Y,Norm,Norm,SBrkr,0,TA,Gd,VinylSd,VinylSd,,,0,PConc,3,Typ,578.0,2.0,TA,Fin,TA,Attchd,2007.0,1761,0,GasA,Ex,2Story,687,1,Gd,Lvl,Gtl,10207,Inside,84.0,Reg,0,60,FV,0.0,,,0,8,Somerst,105,6,7,Y,0,,CompShg,Gable,Partial,227875.0,New,0,Pave,7,874.0,AllPub,144,2007,2007,2007
701,1164,0,0,,3,1Fam,TA,No,0.0,0.0,Unf,Unf,0.0,0.0,TA,1164.0,Y,Norm,Norm,SBrkr,0,TA,TA,HdBoard,HdBoard,,,0,CBlock,1,Typ,528.0,2.0,TA,Unf,TA,Attchd,1969.0,1164,1,GasA,TA,1Story,702,1,TA,Lvl,Gtl,9600,Inside,80.0,Reg,0,20,RL,168.0,BrkFace,,0,7,NWAmes,0,5,7,Y,0,,CompShg,Hip,Normal,140000.0,COD,0,Pave,6,1164.0,AllPub,0,1969,1969,2006
1033,1654,0,0,,3,1Fam,TA,No,986.0,0.0,GLQ,Unf,1.0,0.0,Gd,668.0,Y,Norm,Norm,SBrkr,0,TA,Gd,VinylSd,VinylSd,,,0,PConc,2,Typ,900.0,3.0,TA,Unf,TA,Attchd,2002.0,1654,0,GasA,Ex,1Story,1034,1,Gd,Lvl,Gtl,8125,Inside,,Reg,0,20,RL,295.0,Stone,,0,2,CollgCr,136,5,7,Y,0,,CompShg,Gable,Normal,230000.0,WD,0,Pave,6,1654.0,AllPub,0,2002,2002,2006
1318,1084,867,0,Grvl,4,2fmCon,TA,No,0.0,0.0,Unf,Unf,0.0,0.0,TA,938.0,N,Norm,Norm,SBrkr,28,TA,Gd,MetalSd,MetalSd,,,0,PConc,2,Typ,576.0,2.0,TA,Unf,TA,Detchd,1993.0,1951,0,GasA,Gd,1.5Fin,2779,2,Fa,Lvl,Gtl,7745,Inside,56.0,Reg,0,190,RM,0.0,,,0,4,OldTown,6,6,4,P,0,,CompShg,Gable,Normal,,WD,0,Pave,9,938.0,AllPub,0,1900,1950,2006
1325,492,0,0,,1,1Fam,TA,No,416.0,0.0,LwQ,Unf,1.0,0.0,Fa,76.0,Y,Norm,Norm,SBrkr,78,TA,TA,AsbShng,AsbShng,,,0,BrkTil,1,Typ,200.0,1.0,TA,Unf,Fa,Detchd,1921.0,492,0,GasA,TA,1Story,2786,1,TA,Lvl,Gtl,7830,Inside,52.0,Reg,0,30,RM,0.0,,,0,6,OldTown,0,5,3,N,0,,CompShg,Gable,Normal,,WD,0,Pave,3,492.0,AllPub,0,1921,1950,2006


Check the type of each variable

In [11]:
# One row per column of `data` holding its dtype, ordered by dtype (descending)
types = data.dtypes.to_frame(name='type').sort_values(by='type', ascending=False)
types

Unnamed: 0,type
Heating,object
FireplaceQu,object
Foundation,object
Functional,object
GarageCond,object
GarageFinish,object
GarageQual,object
GarageType,object
Street,object
HeatingQC,object


For data pre-processing, categorize the variables into three groups: 'Numerical Variables', 'Categorical Variables(int)', and 'Categorical Variables(string)'  
  
**Numerical Variables:** float and int variables  
['MSSubClass', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YrSold']  
  
**Categorical Variables(int):** some of the int variables  
['OverallQual', 'OverallCond', 'MoSold']  
  
**Categorical Variables(string):** string variables  
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

Check how much data is missing in each column

In [12]:
# Check missing values
def check_missing(df):
    """Summarise the missing values of each column.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by the column names of ``df`` with two columns:
        ``Num`` (number of null entries) and ``Rate`` (that number as a
        percentage of ``len(df)``).
    """
    missing_count = df.isnull().sum()
    missing_rate = 100 * missing_count / len(df)
    return pd.DataFrame({'Num': missing_count, 'Rate': missing_rate})

# Show every column's dtype next to its missing count and rate, highest rate first
print("Data #%d" % len(data))
cols = check_missing(data)
types.join(cols).sort_values(by="Rate", ascending=False)

Data #2919


Unnamed: 0,type,Num,Rate
PoolQC,object,2909,99.657417
MiscFeature,object,2814,96.402878
Alley,object,2721,93.216855
Fence,object,2348,80.438506
SalePrice,float64,1459,49.982871
FireplaceQu,object,1420,48.646797
LotFrontage,float64,486,16.649538
GarageCond,object,159,5.447071
GarageFinish,object,159,5.447071
GarageQual,object,159,5.447071


# Data Pre-Processing <a id="pre-processing"></a>

Drop variables with more than 40% of their values missing. (`SalePrice` also exceeds 40%, but it is the prediction target and is kept.)

In [13]:
# Drop the variables with more than 40% missing values.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after the columns are gone is a no-op rather
# than a KeyError.
data = data.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'],
                 errors='ignore')

Process categorical variables (string)
1. Fill missing data with the most frequent value
2. One-hot encode

In [14]:
# Fill missing data and replace each string category with dummy columns
categorical_variables_string = \
    ['MSZoning', 'Street', 'LotShape', 'LandContour', 
     'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 
     'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 
     'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 
     'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 
     'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
     'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 
     'Electrical', 'KitchenQual', 'Functional', 'GarageType', 
     'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 
     'SaleType', 'SaleCondition']

# Only handle columns that are still present, so re-running the cell after the
# columns have already been encoded is a no-op instead of a KeyError.
columns_to_encode = [name for name in categorical_variables_string
                     if name in data.columns]

for v in columns_to_encode:
    # Fill NaN with the most frequent value (the first mode when there are ties)
    data[v] = data[v].fillna(data[v].mode()[0])

if columns_to_encode:
    # One-hot encode every column in a single call rather than rebuilding the
    # whole frame once per column; the dummy columns are appended in list
    # order, so the resulting column order is the same as encoding one by one.
    data = pd.get_dummies(data, columns=columns_to_encode, drop_first=True)

Process categorical variables(int)
1. Do nothing, because there's no missing data

In [15]:
# Integer-coded categorical columns: no missing data, so they are left as-is
categorical_variables_int = ['OverallQual', 'OverallCond', 'MoSold']

Process numerical variables
1. Fill missing data with the column mean
2. Standardize values

In [16]:
# Numerical columns: impute missing values, then standardize
numerical_variavles = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
    'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'YrSold',
]

ss = StandardScaler()
for col in numerical_variavles:
    # Replace NaN with the column mean, computed over all rows of `data`
    filled = data[col].fillna(data[col].mean())
    # Standardize the column; the same scaler object is re-fit for each column
    data[col] = ss.fit_transform(filled.to_frame())

The data after processing looks like this

In [17]:
# Data example.
# A fixed random_state makes the displayed rows reproducible across
# Restart & Run All; without it every run shows a different sample.
data.sample(n=10, random_state=0)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,Fireplaces,FullBath,GarageArea,GarageCars,GarageYrBlt,GrLivArea,HalfBath,Id,KitchenAbvGr,LotArea,LotFrontage,LowQualFinSF,MSSubClass,MasVnrArea,MiscVal,MoSold,OpenPorchSF,OverallCond,OverallQual,PoolArea,SalePrice,ScreenPorch,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSeWa,...,Heating_OthW,Heating_Wall,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,PavedDrive_P,PavedDrive_Y,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
553,-0.697388,1.067397,-0.103331,0.169927,0.352564,-0.29313,-0.819679,3.822419,-0.629896,-0.359601,-0.924311,0.781366,0.005227,0.306528,0.598733,0.354255,1.232599,2014,-0.207698,-0.214074,-0.906006,-0.101197,0.067331,0.099659,-0.089592,10,0.718034,5,6,-0.06315,,-0.285935,0.349546,-0.376241,0.397535,0.716075,0.466021,0.157646,0,0,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
76,-0.529147,-0.785025,-0.103331,-1.045801,-0.468594,-0.29313,-0.819679,-0.249895,0.371485,-0.359601,-0.924311,-1.027363,-0.881821,-1.006906,-0.889392,-1.084582,-0.756321,77,-0.207698,-0.214708,0.0,-0.101197,-0.873616,-0.57225,-0.089592,4,-0.702843,7,4,-0.06315,135750.0,-0.285935,-1.562366,-0.226451,-0.74076,-0.505602,-1.352965,0.157646,0,0,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
144,1.448957,-0.785025,-0.103331,3.817112,1.955357,-0.29313,1.087023,-0.249895,-0.374999,-0.359601,-0.924311,0.781366,0.144554,0.306528,-0.607855,0.449123,-0.756321,145,4.455931,-0.13545,0.032578,-0.101197,0.773042,1.309096,-0.089592,11,-0.702843,5,5,-0.06315,125000.0,-0.285935,2.261457,1.534723,-0.74076,-0.274474,-1.017889,-1.363569,0,0,0,1,1,0,0,1,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
471,-1.365253,0.761771,-0.103331,0.169927,-0.244642,-0.29313,-0.819679,-0.249895,-0.607138,-0.359601,-0.924311,-1.027363,-0.152677,0.306528,0.92049,-0.42248,1.232599,1932,-0.207698,0.033084,-0.342855,-0.101197,0.067331,-0.57225,-0.089592,7,-0.702843,5,5,-0.06315,,-0.285935,-0.287758,-0.970864,0.444964,0.683057,0.370284,0.918253,0,0,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
374,-0.46287,1.282035,-0.103331,3.817112,-0.598135,-0.29313,-0.819679,-0.249895,0.66052,1.228362,-0.924311,0.781366,-2.196138,-2.320339,0.0,0.717916,1.232599,1835,-0.207698,-0.547338,-1.328368,-0.101197,3.125411,0.480408,-0.089592,5,2.079708,5,7,-0.06315,,-0.285935,1.624153,-0.072121,-0.74076,-2.288589,0.753229,0.918253,0,0,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
351,-0.914062,0.722109,-0.103331,0.169927,-0.969192,-0.29313,-0.819679,-0.249895,0.546727,3.439056,-0.924311,-1.027363,-2.196138,-2.320339,0.0,-0.106252,-0.756321,1812,-0.207698,-0.004326,-0.62443,-0.101197,-0.167905,-0.57225,-0.089592,7,-0.702843,5,5,-0.06315,,-0.285935,-0.287758,-0.569153,-0.108374,-2.024443,-1.640173,0.918253,0,0,0,1,1,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1370,-0.814647,0.46081,-0.103331,-1.045801,-0.277576,0.327629,-0.819679,-0.249895,-0.320379,2.722915,-0.924311,-1.027363,-0.626388,-1.006906,-0.446977,-0.250531,-0.756321,1371,-0.207698,-0.604657,0.971162,-0.101197,-0.167905,-0.57225,-0.089592,10,-0.702843,6,4,-0.06315,105000.0,-0.285935,-0.287758,-0.480641,-0.74076,-1.69426,-1.640173,0.918253,0,0,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
610,-1.242896,-0.785025,-0.103331,-1.045801,-0.969192,-0.29313,-0.819679,-0.249895,0.25314,-0.359601,-0.924311,-1.027363,-0.765715,-1.006906,0.759611,-1.637981,-0.756321,2071,-0.207698,-0.604657,-0.436714,-0.101197,-0.638379,-0.57225,-0.089592,7,0.895644,7,4,-0.06315,,-0.285935,-1.562366,-0.861926,-0.045135,-1.033894,0.992569,0.157646,0,0,1,0,1,0,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
520,-1.186816,0.61479,-0.103331,0.169927,-0.969192,-0.29313,-0.819679,-0.249895,-1.276243,2.909735,-0.924311,0.781366,-2.196138,-2.320339,0.0,-0.408645,-0.756321,521,4.455931,0.080131,-0.436714,-0.101197,3.125411,-0.57225,-0.089592,8,0.984448,7,4,-0.06315,106250.0,-0.285935,0.349546,-2.387066,0.998302,-2.354625,0.753229,0.157646,0,0,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1255,-1.604869,0.498139,-0.103331,-1.045801,-0.969192,-0.29313,-0.819679,-0.249895,-0.070033,-0.359601,-0.924311,0.781366,0.1074,0.306528,1.000929,-0.8316,1.232599,2716,-0.207698,-0.72551,0.0,-0.101197,2.4197,-0.57225,-0.089592,7,0.037197,5,7,-0.06315,,-0.285935,-1.562366,-1.184202,-0.74076,1.046258,0.896833,-1.363569,1,0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0


In [18]:
# Split the combined frame back into training and test rows.
# SalePrice is missing for exactly the rows that came from test.csv (see the
# missing-value table above), so its availability identifies the training rows
# without hard-coding the training-set size (1460).
has_price = data['SalePrice'].notna()
train = data[has_price].copy()
test  = data[~has_price].copy()

# Training and Prediction <a id="training-prediction"></a>

To select features for training, use scikit-learn's feature selection module

In [19]:
# Candidate predictors: every column except the target and the row identifier
possible_features = train.columns.drop(['SalePrice', 'Id'])

# Check feature importances
# `k` must be passed by keyword (current scikit-learn rejects it positionally);
# 'all' keeps every feature, exactly as k=len(possible_features) did.
selector = SelectKBest(f_regression, k='all')
selector.fit(train[possible_features], train['SalePrice'])
# Score each feature as -log10 of its p-value, so a larger score means a
# smaller p-value
scores = -np.log10(selector.pvalues_)
# Feature positions ordered from highest to lowest score
indices = np.argsort(scores)[::-1]
print('Feature importances:')
for i in indices:
    print('%.2f %s' % (scores[i], possible_features[i]))

Feature importances:
312.66 OverallQual
222.35 GrLivArea
168.60 GarageCars
157.28 GarageArea
151.02 TotalBsmtSF
146.27 1stFlrSF
136.37 ExterQual_TA
120.91 FullBath
107.56 TotRmsAbvGrd
102.52 YearBuilt
100.89 KitchenQual_TA
98.48 GarageFinish_Unf
95.50 YearRemodAdd
91.85 BsmtQual_TA
91.51 Foundation_PConc
82.39 MasVnrArea
80.79 GarageYrBlt
79.21 Fireplaces
73.86 ExterQual_Gd
67.59 BsmtFinType1_GLQ
57.14 Neighborhood_NridgHt
52.47 BsmtFinSF1
47.17 MasVnrType_None
44.53 SaleType_New
43.66 GarageType_Detchd
43.12 SaleCondition_Partial
40.91 Foundation_CBlock
38.85 LotFrontage
37.82 MasVnrType_Stone
37.80 Neighborhood_NoRidge
36.40 WoodDeckSF
35.76 KitchenQual_Gd
35.39 BsmtExposure_No
35.24 2ndFlrSF
34.46 OpenPorchSF
33.75 HeatingQC_TA
32.58 BsmtExposure_Gd
32.38 Exterior2nd_VinylSd
32.08 Exterior1st_VinylSd
28.57 MSZoning_RM
27.78 HalfBath
24.65 LotShape_Reg
23.95 LotArea
21.74 CentralAir_Y
20.68 MSZoning_RL
20.32 HouseStyle_2Story
20.27 SaleType_WD
20.23 Electrical_SBrkr
19.18 RoofStyle_H

This time, pick variables by their importance  
  
**Possible features(Ordered by importances)**  
    ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', 
     '1stFlrSF', 'ExterQual_TA', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 
     'KitchenQual_TA', 'GarageFinish_Unf', 'YearRemodAdd', 'BsmtQual_TA', 
     'Foundation_PConc', 'MasVnrArea', 'GarageYrBlt', 'Fireplaces', 
     'ExterQual_Gd', 'BsmtFinType1_GLQ', 'Neighborhood_NridgHt', 'BsmtFinSF1', 
     'MasVnrType_None', 'SaleType_New', 'GarageType_Detchd', 'SaleCondition_Partial', 
     'Foundation_CBlock', 'LotFrontage', 'MasVnrType_Stone', 'Neighborhood_NoRidge', 
     'WoodDeckSF', 'KitchenQual_Gd', 'BsmtExposure_No', '2ndFlrSF', 'OpenPorchSF', 
     'HeatingQC_TA', 'BsmtExposure_Gd', 'Exterior2nd_VinylSd', 'Exterior1st_VinylSd', 
     'MSZoning_RM', 'HalfBath', 'LotShape_Reg', 'LotArea', 'CentralAir_Y', 'MSZoning_RL', 
     'HouseStyle_2Story', 'SaleType_WD', 'Electrical_SBrkr', 'RoofStyle_Hip', 'GarageType_BuiltIn', 
     'BsmtQual_Gd', 'GarageType_Attchd', 'PavedDrive_Y', 'BsmtFullBath', 'RoofStyle_Gable', 
     'Neighborhood_StoneBr', 'BsmtUnfSF', 'MasVnrType_BrkFace', 'Neighborhood_OldTown', 
     'Neighborhood_NAmes', 'Neighborhood_Edwards', 'GarageFinish_RFn', 'RoofMatl_WdShngl', 
     'BedroomAbvGr', 'Exterior1st_MetalSd', 'Neighborhood_IDOTRR', 'Exterior2nd_MetalSd', 
     'Exterior2nd_Wd Sdng', 'Exterior1st_Wd Sdng', 'KitchenQual_Fa', 'SaleCondition_Normal', 
     'Neighborhood_BrkSide', 'LotConfig_CulDSac', 'Neighborhood_Somerst', 'ExterCond_Fa', 
     'GarageCond_TA', 'KitchenAbvGr', 'BsmtFinType1_Rec', 'HeatingQC_Gd', 'HeatingQC_Fa', 
     'Exterior1st_CemntBd', 'GarageQual_Fa', 'BsmtFinType1_Unf', 'BsmtFinType1_BLQ', 
     'GarageCond_Fa', 'BsmtQual_Fa', 'EnclosedPorch', 'Neighborhood_Sawyer', 'Exterior2nd_CmentBd', 
     'Electrical_FuseF', 'Neighborhood_Timber', 'LotShape_IR2', 'LandContour_HLS', 'Foundation_Slab', 
     'Condition1_Feedr', 'Functional_Typ', 'ExterQual_Fa', 'BldgType_Duplex', 'Condition1_Norm', 
     'Neighborhood_MeadowV', 'ScreenPorch', 'ExterCond_TA', 'RoofMatl_CompShg', 'Neighborhood_BrDale', 
     'BldgType_Twnhs', 'BldgType_2fmCon', 'GarageQual_TA', 'Exterior1st_HdBoard', 'HouseStyle_SFoyer', 
     'Heating_GasA', 'PoolArea', 'Heating_Grav', 'MSZoning_FV', 'BsmtCond_Gd', 'PavedDrive_P', 
     'HouseStyle_1.5Unf', 'BsmtFinType1_LwQ', 'MSSubClass', 'LotConfig_Inside', 'OverallCond', 
     'Exterior2nd_ImStucc', 'Neighborhood_CollgCr', 'Functional_Min2', 'Neighborhood_Crawfor', 
     'GarageType_CarPort', 'Functional_Maj2', 'Exterior2nd_HdBoard', 'MSZoning_RH', 'Functional_Min1', 
     'Neighborhood_SWISU', 'Neighborhood_Veenker', 'GarageCond_Po', 'HouseStyle_1Story', 'Heating_Wall', 
     'Neighborhood_Mitchel', 'BsmtFinType2_BLQ', 'BsmtFinType2_Unf', 'Neighborhood_ClearCr', 
     'BsmtCond_Po', 'Exterior2nd_Plywood', 'Exterior1st_WdShing', 'Exterior1st_BrkComm', 
     'SaleCondition_AdjLand', 'ExterCond_Gd', 'Condition1_PosN', 'Condition2_PosN', 'Condition2_Feedr', 
     'Electrical_FuseP', 'Condition2_PosA', 'Exterior2nd_Brk Cmn', 'Condition1_RRAe', 
     'SaleCondition_Family', 'MoSold', 'GarageQual_Po', 'LandContour_Low', 'Exterior2nd_Other', 
     'RoofMatl_WdShake', '3SsnPorch', 'BsmtExposure_Mn', 'GarageQual_Gd', 'LandSlope_Mod', 
     'Exterior2nd_Stucco', 'Condition1_PosA', 'SaleType_ConLD', 'SaleType_Con', 'Street_Pave', 
     'Exterior2nd_Wd Shng', 'BsmtFinType2_Rec', 'Condition2_RRNn', 'HouseStyle_SLvl', 
     'Neighborhood_NPkVill', 'BsmtFinType2_LwQ', 'Electrical_Mix', 'LotShape_IR3', 'HouseStyle_2.5Fin', 
     'Exterior1st_Stone', 'Neighborhood_Gilbert', 'RoofStyle_Gambrel', 'SaleType_Oth', 'ExterCond_Po', 
     'Exterior1st_BrkFace', 'HeatingQC_Po', 'Condition2_Norm', 'Exterior1st_Stucco', 'GarageType_Basment', 
     'YrSold', 'LandSlope_Sev', 'LandContour_Lvl', 'SaleType_ConLw', 'Exterior1st_ImStucc', 
     'Exterior1st_AsphShn', 'HouseStyle_2.5Unf', 'Heating_OthW', 'LowQualFinSF', 'Exterior2nd_CBlock', 
     'Exterior1st_CBlock', 'BsmtCond_TA', 'Exterior2nd_BrkFace', 'Exterior2nd_AsphShn', 
     'Neighborhood_NWAmes', 'Condition1_RRNn', 'MiscVal', 'RoofStyle_Shed', 'Neighborhood_Blueste', 
     'Heating_GasW', 'RoofMatl_Membran', 'SaleType_CWD', 'LotConfig_FR3', 'Exterior1st_Plywood', 
     'Functional_Sev', 'BsmtHalfBath', 'Exterior2nd_Stone', 'Functional_Mod', 'SaleCondition_Alloca', 
     'Neighborhood_SawyerW', 'Condition2_RRAn', 'RoofMatl_Roll', 'SaleType_ConLI', 'Utilities_NoSeWa', 
     'Foundation_Stone', 'BsmtFinSF2', 'LotConfig_FR2', 'Condition1_RRAn', 'RoofMatl_Tar&Grv', 
     'Condition1_RRNe', 'BldgType_TwnhsE', 'Condition2_RRAe', 'Foundation_Wood', 'GarageCond_Gd', 
     'RoofStyle_Mansard', 'RoofMatl_Metal', 'BsmtFinType2_GLQ']

In [20]:
# Columns fed to the model
fparams = ['OverallQual', 'YearBuilt']

# Extract the target vector and the train/test feature matrices as NumPy arrays
train_target = train["SalePrice"].values
train_features, test_features = (frame[fparams].values for frame in (train, test))

Here we simply use SVR for prediction, tuned with GridSearchCV

In [21]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split

# Hyper-parameter grid explored for the RBF-kernel SVR
svrgs_parameters = dict(
    kernel=['rbf'],
    C=[150000, 200000, 250000],
    gamma=[0.004, 0.0045, 0.005],
)

# 8-fold cross-validated grid search, scored by negative mean squared log error
svr_cv = GridSearchCV(
    estimator=svm.SVR(),
    param_grid=svrgs_parameters,
    cv=8,
    scoring='neg_mean_squared_log_error',
)
svr_cv.fit(train_features, train_target)

# Report the best cross-validation score and the parameters that produced it
print("SVR GridSearch score: %s" % svr_cv.best_score_)
print("SVR GridSearch params: ")
print(svr_cv.best_params_)

SVR GridSearch score: -0.0514210457436
SVR GridSearch params: 
{'C': 250000, 'gamma': 0.004, 'kernel': 'rbf'}


Output prediction result to a file

In [24]:
# Predict with the best estimator found by the grid search
prediction = svr_cv.best_estimator_.predict(test_features)
# Take the Ids from `test` (the same rows test_features was built from) rather
# than re-reading test.csv, so Ids and predictions are aligned by construction
# and the file is not read a second time.
pred = pd.DataFrame({'Id': test['Id'].values, 'SalePrice': prediction})
pred.to_csv("./input/submission_out.csv", index = False)