In [44]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
n_rows = train_data.shape[0]

In [4]:
for col in train_data.columns:
    print(col, ':\t', train_data[col].isna().sum()/n_rows*100)

Id :	 0.0
MSSubClass :	 0.0
MSZoning :	 0.0
LotFrontage :	 17.73972602739726
LotArea :	 0.0
Street :	 0.0
Alley :	 93.76712328767123
LotShape :	 0.0
LandContour :	 0.0
Utilities :	 0.0
LotConfig :	 0.0
LandSlope :	 0.0
Neighborhood :	 0.0
Condition1 :	 0.0
Condition2 :	 0.0
BldgType :	 0.0
HouseStyle :	 0.0
OverallQual :	 0.0
OverallCond :	 0.0
YearBuilt :	 0.0
YearRemodAdd :	 0.0
RoofStyle :	 0.0
RoofMatl :	 0.0
Exterior1st :	 0.0
Exterior2nd :	 0.0
MasVnrType :	 0.547945205479452
MasVnrArea :	 0.547945205479452
ExterQual :	 0.0
ExterCond :	 0.0
Foundation :	 0.0
BsmtQual :	 2.5342465753424657
BsmtCond :	 2.5342465753424657
BsmtExposure :	 2.6027397260273974
BsmtFinType1 :	 2.5342465753424657
BsmtFinSF1 :	 0.0
BsmtFinType2 :	 2.6027397260273974
BsmtFinSF2 :	 0.0
BsmtUnfSF :	 0.0
TotalBsmtSF :	 0.0
Heating :	 0.0
HeatingQC :	 0.0
CentralAir :	 0.0
Electrical :	 0.0684931506849315
1stFlrSF :	 0.0
2ndFlrSF :	 0.0
LowQualFinSF :	 0.0
GrLivArea :	 0.0
BsmtFullBath :	 0.0
BsmtHalfBath :	 0.

Above we can see the percentage of missing values for each column. Those with very high percentage of missing values will be dropped.

In [5]:
train_data = train_data.drop(columns=['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'])

In [6]:
train_data = train_data.set_index('Id', drop=True)

In [7]:
y = pd.Series(train_data['SalePrice'], name='SalePrice')
X = train_data.loc[:, train_data.columns != 'SalePrice']

In [12]:
imputer = SimpleImputer()
scalar = MinMaxScaler()

In [14]:
X_trans = pd.DataFrame(imputer.fit_transform(scalar.fit_transform(X.select_dtypes(exclude='object'))))
X_trans.columns = X.select_dtypes(exclude='object').columns
X_trans.index = X.index

  return self.partial_fit(X, y)


In [15]:
X_trans.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.235294,0.150685,0.03342,0.666667,0.5,0.949275,0.883333,0.1225,0.125089,0.0,...,0.38646,0.0,0.111517,0.0,0.0,0.0,0.0,0.0,0.090909,0.5
2,0.0,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.0,0.173281,0.0,...,0.324401,0.347725,0.0,0.0,0.0,0.0,0.0,0.0,0.363636,0.25
3,0.235294,0.160959,0.046507,0.666667,0.5,0.934783,0.866667,0.10125,0.086109,0.0,...,0.428773,0.0,0.076782,0.0,0.0,0.0,0.0,0.0,0.727273,0.5
4,0.294118,0.133562,0.038561,0.666667,0.5,0.311594,0.333333,0.0,0.038271,0.0,...,0.45275,0.0,0.063985,0.492754,0.0,0.0,0.0,0.0,0.090909,0.0
5,0.235294,0.215753,0.060576,0.777778,0.5,0.927536,0.833333,0.21875,0.116052,0.0,...,0.589563,0.224037,0.153565,0.0,0.0,0.0,0.0,0.0,1.0,0.5


In [32]:
feature_filter = SelectKBest(f_regression, k=10)

In [33]:
X_new = feature_filter.fit_transform(X_trans, y)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y)

In [45]:
model = GradientBoostingRegressor()

In [46]:
model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [47]:
y_pred = model.predict(X_test)

In [48]:
mean_squared_error(y_test, y_pred)

732962977.5480701

In [49]:
model.fit(X_test, y_test)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [50]:
test_data = pd.read_csv('test.csv')

In [65]:
test_id_data = test_data['Id']

In [64]:
col_list = X_trans.columns[feature_filter.get_support()]

In [66]:
test_data = imputer.fit_transform(scalar.fit_transform(test_data[col_list]))

  return self.partial_fit(X, y)


0       1461
1       1462
2       1463
3       1464
4       1465
5       1466
6       1467
7       1468
8       1469
9       1470
10      1471
11      1472
12      1473
13      1474
14      1475
15      1476
16      1477
17      1478
18      1479
19      1480
20      1481
21      1482
22      1483
23      1484
24      1485
25      1486
26      1487
27      1488
28      1489
29      1490
        ... 
1429    2890
1430    2891
1431    2892
1432    2893
1433    2894
1434    2895
1435    2896
1436    2897
1437    2898
1438    2899
1439    2900
1440    2901
1441    2902
1442    2903
1443    2904
1444    2905
1445    2906
1446    2907
1447    2908
1448    2909
1449    2910
1450    2911
1451    2912
1452    2913
1453    2914
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64