In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression,f_regression
from sklearn.model_selection import train_test_split

In [116]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

In [117]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [118]:
train_df.shape

(1460, 81)

In [119]:
test_df.shape

(1459, 80)

In [120]:
train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

# Data Cleaning

Takeout the columns with more than 50% missing value

In [121]:
train_df=train_df.loc[:,train_df.columns[(train_df.isnull().sum()/1460)<0.5]]

In [122]:
test_df=test_df.loc[:,train_df.columns]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Take out columns with mostly the same values

In [123]:
dropped_columns=[]
for i in train_df.columns:
    if train_df[i].value_counts(normalize=True).values[0]>0.8:
        dropped_columns.append(i)

In [124]:
new_columns=list(set(train_df.columns)-set(dropped_columns))
train_df=train_df.loc[:,new_columns]
test_df=test_df.loc[:,new_columns]

take out More columns

In [125]:
train_df.drop(['LotFrontage','FireplaceQu'],axis=1,inplace=True)
test_df.drop(['LotFrontage','FireplaceQu'],axis=1,inplace=True)

In [126]:
train_df.drop(['MasVnrType','MasVnrArea'],axis=1,inplace=True)
test_df.drop(['MasVnrType','MasVnrArea'],axis=1,inplace=True)

In [127]:
train_df.drop(['BsmtFullBath'],axis=1,inplace=True)
test_df.drop(['BsmtFullBath'],axis=1,inplace=True)

Fill na for GarageArea, Garagecars, Exterior2nd, Exterior 1st, BsmtFinSf1, KitchenQual, TotalBsmtSF, BsmtUnfSF

In [128]:
test_df.GarageArea.fillna(test_df.GarageArea.mean(),inplace=True)
test_df.GarageCars.fillna(test_df.GarageCars.mean(),inplace=True)
test_df.Exterior2nd.fillna(test_df.Exterior2nd.value_counts().index[0],inplace=True)
test_df.MSZoning.fillna(method='backfill',inplace=True)
test_df.MSZoning.fillna(method='backfill',inplace=True)
test_df.Exterior1st.fillna(test_df.Exterior1st.value_counts().index[0],inplace=True)
test_df.BsmtFinSF1.fillna(test_df.BsmtFinSF1.mean(),inplace=True)
test_df.KitchenQual.fillna(test_df.KitchenQual.value_counts().index[0],inplace=True)
test_df.TotalBsmtSF.fillna(test_df.TotalBsmtSF.mean(),inplace=True)
test_df.BsmtUnfSF.fillna(test_df.BsmtUnfSF.mean(),inplace=True)

In [129]:
train_df.GarageType.fillna(train_df.GarageType.value_counts().index[0],inplace=True)
train_df.GarageFinish.fillna(train_df.GarageFinish.value_counts().index[0],inplace=True)
train_df.GarageYrBlt.fillna(train_df.GarageYrBlt.value_counts().index[0],inplace=True)

In [130]:
test_df.GarageType.fillna(test_df.GarageType.value_counts().index[0],inplace=True)
test_df.GarageFinish.fillna(test_df.GarageFinish.value_counts().index[0],inplace=True)
test_df.GarageYrBlt.fillna(test_df.GarageYrBlt.value_counts().index[0],inplace=True)

In [131]:
train_df.BsmtExposure.fillna(method='backfill',inplace=True)
train_df.BsmtFinType1.fillna(method='backfill',inplace=True)
train_df.BsmtQual.fillna(method='backfill',inplace=True)

In [132]:
test_df.BsmtExposure.fillna(method='backfill',inplace=True)
test_df.BsmtFinType1.fillna(method='backfill',inplace=True)
test_df.BsmtQual.fillna(method='backfill',inplace=True)

Split columns into categorical and numerical

In [133]:
train_df.drop(columns=['Id'],inplace=True)
y_train=train_df.loc[:,'SalePrice']
x_train=train_df.drop(columns=['SalePrice'])

In [134]:
test_df.drop(columns=['Id'],inplace=True)
y_test=test_df.loc[:,'SalePrice']
x_test=test_df.drop(columns=['SalePrice'])

In [135]:
cat_df=x_train.select_dtypes(exclude=[np.number])
num_df=x_train.select_dtypes(include=[np.number])

In [136]:
x_train_train,x_train_test,y_train_train,y_train_test=train_test_split(x_train,y_train,test_size=0.2,random_state=42)

In [137]:
encoder=OrdinalEncoder()
x_train_train.loc[:,cat_df.columns]=encoder.fit_transform(x_train_train.loc[:,cat_df.columns])
x_train_test.loc[:,cat_df.columns]=encoder.transform(x_train_test.loc[:,cat_df.columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [138]:
x_test.loc[:,cat_df.columns]=encoder.transform(x_test.loc[:,cat_df.columns])

In [139]:
scaler=StandardScaler()
x_train_train.loc[:,num_df.columns]=scaler.fit_transform(x_train_train.loc[:,num_df.columns])
x_train_test.loc[:,num_df.columns]=scaler.fit_transform(x_train_test.loc[:,num_df.columns])
x_test.loc[:,num_df.columns]=scaler.transform(x_test.loc[:,num_df.columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Feature selection

In [140]:
selector=SelectKBest(score_func=f_regression,k='all')
selector.fit(x_train_train,y_train_train)

SelectKBest(k='all', score_func=<function f_regression at 0x7f99a8b0fb90>)

In [141]:
scores=selector.scores_
index=np.arange(0,len(scores))
score_list=list(zip(index,scores))
score_list.sort(key=lambda x:x[1],reverse=True)
score_list=score_list[:20]

In [142]:
selected_columns=list(list(zip(*score_list))[0])
x_train_train=x_train_train.iloc[:,selected_columns]
x_train_test=x_train_test.iloc[:,selected_columns]

In [193]:
x_test=x_test.loc[:,x_train.columns]

Linear Regression

In [144]:
from sklearn.linear_model import Ridge
ridge_regressor=Ridge()
ridge_regressor.fit(x_train_train,y_train_train)
ridge_regressor.score(x_train_test,y_train_test)

0.8308017287264006

In [145]:
from sklearn.svm import SVR
svr_regressor=SVR(kernel='linear',epsilon=5,C=1000)
svr_regressor.fit(x_train_train,y_train_train)
svr_regressor.score(x_train_test,y_train_test)

0.8333187351124104

In [180]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor=RandomForestRegressor(n_estimators=150,criterion='mse',max_depth=10)
rf_regressor.fit(x_train_train,y_train_train)
rf_regressor.score(x_train_test,y_train_test)

0.881667435343563

In [175]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = np.arange(100,500,50)
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 10)]
# Minimum number of samples required to split a node
min_samples_split = [5,7,10,13]
# Minimum number of samples required at each leaf node
min_samples_leaf = [5,7,10]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, scoring='neg_mean_squared_log_error',param_distributions = random_grid, n_iter = 200, cv = 2, verbose=2, random_state=42, n_jobs = 6)
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    4.9s
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:   22.9s
[Parallel(n_jobs=6)]: Done 353 tasks      | elapsed:   53.1s
[Parallel(n_jobs=6)]: Done 400 out of 400 | elapsed:   59.7s finished


RandomizedSearchCV(cv=2, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [176]:
rf_random.best_params_

{'n_estimators': 100,
 'min_samples_split': 7,
 'min_samples_leaf': 5,
 'max_features': 'auto',
 'max_depth': 43}

In [190]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor=RandomForestRegressor(n_estimators=200,criterion='mse',max_depth=15)
rf_regressor.fit(x_train_train,y_train_train)
rf_regressor.score(x_train_test,y_train_test)

0.8855247314455157

In [178]:
rf_regressor.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=43, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=7, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [191]:
from sklearn.metrics import mean_squared_error
y_pred=rf_regressor.predict(x_train_test)
mean_squared_error(np.log(y_pred),np.log(y_train_test),squared=False)

0.16527090018901316

In [194]:
rf_regressor.predict(x_test)

array([126542.3062062 , 153812.11248994, 182234.99367299, ...,
       154765.22430869, 115781.17985427, 225448.58772727])

In [196]:
x_test['SalePrice']=rf_regressor.predict(x_test)

In [207]:
x_test['Id']=x_test.index+1461

In [209]:
x_test[['Id','SalePrice']].to_csv('result.csv')

In [208]:
x_test

Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,ExterQual,TotalBsmtSF,KitchenQual,1stFlrSF,BsmtQual,FullBath,...,YearBuilt,YearRemodAdd,Fireplaces,HeatingQC,GarageYrBlt,Foundation,BsmtFinSF1,WoodDeckSF,SalePrice,Id
0,-0.701882,-1.084947,-0.917665,1.207184,3.0,-0.367329,3.0,-0.622535,3.0,-0.916735,...,-0.406909,-1.170184,-0.922263,4.0,-0.749447,1.0,0.076364,0.496225,126542.306206,1461
1,-0.007138,-0.263237,-0.917665,-0.660366,3.0,0.671481,2.0,0.501953,3.0,-0.916735,...,-0.513280,-1.318061,-0.922263,4.0,-0.871249,1.0,1.101393,2.884599,153812.112490,1462
2,-0.701882,0.306078,0.376819,0.099164,3.0,-0.260427,3.0,-0.539432,2.0,0.856025,...,0.869553,0.653629,0.634389,2.0,0.712183,2.0,0.804022,1.175921,182234.993673,1463
3,-0.007138,0.258635,0.376819,0.045550,3.0,-0.265075,2.0,-0.544626,3.0,0.856025,...,0.905010,0.653629,0.634389,0.0,0.752784,2.0,0.378241,2.573072,193150.583842,1464
4,1.382350,-0.356225,0.376819,0.206392,2.0,0.557607,2.0,0.374702,2.0,0.856025,...,0.692267,0.357875,-0.922263,0.0,0.509179,2.0,-0.385462,-0.825404,211620.967361,1465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,-1.396626,-0.712995,-2.212148,-2.054326,3.0,-1.148179,3.0,-1.531474,3.0,-0.916735,...,-0.087793,-0.726554,-0.922263,2.0,1.036990,1.0,-0.977951,-0.825404,91583.409933,2915
1455,-1.396626,-0.712995,-0.917665,-0.776529,3.0,-1.148179,3.0,-1.531474,3.0,-0.916735,...,-0.087793,-0.726554,-0.922263,4.0,-0.384039,1.0,-0.410243,-0.825404,102572.073202,2916
1456,-0.701882,-0.462497,0.376819,0.519140,3.0,0.427465,3.0,0.229272,3.0,-0.916735,...,-0.442366,0.555044,0.634389,0.0,-0.790048,1.0,1.779489,3.649256,154765.224309,2917
1457,-0.701882,-0.944516,-2.212148,-2.054326,3.0,-0.297610,3.0,-0.430359,2.0,-0.916735,...,0.692267,0.357875,-0.922263,4.0,1.036990,2.0,-0.218754,-0.070187,115781.179854,2918


In [206]:
x_test.shape

(1459, 22)