<a href="https://colab.research.google.com/github/zhiyuan-95/House-Prices/blob/main/stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# Mount Google Drive into the Colab runtime; the cells below read and write
# CSV files under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [122]:
# Read the train/test tables (files named *_encoded.csv) from the mounted Drive folder.
data_dir = '/content/drive/MyDrive/Colab/kaggle/HousePrices'
train = pd.read_csv(data_dir + '/train_encoded.csv')
test = pd.read_csv(data_dir + '/test_encoded.csv')
# Last expression of the cell: display (rows, columns) of the training table.
train.shape

(1452, 315)

In [123]:
class optimizer:
  """Run cross-validated trials of one estimator class and log them to a CSV file.

  Each call to `train` appends one row (the estimator's parameters plus the
  mean cross-validation score) to a per-model record file, so that the trials
  can be ranked later with `best_param` / `top10`.

  Parameters
  ----------
  model : estimator class (not an instance)
      Must expose `_get_param_names()` and `__name__`; its instances must expose
      `get_params()`. Presumably a scikit-learn estimator class.
  feature : table with a `.shape`
      Predictor columns. Its column count decides the record file name (see
      `_FULL_FEATURE_COUNT`).
  target : DataFrame with a 'SalePrice' column, optional
      Only required by `train`.
  """

  # Column count that is treated as "the full feature table": such runs are
  # recorded under the bare model name, every other width gets a `_<n_columns>`
  # suffix so feature subsets keep separate record files.
  _FULL_FEATURE_COUNT = 314
  # Folder (on the mounted Drive) that holds one record CSV per model/subset.
  _RECORD_DIR = '/content/drive/MyDrive/Colab/kaggle/HousePrices'

  def __init__(self, model,feature, target=None):
    self._model = model
    self._model_params = self._model._get_param_names()
    if feature.shape[1] == self._FULL_FEATURE_COUNT:
      self._name = self._model.__name__
    else:
      self._name = f'{self._model.__name__}_{feature.shape[1]}'
    self._record_path = f'{self._RECORD_DIR}/{self._name}.csv'
    self._X = feature
    self._y = target

  def update_record(self,params,score):
    """Append one trial (`params` dict plus its `score`) to the record file.

    If the record file is missing or completely empty, a fresh table with the
    model's parameter names plus 'score' is started instead of failing, so
    calling `reset()` by hand beforehand is no longer required.
    """
    try:
      records = pd.read_csv(self._record_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
      records = pd.DataFrame(columns = self._model_params+['score'])
    # Assigning a dict to a new row label aligns the dict keys with the columns.
    records.loc[len(records)] = params
    records.loc[len(records)-1,'score'] = score
    records.to_csv(self._record_path, index = False)

  def reset(self):
    """Overwrite the record file with an empty table (header row only)."""
    df = pd.DataFrame(columns = self._model_params+['score'])
    df.to_csv(self._record_path, index = False)

  def best_param(self):
    """Return the recorded row with the highest score as a Series.

    'score' is the last entry of the row (callers slice it off with `[:-1]`).

    Raises
    ------
    IndexError
        If the record file contains no trials yet.
    """
    records = pd.read_csv(self._record_path)
    if records.empty:
      raise IndexError(f'no trials recorded in {self._record_path}; call train() first')
    return records.sort_values(by = 'score', ascending=False).iloc[0,:]

  def top10(self):
    """Return up to ten recorded rows, best score first."""
    records = pd.read_csv(self._record_path)
    return records.sort_values(by = 'score', ascending=False).iloc[:10,:]

  def train(self,**params):
    """Cross-validate `model(**params)` and record the result.

    Uses 5-fold shuffled cross-validation (fixed random_state=42) scored with
    'neg_root_mean_squared_error', so scores are negative and closer to zero is
    better. The mean score, rounded to a whole number, is recorded together
    with the estimator's full parameter set and returned.

    Raises
    ------
    ValueError
        If the optimizer was constructed without a target.
    """
    if self._y is None:
      raise ValueError('optimizer.train() needs a target; pass target= when constructing the optimizer')
    Model = self._model(**params)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cross_val_results = cross_val_score(Model, self._X, list(self._y['SalePrice']), cv=kf, scoring = 'neg_root_mean_squared_error')
    score=round(cross_val_results.mean(),0)
    # get_params() (not the raw kwargs) so defaults are recorded as well.
    self.update_record(Model.get_params(),score)
    return score

In [124]:
# Separate the label column from the predictors.
target = train[['SalePrice']]
feature = train.drop(columns = ['SalePrice'])

# Column-name prefixes. Every column of `feature` whose name starts with one of
# these prefixes is pulled into the corresponding subset below (presumably the
# encoded columns are named <original column><suffix> -- TODO confirm).
# NOTE(review): 'HouseStype' (both lists) and 'YearBuild' (second list) look like
# typos of 'HouseStyle' / 'YearBuilt'. If so they match no column. They are left
# untouched here because the subset sizes are part of the optimizer record file
# names, so correcting them would orphan the existing tuning records.
feature_set1 = ['MSSubClass','LotArea','OverallQual','OverallCond','YearBuilt','TotalBsmtSF','BedroomAbvGr','KitchenAbvGr','KitchenQual','TotRmsAbvGrd','BldgType','HouseStype','GarageArea','PoolArea','1stFlrSF','2ndFlrSF','GrLivArea','FullBath','total_area','YearRemodAdd','MoSold','YrSold','SaleType','SaleCondition']
feature_set2 = ['MSZoning','Street','Alley','Utilities','LotConfig','Neighborhood','Condition1','Condition2','YearBuild','YearRemodAdd','LandSlope','BldgType','HouseStype','OverallQual','OverallCond','Foundation','MoSold','YrSold','SaleType','SaleCondition']

# Expand each prefix list into concrete column names: prefix order first,
# table column order within a prefix.
featureSet1 = [col for prefix in feature_set1 for col in feature if col.startswith(prefix)]
featureSet2 = [col for prefix in feature_set2 for col in feature if col.startswith(prefix)]

# Third subset: every column that neither of the first two subsets picked up.
already_selected = set(featureSet1) | set(featureSet2)
featureSet3 = [col for col in feature if col not in already_selected]

### level 1 prediction

*   At the level 1 prediction stage, the training set is partitioned into three segments. Each model is trained on two of these segments and used to predict the remaining one; this is repeated three times so that every segment is predicted exactly once by a model that was not trained on it.
*   This methodology generates predictions for the entire dataset, which are then added to it as new features, one per model. The final model can therefore combine the advantages of the different models.


In [101]:
# add level 1 prediction to test set
#
# For every (model class, feature subset) pair: load the best recorded
# hyper-parameters, fit on the full training subset and predict the test rows.
# Each prediction vector becomes one new test column named after the optimizer
# record (e.g. 'SVR_46').
# NOTE: `feature_sets` must be built before the training-side stacking loop adds
# its prediction columns to `feature`, otherwise the first entry would no longer
# be the original column list.
models = [RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,SVR]
feature_sets = [list(feature.columns),featureSet1,featureSet2,featureSet3]
y_full = target['SalePrice']
level1_columns = {}
for m in models:
  for f in feature_sets:
    opt = optimizer(m,feature[f],target)
    print(opt._name)
    # Parameters that were None when recorded come back from the CSV as NaN;
    # skip them so the estimator falls back to its default. pd.notna is used
    # here instead of the notebook's remove() helper because that helper is only
    # defined in a later cell, which made this cell fail on a fresh Run All.
    best_params = {key:value for key,value in opt.best_param()[:-1].items() if pd.notna(value)}
    md = m(**best_params)
    md.fit(feature[f], y_full)
    level1_columns[opt._name] = md.predict(test[f])
# Build the frame once and align it explicitly with the test rows.
new_features = pd.DataFrame(level1_columns, index=test.index)
test = pd.concat([test,new_features], axis=1)

RandomForestRegressor
RandomForestRegressor_46
RandomForestRegressor_97
RandomForestRegressor_198
GradientBoostingRegressor
GradientBoostingRegressor_46
GradientBoostingRegressor_97
GradientBoostingRegressor_198
AdaBoostRegressor
AdaBoostRegressor_46
AdaBoostRegressor_97
AdaBoostRegressor_198
SVR
SVR_46
SVR_97
SVR_198


In [128]:
def split(f,t):
  """Cut features `f` and targets `t` into three parts of roughly equal size.

  The first cut sets aside 33.5% of the rows as part 1; the remainder is then
  halved into parts 2 and 3. Both cuts use random_state=17, so the partition
  is the same on every call with the same input.

  Returns
  -------
  folds : dict
      {1: (X_1, y_1), 2: (X_2, y_2), 3: (X_3, y_3)}
  complements : dict
      For each part number, the list of the two other part numbers, i.e. the
      parts to train on when that part is being predicted.
  """
  rest_X, part1_X, rest_y, part1_y = train_test_split(f, t, test_size=0.335, random_state=17)
  part2_X, part3_X, part2_y, part3_y = train_test_split(rest_X, rest_y, test_size=0.5, random_state=17)
  folds = {
    1: (part1_X, part1_y),
    2: (part2_X, part2_y),
    3: (part3_X, part3_y),
  }
  part_ids = (1, 2, 3)
  complements = {k: [other for other in part_ids if other != k] for k in part_ids}
  return folds, complements

In [129]:
import math
def remove(n):
  """Tell whether a recorded parameter value is usable (i.e. should be KEPT).

  Parameters read back from the record CSV are NaN wherever the original value
  was empty, so callers filter with `if remove(value)` and drop those entries.
  Despite its name, the function returns True for values to keep and False for
  values to drop.

  Returns
  -------
  bool
      True for strings, for numbers that are not NaN, and for any other
      non-numeric object; False for NaN and for None.
  """
  if isinstance(n, str):
    return True
  if n is None:
    # None is written to the CSV as an empty field and read back as NaN, so it
    # is treated the same way as NaN.
    return False
  try:
    return not math.isnan(n)
  except TypeError:
    # Not a number at all (e.g. a list), hence certainly not NaN.
    return True
def partial_prediction(model,fe,ta):
  """Predict every part of the data with a model fitted on the other two parts.

  The data is cut into three parts with `split`. For each part, a fresh
  `model` instance (configured with the best recorded hyper-parameters for this
  model / feature width) is fitted on the two remaining parts and used to
  predict the held-out part.

  Parameters
  ----------
  model : estimator class
  fe : DataFrame of features
  ta : DataFrame with a 'SalePrice' column

  Returns
  -------
  list of three DataFrames, one per held-out part (parts 1, 2, 3 in order),
  each with a single 'SalePrice' column of predictions and the index of the
  held-out part's target rows.
  """
  opt = optimizer(model,fe)
  folds, complements = split(fe,ta)
  # The best parameters do not depend on the fold: read the record file once
  # instead of once per iteration. NaN entries are dropped so the estimator
  # uses its own default for them.
  best_params = {key:value for key,value in opt.best_param()[:-1].items() if remove(value)}
  out = []
  for held_out in (1, 2, 3):
    first, second = complements[held_out]
    x_train = pd.concat([folds[first][0], folds[second][0]], axis = 0)
    y_train = pd.concat([folds[first][1], folds[second][1]], axis=0)['SalePrice']
    md = model(**best_params)
    md.fit(x_train,y_train)
    held_X, held_y = folds[held_out]
    out.append(pd.DataFrame({'SalePrice':md.predict(held_X)}, index=held_y.index))
  return out

In [131]:
# Append the level-1 (held-out) predictions to the training features: one new
# column per (model class, feature subset) pair, named like the matching
# optimizer record.
for estimator_cls in models:
  for subset_cols in feature_sets:
    subset = feature[subset_cols]
    column_name = optimizer(estimator_cls, subset, target)._name

    # Stitch the three held-out prediction frames together and restore the
    # original row order before assigning the values positionally.
    held_out_parts = partial_prediction(estimator_cls, subset, target)
    stitched = pd.concat(held_out_parts, axis=0).sort_index()
    feature[column_name] = list(stitched['SalePrice'])

In [132]:
# Display (rows, columns) after stacking: the loop above adds one column per
# (model, feature subset) pair, i.e. 4 x 4 = 16 new columns.
feature.shape

(1452, 330)

In [None]:
import random
# Random search for the final (level-2) GradientBoostingRegressor on the stacked
# feature table. Every trial is cross-validated and appended to the optimizer's
# record file by opt_final.train().
# Deliberately unseeded: trials accumulate in the record file across runs, so a
# fixed seed would only re-evaluate the same configurations.
opt_final = optimizer(GradientBoostingRegressor,feature,target)
# Header now matches the values printed per trial:
# trial number, n_estimators, learning_rate, max_depth, score.
print('n','es','lr','md','score')
# A named loop variable is used because `_` is IPython's "last output" variable.
for trial in range(1, 201):
  es = random.randint(300,1200)
  lr = round(random.uniform(0.01,0.1),4)
  md = random.randint(2,4)
  print(trial,es, lr,md, end=' ')
  score = opt_final.train(n_estimators=es, learning_rate=lr,max_depth = md, max_features='sqrt')
  print(score)

n es  md mss score
1 430 0.0376 3 -21160.0
2 632 0.0685 3 -21647.0
3 540 0.0232 3 -21243.0
4 1197 0.0563 3 -21173.0
5 959 0.0223 2 -21179.0
6 342 0.073 4 -21888.0
7 963 0.0182 4 -21268.0
8 417 0.0279 4 -21425.0
9 775 0.0835 4 -21568.0
10 464 0.0883 4 -21835.0
11 722 0.0514 2 -21295.0
12 1187 0.0638 4 -21608.0
13 921 0.0578 4 -21545.0
14 830 0.0887 3 -21575.0
15 818 0.0356 3 -21165.0
16 833 0.0125 4 -21517.0
17 1066 0.0626 3 -21304.0
18 797 0.0227 3 -21047.0
19 918 0.0398 3 -21164.0
20 1031 0.092 3 -21520.0
21 1057 0.0486 2 -20913.0
22 1070 0.0422 3 -21042.0
23 458 0.0861 3 -21291.0
24 1200 0.0469 4 -21445.0
25 775 0.0716 4 -21794.0
26 1186 0.0979 4 -22069.0
27 1168 0.0355 4 -21278.0
28 607 0.0994 3 -21374.0
29 826 0.083 4 -21571.0
30 1191 0.0443 4 -21320.0
31 1157 0.0942 2 -22118.0
32 887 0.0356 4 -21351.0
33 617 0.0446 4 -21251.0
34 1103 0.0655 4 -21526.0
35 914 0.0568 3 -21596.0
36 736 0.0357 3 -21338.0
37 825 0.0498 3 -21174.0
38 1040 0.0712 2 -21800.0
39 359 0.061 2 -21953.0
40 936

In [133]:
# Persist the stacked train features, the stacked test table and the target to
# the Drive folder (row index omitted).
# NOTE(review): 'target_tran.csv' may be a typo of 'target_train.csv'; the name is
# kept as is because other code may read the file under this name.
output_dir = '/content/drive/MyDrive/Colab/kaggle/HousePrices'
for frame, file_name in (
  (feature, 'feature_train_stacked.csv'),
  (test, 'test_stacked.csv'),
  (target, 'target_tran.csv'),
):
  frame.to_csv(output_dir + '/' + file_name, index=False)

In [134]:
# Fit the final model with the best recorded hyper-parameters (NaN entries are
# dropped so the estimator keeps its default for them) and predict the test set.
best_row = opt_final.best_param()[:-1]
best_params = {name: val for name, val in best_row.items() if remove(val)}
M = GradientBoostingRegressor(**best_params)
M.fit(feature, target['SalePrice'])
prediction = M.predict(test)

# Submission ids are generated as consecutive integers starting at 1461.
# NOTE(review): assumes the test rows are in the competition's original order
# -- confirm.
first_id = 1461
result = pd.DataFrame({
  'Id': list(range(first_id, first_id + len(prediction))),
  'SalePrice': prediction,
})
result.to_csv('submission.csv', index=False)

In [135]:
from google.colab import files
# Upload the kaggle.json file
files.upload()
!mkdir /root/.kaggle/
!cp kaggle.json /root/.kaggle/

Saving kaggle.json to kaggle.json


In [136]:
# Submit submission.csv to the House Prices competition with the Kaggle CLI;
# -m sets the submission description (currently the placeholder "Message").
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "Message"

100% 33.7k/33.7k [00:01<00:00, 18.4kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques