<a href="https://colab.research.google.com/github/zhiyuan-95/House-Prices/blob/main/stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Mount Google Drive at /content/drive/ so the CSV files read and written
# below are reachable. This only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# The encoded train and test tables sit in the same Drive folder and differ
# only by their file-name prefix, so both are read by one expression.
train, test = (
  pd.read_csv(f'/content/drive/MyDrive/Colab/kaggle/HousePrices/{split}_encoded.csv')
  for split in ('train', 'test')
)

In [None]:
# Sanity check: (rows, columns) of the encoded training table, which still
# contains the SalePrice column at this point.
train.shape

(1444, 315)

In [None]:
class optimizer:
  """Persist and query hyper-parameter search records for one estimator class.

  The records are kept in a CSV file named after the estimator class inside the
  project's Drive folder. `reset` creates that file with one column per
  constructor parameter followed by a `score` column.
  """

  def __init__(self, model):
    """Instantiate `model` with its defaults to learn its name and parameter names.

    Args:
      model: estimator class (not an instance) exposing `_get_param_names()`.
    """
    instance = model()
    self._model = instance
    self._model_name = type(instance).__name__
    self._model_params = instance._get_param_names()

  def _records_path(self):
    """Return the path of the CSV file holding this estimator's records."""
    return f'/content/drive/MyDrive/Colab/kaggle/HousePrices/{self._model_name}.csv'

  def _ranked(self):
    """Load all records, ordered from the highest to the lowest score."""
    history = pd.read_csv(self._records_path())
    return history.sort_values(by = 'score', ascending=False)

  def update_record(self,params,score):
    """Append one trial (`params` and its `score`) and rewrite the CSV file."""
    path = self._records_path()
    history = pd.read_csv(path)
    # read_csv labels the rows 0..n-1, so label n always creates a new row.
    row = len(history)
    history.loc[row] = params
    history.loc[row,'score'] = score
    history.to_csv(path, index = False)

  def reset(self):
    """Overwrite the CSV file with an empty table that only has the header."""
    header = self._model_params+['score']
    pd.DataFrame(columns = header).to_csv(self._records_path(), index = False)

  def best_param(self):
    """Return the highest-scoring record as a Series.

    When the file was created by `reset`, `score` is the last entry.
    """
    return self._ranked().iloc[0,:]

  def top10(self):
    """Return the ten highest-scoring records (fewer if fewer are stored)."""
    return self._ranked().iloc[:10,:]

In [None]:
# Separate the predictors from the regression target. The target is kept as a
# one-column DataFrame (double brackets) because later cells split and
# concatenate it as a frame and then select its 'SalePrice' column.
feature = train.drop(columns = 'SalePrice')
target = train[['SalePrice']]

### level 1 prediction

*   At the level 1 prediction stage, the dataset is partitioned into three segments. Each model is trained on two of these segments and used to predict the remaining one; this is repeated three times, so that every segment is predicted exactly once by a model that did not see it during training.
*   This methodology generates predictions for the entire dataset, which are then added to it as new features (one per model). The level 2 model can therefore combine the advantages of the different models.


In [None]:
# Three-way partition of the training rows, done as two consecutive splits:
#   1. hold out test_size=0.335 (about one third) of the rows as segment 1;
#   2. cut the remaining rows in half (test_size=0.5) into segments 2 and 3.
# The fixed random_state values make the partition identical on every run.
X_23, X_1, y_23, y_1 = train_test_split(feature,target, test_size=0.335, random_state=13)
X_2,X_3,y_2,y_3 = train_test_split(X_23,y_23, test_size=0.5, random_state=17)

In [None]:
# Segment number -> (features, target) of that segment.
tool1 = {
  1: (X_1, y_1),
  2: (X_2, y_2),
  3: (X_3, y_3),
}
# Held-out segment number -> the two segment numbers used for training.
tool2 = {held_out: [seg for seg in tool1 if seg != held_out] for held_out in tool1}

In [None]:
import math
def remove(n):
  """Tell whether the value `n` should be kept.

  Despite its name this is a keep-predicate: it returns True for every string
  and for every other value that `math.isnan` does not report as NaN, and
  False for NaN.
  """
  return type(n) is str or not math.isnan(n)
def partial_prediction(model,n):
  """Predict segment `n` with `model` trained on the two other segments.

  Args:
    model: estimator class (not an instance). Its tuned hyper-parameters are
      looked up through `optimizer(model)`.
    n: key of the held-out segment in `tool1` (1, 2 or 3).

  Returns:
    A one-column DataFrame ('SalePrice') with the predictions for segment `n`,
    indexed by that segment's own row labels so the folds can be concatenated
    and re-ordered without attaching predictions to the wrong rows.
  """
  first, second = tool2[n]
  held_out = tool1[n][0]
  x_train = pd.concat([tool1[first][0], tool1[second][0]], axis = 0)
  y_train = pd.concat([tool1[first][1], tool1[second][1]], axis = 0)['SalePrice']
  # Derive the records from the model class itself instead of relying on a
  # module-level `opt`, which could be stale or belong to another model.
  tuned = optimizer(model).best_param()[:-1]  # [:-1] leaves out the trailing score entry
  # NaN entries are skipped (presumably parameters that were never recorded),
  # so the estimator's own defaults apply to them.
  best_params = {key:value for key,value in tuned.items() if remove(value)}
  md = model(**best_params)
  md.fit(x_train,y_train)
  # predict() returns a bare array: without an explicit index the frame would
  # be labelled 0..len-1 and a later sort_index() would interleave the folds.
  return pd.DataFrame({'SalePrice':md.predict(held_out)}, index = held_out.index)

In [None]:
# Level-1 predictions for the test set: every base model is refitted on the
# whole training set with its best recorded parameters, and its predictions
# for the test rows become one new column named after the model class.
models = [GradientBoostingRegressor,AdaBoostRegressor,RandomForestRegressor]
level1_columns = {}
for m in models:
  print(m.__name__)
  opt = optimizer(m)
  tuned = opt.best_param()[:-1]  # [:-1] leaves out the trailing score entry
  best_params = {key:tuned[key] for key in tuned.index if remove(tuned[key])}
  md = m(**best_params)
  md.fit(feature, target['SalePrice'])
  level1_columns[m.__name__] = md.predict(test)
new_features = pd.DataFrame(level1_columns)

GradientBoostingRegressor
AdaBoostRegressor
RandomForestRegressor


In [None]:
# Append the level-1 predictions to the test set as extra columns. Columns left
# over from an earlier run of this cell are dropped first, so running the cell
# again does not duplicate them (errors='ignore' makes the first run a no-op).
test = pd.concat([test.drop(columns=new_features.columns, errors='ignore'), new_features], axis=1)

In [None]:
# Add the out-of-fold level-1 predictions to the training features, one new
# column per base model.
for m in models:
  print(m.__name__)
  opt = optimizer(m)  # kept as a global because partial_prediction may read it
  # Label each fold's predictions with the row labels of the segment they were
  # made for. Concatenating frames that all start at label 0 and sorting them
  # would interleave the folds and attach predictions to the wrong rows.
  new_feature = pd.concat(
    [partial_prediction(m, fold).set_axis(tool1[fold][0].index, axis=0) for fold in (1, 2, 3)],
    axis=0,
  )
  # Bring the predictions into exactly the row order of `feature`.
  new_feature = new_feature.reindex(feature.index)
  assert new_feature['SalePrice'].notna().all(), 'some rows have no out-of-fold prediction'
  feature[m.__name__] = new_feature['SalePrice'].to_numpy()

GradientBoostingRegressor
AdaBoostRegressor
RandomForestRegressor


In [None]:
# Level-2 model: a gradient boosting regressor fitted on the original features
# plus the level-1 prediction columns. It reuses the best parameters recorded
# for GradientBoostingRegressor.
opt = optimizer(GradientBoostingRegressor)
tuned = opt.best_param()[:-1]  # [:-1] leaves out the trailing score entry
best_params = {key:tuned[key] for key in tuned.index if remove(tuned[key])}
M = GradientBoostingRegressor(**best_params)
M.fit(feature, target['SalePrice'])

In [None]:
# Persist the stacked tables next to the input data.
# NOTE(review): 'target_tran.csv' looks like a misspelling of 'target_train.csv';
# the name is kept as is because other notebooks may read this exact path.
for frame, filename in (
  (feature, 'feature_train_stacked.csv'),
  (test, 'test_stacked.csv'),
  (target, 'target_tran.csv'),
):
  frame.to_csv(f'/content/drive/MyDrive/Colab/kaggle/HousePrices/{filename}', index=False)

In [None]:
# Score the stacked test set with the level-2 model and write the submission.
prediction = M.predict(test)
# Ids are generated consecutively instead of being read from the data.
# Presumably 1461 is the first Id of the competition test set and the test rows
# are still in their original order - confirm.
first_id = 1461
result = pd.DataFrame({
  'Id': list(range(first_id, first_id + len(prediction))),
  'SalePrice': prediction,
})
result.to_csv('submission.csv', index=False)

In [None]:
from google.colab import files
# Upload the kaggle.json file
files.upload()
!mkdir /root/.kaggle/
!cp kaggle.json /root/.kaggle/

In [None]:
# Submit submission.csv to the House Prices competition with the Kaggle CLI.
# -c selects the competition, -f the file to upload, -m the submission message.
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "Message"

100% 33.7k/33.7k [00:02<00:00, 16.8kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques