In [26]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MultipleLocator
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import Common_Functions as cmn

# Combine All Regressions into one Graph

In [27]:
# Global seaborn look for every figure drawn later in the notebook:
# the "poster" context scales fonts/lines up, the "ticks" style draws tick marks.
sns.set_context(context="poster")
sns.set_style(style="ticks")

In [28]:
# Base frame: the random-forest export, whose first CSV column becomes the index
# and whose prediction column (stored under the label '0') gets a readable name.
prices = (
    pd.read_csv("Exported_Data/RandomForest_for_weights.csv", index_col=0)
    .rename(columns={'0': 'Random Forest Prediction'})
)

# Read the remaining model exports (default integer index).
gbr = pd.read_csv("Exported_Data/GBR_for_weights.csv")
ols = pd.read_csv("Exported_Data/OLS_for_weights.csv")
xgb = pd.read_csv("Exported_Data/XGB_for_weights.csv")

# Attach the last column of each export as that model's prediction column.
# Assignment aligns on the index, so rows are matched by index label.
for column_label, model_frame in (
    ("Gradient Boosted Regressor Prediction", gbr),
    ("OLS Prediction", ols),
    ("XGBoost Regressor Prediction", xgb),
):
    prices[column_label] = model_frame.iloc[:, -1:]

prices.head(10)

Unnamed: 0,Random Forest Prediction,Gradient Boosted Regressor Prediction,OLS Prediction,XGBoost Regressor Prediction
0,125856.5575,125570.315973,119200.767332,125977.64
1,151943.335,154093.200648,151923.475793,151896.97
2,182550.9975,183450.048311,172606.277003,184050.48
3,184554.2975,185738.899302,199960.528299,186258.31
4,192816.9925,183531.554327,194639.025564,187266.45
5,185526.7425,183799.217826,181726.200135,177178.4
6,177600.815,180363.432167,198068.6368,170283.56
7,175902.3,174935.516191,169915.927191,169917.28
8,178982.62,177594.695931,206903.596145,175673.75
9,123154.55,121084.494759,116041.966118,114157.37


# Weightings based on Perceived Accuracy

In [29]:
# One entry per model: prediction column -> blending weight.
# Each weight is written as (score * multiplier); np.average normalises by the
# sum of the weights, so they do not need to add up to 1.
model_weights = {
    "Random Forest Prediction": .93 * 2,
    "Gradient Boosted Regressor Prediction": .94 * 3,
    "OLS Prediction": .9 * 1,
    "XGBoost Regressor Prediction": .95 * 4,
}
columns = list(model_weights.keys())
weight = list(model_weights.values())

# Row-wise weighted mean across the four model predictions.
prices["Weighted Average"] = np.average(prices[columns], weights=weight, axis=1)

prices.head(10)

Unnamed: 0,Random Forest Prediction,Gradient Boosted Regressor Prediction,OLS Prediction,XGBoost Regressor Prediction,Weighted Average
0,125856.5575,125570.315973,119200.767332,125977.64,125180.939296
1,151943.335,154093.200648,151923.475793,151896.97,152568.981145
2,182550.9975,183450.048311,172606.277003,184050.48,182474.569818
3,184554.2975,185738.899302,199960.528299,186258.31,187078.970453
4,192816.9925,183531.554327,194639.025564,187266.45,187951.622842
5,185526.7425,183799.217826,181726.200135,177178.4,181260.664759
6,177600.815,180363.432167,198068.6368,170283.56,177430.884406
7,175902.3,174935.516191,169915.927191,169917.28,172612.626027
8,178982.62,177594.695931,206903.596145,175673.75,179903.859516
9,123154.55,121084.494759,116041.966118,114157.37,118204.85221


In [30]:
#prices["Weighted Average"] = (prices["Random Forest Prediction"]* 0.15)+ (prices["Gradient Boosted Regressor Prediction"]*0.2)+ (prices["OLS Prediction"]*0.05)+ (prices["XGBoost Regressor Prediction"]*0.6)

#prices.head(10)

In [35]:
# Submission frame for the blended prediction: Ids 1461 through 2919
# (the range's upper bound is exclusive), paired row-by-row via the default
# 0-based index with the weighted average rounded to two decimals.
weighted = pd.DataFrame({"Id": range(1461, 2920)})
weighted["SalePrice"] = prices["Weighted Average"].round(2)
weighted = weighted.set_index("Id")

# Persist with Id as the index column of the CSV.
weighted.to_csv(f"{cmn.export_data_path}Weighted Submission.csv")

In [36]:
weighted

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,125180.94
1462,152568.98
1463,182474.57
1464,187078.97
1465,187951.62
...,...
2915,78651.25
2916,81884.55
2917,171834.67
2918,107881.69


In [37]:
# Submission frame holding only the XGBoost predictions (unrounded), keyed by
# Ids 1461 through 2919; values are matched to Ids via the default 0-based index.
xgb_sub = (
    pd.DataFrame({"Id": range(1461, 2920)})
    .assign(SalePrice=prices["XGBoost Regressor Prediction"])
    .set_index("Id")
)

# Persist with Id as the index column of the CSV.
xgb_sub.to_csv(f"{cmn.export_data_path}xgb_sub.csv")

In [38]:
# Reload the XGBoost-only submission from the same configured export directory
# it is written to (cmn.export_data_path) instead of a separately hard-coded
# "Exported_Data/" path, so the read cannot drift from the write location.
df_top = pd.read_csv(f"{cmn.export_data_path}xgb_sub.csv", index_col="Id")

# Give both price columns descriptive labels before showing them together.
df_top = df_top.rename(columns={"SalePrice":"Top 0.3%"})
weighted = weighted.rename(columns={"SalePrice":"Our Weighted Result"})
display(df_top.head(10), weighted.head(10))

Unnamed: 0_level_0,Top 0.3%
Id,Unnamed: 1_level_1
1461,125977.64
1462,151896.97
1463,184050.48
1464,186258.31
1465,187266.45
1466,177178.4
1467,170283.56
1468,169917.28
1469,175673.75
1470,114157.37


Unnamed: 0_level_0,Our Weighted Result
Id,Unnamed: 1_level_1
1461,125180.94
1462,152568.98
1463,182474.57
1464,187078.97
1465,187951.62
1466,181260.66
1467,177430.88
1468,172612.63
1469,179903.86
1470,118204.85


In [39]:
# Place the reference predictions beside ours; both frames are indexed by Id,
# so the values line up by Id on assignment.
reference_prices = df_top["Top 0.3%"]
weighted["Top 0.3% Result"] = reference_prices
weighted.head(10)

Unnamed: 0_level_0,Our Weighted Result,Top 0.3% Result
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1461,125180.94,125977.64
1462,152568.98,151896.97
1463,182474.57,184050.48
1464,187078.97,186258.31
1465,187951.62,187266.45
1466,181260.66,177178.4
1467,177430.88,170283.56
1468,172612.63,169917.28
1469,179903.86,175673.75
1470,118204.85,114157.37


In [47]:
# Relative gap of our blended price versus the reference price, in percent:
# (ours / reference - 1) * 100, rounded to two decimals.
our_prices = weighted["Our Weighted Result"]
reference_prices = weighted["Top 0.3% Result"]
weighted["% Difference"] = our_prices.div(reference_prices).sub(1).mul(100).round(2)
weighted.head(10)

Unnamed: 0_level_0,Our Weighted Result,Top 0.3% Result,% Difference
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1461,125180.94,125977.64,-0.63
1462,152568.98,151896.97,0.44
1463,182474.57,184050.48,-0.86
1464,187078.97,186258.31,0.44
1465,187951.62,187266.45,0.37
1466,181260.66,177178.4,2.3
1467,177430.88,170283.56,4.2
1468,172612.63,169917.28,1.59
1469,179903.86,175673.75,2.41
1470,118204.85,114157.37,3.55
