In [51]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MultipleLocator
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import Common_Functions as cmn

# Combine All Regressions into one Graph

In [52]:
# Notebook-wide seaborn configuration: select the "poster" plotting context
# and the "ticks" axes style before any figure is drawn.
sns.set_context("poster")
sns.set_style("ticks")

In [53]:
# Start from the Random Forest export: the first CSV column becomes the row
# index and the column named '0' in the file gets a readable label.
prices = (
    pd.read_csv("Exported_Data/RandomForest_for_weights.csv", index_col=0)
    .rename(columns={'0': 'Random Forest Prediction'})
)

# The other model exports are read as-is; only their last column is used below.
gbr = pd.read_csv("Exported_Data/GBR_for_weights.csv")
ols = pd.read_csv("Exported_Data/OLS_for_weights.csv")
xgb = pd.read_csv("Exported_Data/XGB_for_weights.csv")

# Attach each model's last column to `prices` under a descriptive name.
# The slice `-1:` keeps a one-column frame, which pandas aligns on the row index.
for label, frame in (
    ("Gradient Boosted Regressor Prediction", gbr),
    ("OLS Prediction", ols),
    ("XGBoost Regressor Prediction", xgb),
):
    prices[label] = frame.iloc[:, -1:]

prices.head(10)

Unnamed: 0,Random Forest Prediction,Gradient Boosted Regressor Prediction,OLS Prediction,XGBoost Regressor Prediction
0,125856.5575,127032.322156,119200.767332,122261.78
1,151943.335,154184.388463,151923.475793,149681.34
2,182550.9975,206040.814062,172606.277003,179451.86
3,184554.2975,193975.265929,199960.528299,182765.66
4,192816.9925,182797.925233,194639.025564,182455.05
5,185526.7425,187000.169753,181726.200135,176235.19
6,177600.815,195398.769165,198068.6368,163989.06
7,175902.3,175374.020482,169915.927191,167114.17
8,178982.62,173580.152563,206903.596145,172366.03
9,123154.55,114116.880539,116041.966118,115120.89


# Weightings based on Perceived Accuracy

In [54]:
# One weight per model column. Each weight is written as a product of two
# factors (presumably perceived accuracy x an importance multiplier — confirm).
# np.average divides by the sum of the weights, so they need not add up to 1.
model_weights = {
    "Random Forest Prediction": .93 * 2,
    "Gradient Boosted Regressor Prediction": .94 * 3,
    "OLS Prediction": .9 * 1,
    "XGBoost Regressor Prediction": .95 * 4,
}
columns = list(model_weights)
weight = list(model_weights.values())

# Row-wise weighted mean across the four model predictions.
prices["Weighted Average"] = np.average(prices[columns], weights=weight, axis=1)

prices.head(10)

Unnamed: 0,Random Forest Prediction,Gradient Boosted Regressor Prediction,OLS Prediction,XGBoost Regressor Prediction,Weighted Average
0,125856.5575,127032.322156,119200.767332,122261.78,124115.117274
1,151943.335,154184.388463,151923.475793,149681.34,151698.80584
2,182550.9975,206040.814062,172606.277003,179451.86,187403.269542
3,184554.2975,193975.265929,199960.528299,182765.66,188140.21607
4,192816.9925,182797.925233,194639.025564,182455.05,185781.883605
5,185526.7425,187000.169753,181726.200135,176235.19,181840.887194
6,177600.815,195398.769165,198068.6368,163989.06,179401.092331
7,175902.3,175374.020482,169915.927191,167114.17,171608.869534
8,178982.62,173580.152563,206903.596145,172366.03,177356.914068
9,123154.55,114116.880539,116041.966118,115120.89,116500.44964


In [55]:
# NOTE(review): this cell is entirely commented out and has no effect when run.
# It holds an alternative "Weighted Average" with fixed weights
# (0.15 / 0.2 / 0.05 / 0.6) — presumably kept for reference; confirm or remove.
#prices["Weighted Average"] = (prices["Random Forest Prediction"]* 0.15)+ (prices["Gradient Boosted Regressor Prediction"]*0.2)+ (prices["OLS Prediction"]*0.05)+ (prices["XGBoost Regressor Prediction"]*0.6)

#prices.head(10)

In [56]:
# Build the submission table for the weighted ensemble: one row per prediction,
# indexed by Id, with the price rounded to 2 decimals.
#
# Ids start at 1461 (same start as the previously hard-coded range(1461, 2920))
# and the range is sized from `prices`, so the Id count can never drift from the
# number of predictions.
first_id = 1461
weighted = pd.DataFrame(
    {
        "Id": range(first_id, first_id + len(prices)),
        # Positional assignment: .to_numpy() drops the index of `prices`, so the
        # values cannot be silently turned into NaN by an index-label mismatch.
        "SalePrice": prices["Weighted Average"].round(2).to_numpy(),
    }
).set_index("Id")

# Write the submission file next to the other exported data.
weighted.to_csv(f"{cmn.export_data_path}Weighted Submission.csv")

In [57]:
# Show the submission frame built above (the rendered output elides the middle rows).
weighted

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,124115.12
1462,151698.81
1463,187403.27
1464,188140.22
1465,185781.88
...,...
2915,77483.38
2916,79929.87
2917,169925.87
2918,102934.83


In [58]:
# Build a submission table from the XGBoost predictions alone: one row per
# prediction, indexed by Id, prices left unrounded.
#
# Ids start at 1461 (same start as the previously hard-coded range(1461, 2920))
# and the range is sized from `prices`, so the Id count can never drift from the
# number of predictions.
first_id = 1461
xgb_sub = pd.DataFrame(
    {
        "Id": range(first_id, first_id + len(prices)),
        # Positional assignment: .to_numpy() drops the index of `prices`, so the
        # values cannot be silently turned into NaN by an index-label mismatch.
        "SalePrice": prices["XGBoost Regressor Prediction"].to_numpy(),
    }
).set_index("Id")

# Write the submission file next to the other exported data.
xgb_sub.to_csv(f"{cmn.export_data_path}xgb_sub.csv")

In [59]:
# Load the reference submission (indexed by Id) and relabel its price column
# so it can sit next to our own result.
df_top = (
    pd.read_csv("Exported_Data/submission_regression_top0.3.csv", index_col="Id")
    .rename(columns={"SalePrice": "Top 0.3%"})
)

# Relabel our price column the same way before comparing the two frames.
weighted = weighted.rename(columns={"SalePrice": "Our Weighted Result"})

# Render the first ten rows of both frames, one after the other.
display(df_top.head(10), weighted.head(10))

Unnamed: 0_level_0,Top 0.3%
Id,Unnamed: 1_level_1
1461,122344.0
1462,161549.0
1463,184424.0
1464,195709.0
1465,187205.0
1466,172682.0
1467,173095.0
1468,165704.0
1469,187760.0
1470,126938.0


Unnamed: 0_level_0,Our Weighted Result
Id,Unnamed: 1_level_1
1461,124115.12
1462,151698.81
1463,187403.27
1464,188140.22
1465,185781.88
1466,181840.89
1467,179401.09
1468,171608.87
1469,177356.91
1470,116500.45


In [60]:
# Add the reference prices as a new column; pandas matches rows on the shared Id index.
reference_prices = df_top["Top 0.3%"]
weighted["Top 0.3% Result"] = reference_prices
weighted.head(10)

Unnamed: 0_level_0,Our Weighted Result,Top 0.3% Result
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1461,124115.12,122344.0
1462,151698.81,161549.0
1463,187403.27,184424.0
1464,188140.22,195709.0
1465,185781.88,187205.0
1466,181840.89,172682.0
1467,179401.09,173095.0
1468,171608.87,165704.0
1469,177356.91,187760.0
1470,116500.45,126938.0


In [68]:
# Relative gap between our prediction and the reference price, rounded to
# 4 decimals. The value is a fraction (0.0145 means +1.45%), even though the
# column label contains "%".
ours = weighted.loc[:, "Our Weighted Result"]
reference = weighted.loc[:, "Top 0.3% Result"]
weighted["% Difference"] = (ours / reference - 1).round(4)
weighted.head(10)

Unnamed: 0_level_0,Our Weighted Result,Top 0.3% Result,% Difference
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1461,124115.12,122344.0,0.0145
1462,151698.81,161549.0,-0.061
1463,187403.27,184424.0,0.0162
1464,188140.22,195709.0,-0.0387
1465,185781.88,187205.0,-0.0076
1466,181840.89,172682.0,0.053
1467,179401.09,173095.0,0.0364
1468,171608.87,165704.0,0.0356
1469,177356.91,187760.0,-0.0554
1470,116500.45,126938.0,-0.0822


In [63]:
# Count how often each distinct "% Difference" value occurs.
# NOTE(review): the saved output of this cell (execution count 63) lists values
# such as 1.50 and 64.67, i.e. a percent scale, while the cell that now fills
# "% Difference" (execution count 68) stores fractions rounded to 4 decimals.
# The output looks stale — re-run the notebook top to bottom to refresh it.
weighted.value_counts("% Difference")

% Difference
 1.50     5
 0.95     4
-0.76     4
 2.48     4
-1.55     4
         ..
-2.86     1
-2.85     1
-2.79     1
-2.78     1
 64.67    1
Length: 1062, dtype: int64