In [1]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MultipleLocator
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import Common_Functions as cmn

# Combine All Regressions into one Graph

In [2]:
sns.set_context("poster")
sns.set_style("ticks")

In [3]:
prices = pd.read_csv("Exported_Data/RandomForest_for_weights.csv", index_col=0)
prices = prices.rename(columns={'0':'Random Forest Prediction'})
#prices["Random Forest Prediction"] = rf.iloc[:,-1:]

gbr = pd.read_csv("Exported_Data/GBR_for_weights.csv")
prices["Gradient Boosted Regressor Prediction"] = gbr.iloc[:,-1:]

ols = pd.read_csv("Exported_Data/OLS_for_weights.csv")
prices["OLS Prediction"] = ols.iloc[:,-1:]

xgb = pd.read_csv("Exported_Data/XGB_for_weights.csv")
prices["XGBoost Regressor Prediction"] = xgb.iloc[:,-1:]

prices.head(10)

Unnamed: 0,Random Forest Prediction,Gradient Boosted Regressor Prediction,OLS Prediction,XGBoost Regressor Prediction
0,125856.5575,127032.322156,119200.767332,122261.78
1,151943.335,154184.388463,151923.475793,149681.34
2,182550.9975,206040.814062,172606.277003,179451.86
3,184554.2975,193975.265929,199960.528299,182765.66
4,192816.9925,182797.925233,194639.025564,182455.05
5,185526.7425,187000.169753,181726.200135,176235.19
6,177600.815,195398.769165,198068.6368,163989.06
7,175902.3,175374.020482,169915.927191,167114.17
8,178982.62,173580.152563,206903.596145,172366.03
9,123154.55,114116.880539,116041.966118,115120.89


# Weightings based on Perceived Accuracy

We take the weightings from the previous notebook to predict the testing data pricing

In [4]:
weight = [.93, .94, .9, .95]
columns = ["Random Forest Prediction", "Gradient Boosted Regressor Prediction", "OLS Prediction", "XGBoost Regressor Prediction"]
prices["Weighted Average"] = np.average(prices[columns], weights=weight, axis=1)

prices.head(10)

Unnamed: 0,Random Forest Prediction,Gradient Boosted Regressor Prediction,OLS Prediction,XGBoost Regressor Prediction,Weighted Average
0,125856.5575,127032.322156,119200.767332,122261.78,123625.366371
1,151943.335,154184.388463,151923.475793,149681.34,151927.158043
2,182550.9975,206040.814062,172606.277003,179451.86,185289.169139
3,184554.2975,193975.265929,199960.528299,182765.66,190205.402989
4,192816.9925,182797.925233,194639.025564,182455.05,188079.912164
5,185526.7425,187000.169753,181726.200135,176235.19,182606.731375
6,177600.815,195398.769165,198068.6368,163989.06,183573.919647
7,175902.3,175374.020482,169915.927191,167114.17,172076.213501
8,178982.62,173580.152563,206903.596145,172366.03,182682.834688
9,123154.55,114116.880539,116041.966118,115120.89,117098.444681


In [5]:
#prices["Weighted Average"] = (prices["Random Forest Prediction"]* 0.15)+ (prices["Gradient Boosted Regressor Prediction"]*0.2)+ (prices["OLS Prediction"]*0.05)+ (prices["XGBoost Regressor Prediction"]*0.6)

#prices.head(10)

In [6]:
#weighted = prices["Weighted Average"]
weighted = pd.DataFrame(i for i in range(1461,2920))
weighted = weighted.rename(columns={0:'Id'})
weighted["SalePrice"] = prices['Weighted Average'].round(2)
weighted = weighted.set_index("Id")

pd.DataFrame(weighted).to_csv(f"{cmn.export_data_path}Weighted Submission.csv")

In [7]:
weighted

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,123625.37
1462,151927.16
1463,185289.17
1464,190205.40
1465,188079.91
...,...
2915,77575.40
2916,76588.76
2917,172423.31
2918,105935.30


In [8]:
df_top = pd.read_csv("Exported_Data/submission_regression_top0.3.csv", index_col="Id")
df_top = df_top.rename(columns={"SalePrice":"Top 0.3%"})
weighted = weighted.rename(columns={"SalePrice":"Our Weighted Result"})
display(df_top.head(10), weighted.head(10))

Unnamed: 0_level_0,Top 0.3%
Id,Unnamed: 1_level_1
1461,122344.0
1462,161549.0
1463,184424.0
1464,195709.0
1465,187205.0
1466,172682.0
1467,173095.0
1468,165704.0
1469,187760.0
1470,126938.0


Unnamed: 0_level_0,Our Weighted Result
Id,Unnamed: 1_level_1
1461,123625.37
1462,151927.16
1463,185289.17
1464,190205.4
1465,188079.91
1466,182606.73
1467,183573.92
1468,172076.21
1469,182682.83
1470,117098.44


In [9]:
weighted["Top 0.3% Result"] = df_top
weighted.head(10)

Unnamed: 0_level_0,Our Weighted Result,Top 0.3% Result
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1461,123625.37,122344.0
1462,151927.16,161549.0
1463,185289.17,184424.0
1464,190205.4,195709.0
1465,188079.91,187205.0
1466,182606.73,172682.0
1467,183573.92,173095.0
1468,172076.21,165704.0
1469,182682.83,187760.0
1470,117098.44,126938.0


In [16]:
weighted["% Difference"] = ((weighted.loc[:,"Our Weighted Result"]/weighted.loc[:,"Top 0.3% Result"] - 1)*100).round(2)
weighted.head(10)

Unnamed: 0_level_0,Our Weighted Result,Top 0.3% Result,% Difference
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1461,123625.37,122344.0,1.05
1462,151927.16,161549.0,-5.96
1463,185289.17,184424.0,0.47
1464,190205.4,195709.0,-2.81
1465,188079.91,187205.0,0.47
1466,182606.73,172682.0,5.75
1467,183573.92,173095.0,6.05
1468,172076.21,165704.0,3.85
1469,182682.83,187760.0,-2.7
1470,117098.44,126938.0,-7.75


In [17]:
abs(weighted["% Difference"]).describe()

count    1459.000000
mean        4.820980
std         4.650568
min         0.000000
25%         1.745000
50%         3.620000
75%         6.555000
max        60.370000
Name: % Difference, dtype: float64

In [18]:
y = np.log1p(weighted)
y

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


Unnamed: 0_level_0,Our Weighted Result,Top 0.3% Result,% Difference
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1461,11.725019,11.714600,0.717840
1462,11.931163,11.992570,
1463,12.129678,12.124998,0.385262
1464,12.155865,12.184389,
1465,12.144628,12.139965,0.385262
...,...,...,...
2915,11.259019,11.385433,
2916,11.246219,11.354445,
2917,12.057714,12.015929,1.662030
2918,11.570593,11.632041,


In [15]:
import numpy as np
from sklearn.metrics import mean_squared_error
rmsweight = np.sqrt(mean_squared_error(y.loc[:,"Our Weighted Result"], y.loc[:,"Top 0.3% Result"]))
print(rmsweight)

0.06688267413574647
