In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [6]:
K = 4
splited_tranining_set_idx = np.load('splited_training_set.npy')

In [7]:
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')

In [13]:
X_training = train.values[:,:-2]
Y_training = train.values[:,-2]
X_testing = test.values[:,:]

In [188]:
class OurLinear:
    def __init__(self):
        pass
        
    def fit(self, X, y):
        leftmat = np.linalg.inv(X.T @ X)
        self.betas = leftmat @ X.T @ y
    
    def predict(self, X):
        return X @ self.betas

In [189]:
class OurRidge:
    def __init__(self, alpha):
        self.alpha = alpha
        
    def fit(self, X, y):
        leftmat = np.linalg.inv(X.T @ X + self.alpha * np.identity(X.shape[1]))
        self.betas = leftmat @ X.T @ y
    
    def predict(self, X):
        return X @ self.betas

# Stacking First Level Models Training

In [167]:
def run_first_layer_single_model(model):

    if K>splited_tranining_set_idx.shape[0]:
        raise Exception('Data split file error!')

    train_predict = [] # Store the prediction of training set
    test_predict = [] # Store the prediction of testing set
    
    for i in range(K):

        current_train_idx = splited_tranining_set_idx[i][0]
        current_test_idx = splited_tranining_set_idx[i][1]

        X = X_training[current_train_idx]
        Y = Y_training[current_train_idx]
        model.fit(X,Y) #model training method
        
        X_ = X_training[current_test_idx]
        y = model.predict(X_) #model predicting method
        train_predict.append(y)

        y = model.predict(X_testing)
        test_predict.append(y)

    '''
    Attention: No worry about the order of prediction data. It is the same as the original.
    '''
    train_predict = np.array(train_predict).reshape((-1))
    test_predict = np.mean(np.array(test_predict), axis=0)
   
    np.save('stack_train_OurRidge.npy', train_predict)
    np.save('stack_test_OurRidge.npy', test_predict)
    #np.save('stack_train_linear.npy', train_predict)
    #np.save('stack_test_linear.npy', test_predict)
    #np.save('stack_train_ridge.npy', train_predict)
    #np.save('stack_test_ridge.npy', test_predict)
    #np.save('stack_train_lasso.npy', train_predict)
    #np.save('stack_test_lasso.npy', test_predict)
    #np.save('stack_train_ela.npy', train_predict)
    #np.save('stack_test_ela.npy', test_predict)
    #np.save('stack_train_svr.npy', train_predict)
    #np.save('stack_test_svr.npy', test_predict)
    #np.save('stack_train_XGboost.npy', train_predict)
    #np.save('stack_test_XGboost.npy', test_predict)
    #np.save('stack_train_random.npy', train_predict)
    #np.save('stack_test_random.npy', test_predict)

    return train_predict, test_predict

In [151]:
run_first_layer_single_model(OurRidge(alpha = 0.6))

(array([12.24023724, 12.15026235, 12.28159158, ..., 12.63147261,
        11.8632802 , 11.99461195]),
 array([11.66833974, 11.9455894 , 12.19393371, ..., 12.24789493,
        11.78391931, 12.30328835]))

In [31]:
from sklearn.linear_model import Ridge
run_first_layer_single_model(Ridge(alpha=3.3))

(array([12.24727721, 12.08575467, 12.29735278, ..., 12.5378914 ,
        11.83731524, 11.91291581]),
 array([11.65125401, 11.9027024 , 12.15443032, ..., 12.07493287,
        11.70601288, 12.29649916]))

In [168]:
from sklearn import linear_model
run_first_layer_single_model(linear_model.Lasso(alpha=0.005))

(array([12.26927438, 12.08381325, 12.34514302, ..., 12.37111328,
        11.79630043, 11.9076671 ]),
 array([11.67504294, 11.93120696, 12.22430572, ..., 12.08630889,
        11.73044686, 12.38561072]))

In [8]:
from sklearn.linear_model import ElasticNet
run_first_layer_single_model(ElasticNet(alpha=3.305,l1_ratio=0.005/3.305))

(array([12.26117799, 11.97588199, 12.28077852, ..., 12.09081417,
        11.82912887, 12.05422902]),
 array([11.90529965, 12.13902319, 12.21172229, ..., 11.93254249,
        11.99474709, 12.34478927]))

In [35]:
from sklearn.ensemble import RandomForestRegressor
run_first_layer_single_model(RandomForestRegressor(n_estimators=100, max_depth=50))

(array([12.22606546, 12.0221238 , 12.23136315, ..., 12.40806222,
        11.81158238, 11.92865126]),
 array([11.71135305, 11.93760636, 12.09517533, ..., 11.94733236,
        11.6444984 , 12.37434051]))

# Stacking Second Level Models Training

In [8]:
def data_concatenate(train_file_path, test_file_path):
	# This function is untested. 	
	X=[]
	X_=[]
	for train_file, test_file in zip(train_file_path, test_file_path):
		X.append(np.load(train_file))
		X_.append(np.load(test_file))
		
	X = np.array(X).T
	X_= np.array(X_).T
	
	return X, X_ 

In [197]:
def run_second_layer_model(model):
	#file_path_list. Add the path of all the result from first layer regressor.
	#Be symmetrical.
	train_file_path=['stack_train_svr.npy', 'stack_train_random.npy','stack_train_linear.npy','stack_train_OurRidge.npy','stack_train_lasso.npy','stack_train_XGboost.npy'] 
	test_file_path=['stack_test_svr.npy','stack_test_random.npy','stack_test_linear.npy','stack_test_OurRidge.npy','stack_test_lasso.npy','stack_test_XGboost.npy']
	
	X, X_ = data_concatenate(train_file_path, test_file_path)
	Y = Y_training
	
	model.fit(X, Y)
	result = model.predict(X_)

	return result

In [198]:
from sklearn.linear_model import LinearRegression
result = run_second_layer_model(LinearRegression())

In [199]:
result

array([11.67852089, 11.96001317, 12.1525685 , ..., 12.06660173,
       11.68639331, 12.31652302])

# Save Single Model Predicted Result for Kaggle Submission

In [200]:
sub = np.exp(result).reshape((1459,1))

In [201]:
sub

array([[118009.55641201],
       [156375.14402067],
       [189580.40253807],
       ...,
       [173963.66495771],
       [118942.24402732],
       [223356.18583881]])

In [202]:
sub1 = pd.DataFrame(sub,columns = ['SalePrice'])

In [203]:
sub2 = pd.DataFrame(np.arange(1461,2920),columns = ['Id'])
sub2['SalePrice'] = sub1['SalePrice']

In [204]:
sub2.to_csv('final_stack_result.csv',index = False)

# Cross Validation for second level model -- Linear Reg is the best

In [14]:
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, x, y, scoring="neg_mean_squared_error", cv = 4))
    return(rmse)

In [15]:
def CV(model):
    CV_score = []
    for i in range(K):
        current_train_idx = splited_tranining_set_idx[i][0]
        current_test_idx = splited_tranining_set_idx[i][1]
        X0 = X[current_train_idx]
        Y0 = Y[current_train_idx]
    
        reg_cur = model
        reg_cur.fit(X0, Y0)
    
        X0_ = X[current_test_idx]
        Y0_ = reg_cur.predict(X0_)
    
        Y0_real = Y[current_test_idx]
    
        cuurent_score = np.sqrt(sum((Y0_ - Y0_real)**2)/len(Y0_real))
        CV_score += [cuurent_score]
        return np.mean(CV_score)

In [22]:
#train_file_path=['stack_train_svr.npy', 'stack_train_random.npy','stack_train_ridge.npy','stack_train_lasso.npy','stack_train_ela.npy'] 
#test_file_path=['stack_test_svr.npy','stack_test_random.npy','stack_test_ridge.npy','stack_test_lasso.npy','stack_test_ela.npy']
train_file_path=['stack_train_svr.npy', 'stack_train_random.npy','stack_train_ridge.npy','stack_train_lasso.npy','stack_train_XGboost.npy','stack_train_ela.npy'] 
test_file_path=['stack_test_svr.npy','stack_test_random.npy','stack_train_ridge.npy','stack_test_lasso.npy','stack_test_XGboost.npy','stack_test_ela.npy']

X, X_ = data_concatenate(train_file_path, test_file_path)
Y = Y_training

In [23]:
from sklearn.model_selection import cross_val_score

In [24]:
from sklearn.linear_model import LinearRegression
np.mean(np.sqrt(-cross_val_score(LinearRegression(), X, Y, scoring="neg_mean_squared_error", cv = 4)))

0.11201154130289363

In [108]:
from sklearn.linear_model import Ridge
np.mean(np.sqrt(-cross_val_score(Ridge(alpha=1), X, Y, scoring="neg_mean_squared_error", cv = 4)))

0.11202305292524987

In [111]:
from sklearn import linear_model
np.mean(np.sqrt(-cross_val_score(linear_model.Lasso(alpha=0.0001), X, Y, scoring="neg_mean_squared_error", cv = 4)))

0.11204481785339608

In [112]:
np.mean(np.sqrt(-cross_val_score(SVR(), X, Y, scoring="neg_mean_squared_error", cv = 4)))

0.11343944808611783

In [113]:
from sklearn.ensemble import RandomForestRegressor
np.mean(np.sqrt(-cross_val_score(RandomForestRegressor(), X, Y, scoring="neg_mean_squared_error", cv = 4)))

0.12497580123606408

In [115]:
CV(OurLinear())

0.09967050347764243

In [119]:
CV(OurRidge(alpha = 0.))

0.09967342555231554