# House Price Prediction (Pipeline Method)

In [None]:
# Upgrade every package in the active conda environment without prompting (-y).
# NOTE(review): nothing is pinned, so library versions (and therefore results)
# can differ from one run to the next — consider pinning the versions you need.
!conda update --all -y

**Libraries used**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from scipy import stats
from sklearn.impute import KNNImputer, SimpleImputer
from scipy.stats.mstats import winsorize

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import set_config

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, MaxAbsScaler
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import RidgeCV, Ridge, LassoCV
from sklearn.ensemble import VotingRegressor, GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from xgboost import XGBRegressor

In [None]:
# Render matplotlib figures directly in the notebook output.
%matplotlib inline
# Default figure size (width, height) in inches for every plot in this notebook.
plt.rcParams['figure.figsize'] = (25, 15)
# Display scikit-learn estimators as HTML diagrams instead of plain-text reprs.
set_config(display='diagram')

**Read the input file and display its contents**

In [None]:
# Load the training set and discard the row identifier, which carries no
# predictive signal; chaining avoids mutating the frame in place.
data = (
    pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
    .drop(columns=['Id'])
)
data.head()

**List all the features in the data that have missing (NaN) values**

In [None]:
# Names of every column that contains at least one missing value.
has_missing = data.isna().any()
has_missing[has_missing].index.tolist()

In [None]:
# Per-column dtype and non-null count, to see which columns are incomplete.
data.info()

**Replace the NaN values (note: not all NA values are invalid)**

In [None]:
# Fill the gaps in these two categorical columns with each column's most
# frequent value (value_counts() sorts by descending frequency).
for col in ('MasVnrType', 'Electrical'):
    most_common = data[col].value_counts().index[0]
    data[col] = data[col].replace(np.nan, most_common)

In [None]:
# Split the predictors into groups that receive different preprocessing.
predictors = data.drop(columns=['SalePrice'])
columns = predictors.columns
nonObjCols = predictors.select_dtypes(exclude=['object']).columns

# Numeric columns whose 75th percentile is exactly 0 go to toScale1;
# every other numeric column goes to toScale2.
q75_is_zero = predictors[nonObjCols].quantile(0.75) == 0.0
toScale1 = q75_is_zero[q75_is_zero].index.tolist()
toScale2 = [col for col in nonObjCols if col not in toScale1]

# Categorical (object-dtype) columns.
ObjCols = data.select_dtypes(include=['object']).columns

**Pipelining**

In [None]:
def _mode_impute_then_scale(scaler):
    """Return a two-step pipeline: most-frequent imputation followed by ``scaler``."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ('scaler', scaler),
    ])


# Numeric columns from toScale1 are scaled with MaxAbsScaler.
numeric_features1 = toScale1
numeric_transformer1 = _mode_impute_then_scale(MaxAbsScaler())

# The remaining numeric columns are scaled with RobustScaler.
numeric_features2 = toScale2
numeric_transformer2 = _mode_impute_then_scale(RobustScaler())

# Categorical columns are one-hot encoded; categories unseen during fit are ignored.
categorical_features = ObjCols
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer; any column not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical1', numeric_transformer1, numeric_features1),
        ('numerical2', numeric_transformer2, numeric_features2),
        ('categorical', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

In [None]:
# Predictors are every column except the target; the target is SalePrice.
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

In [None]:
# Scaler applied to the target; it is fitted on the training targets after the
# split and reused to map predictions back to the original price scale.
ScaleTarget = RobustScaler()

**Splitting data into Train and Test sets**

In [None]:
# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=18
)

# Fit the target scaler on the training targets only, then apply the same
# transformation to the held-out targets. The scaler expects 2-D input.
y_train = ScaleTarget.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test = ScaleTarget.transform(y_test.to_numpy().reshape(-1, 1))

**Prediction**

In [None]:
def prediction(X_train, y_train, X_test, y_test, estimator):
    """Fit ``estimator`` behind the shared preprocessing, report metrics and plot.

    Builds a ``Pipeline`` of (a copy of) the module-level ``preprocessor``
    followed by ``estimator``, fits it on the training data, prints RMSE,
    explained variance and the pipeline's ``score`` on the test data, and
    plots the first 200 actual vs. predicted targets.

    Parameters
    ----------
    X_train, X_test
        Feature tables handed to the pipeline's ``fit`` / ``predict`` / ``score``.
    y_train
        Training targets as a NumPy array; flattened with ``.ravel()`` before fitting.
    y_test
        Test targets, on the same scale as ``y_train``.
    estimator
        Unfitted scikit-learn regressor used as the final pipeline step.

    Returns
    -------
    Pipeline
        The fitted preprocessing + estimator pipeline.
    """
    # Local import keeps this cell self-contained.
    from sklearn.base import clone

    print (": Metrics of estimator: ", estimator, "\n")

    # Clone the preprocessor so each returned pipeline owns its own fitted
    # transformers instead of sharing (and refitting) one global instance.
    pipe = Pipeline([('preprocessor', clone(preprocessor)), ('estimator', estimator)], verbose=True)
    pipe.fit(X_train, y_train.ravel())
    pred = pipe.predict(X_test)

    # `squared=False` was deprecated in scikit-learn 1.4 and later removed;
    # taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    print("RMSE: %.4f" % rmse)
    evs = explained_variance_score(y_test, pred)
    print("EVS: %.4f" % evs)
    score = pipe.score(X_test, y_test)
    print("Score: %.4f" % score)

    # Plot only the first MAX samples so individual points stay readable.
    MAX = 200
    x = range(len(pred))[0:MAX]
    plt.scatter(x,y_test[0:MAX],color='g', linewidths=5, label='Actual')
    plt.scatter(x,pred[0:MAX],color='r', linewidths=3, label='Predicted')
    plt.plot(pred[0:MAX], color = 'r', linewidth=2)
    plt.ylabel('Sale Price', fontsize=15)
    plt.title(str(estimator), fontsize=20)
    plt.legend(loc ="upper right", fontsize=20)
    plt.show()

    return pipe

**Ridge**

In [None]:
# Ridge regression with a fixed regularisation strength (alpha=16); the fitted
# pipeline is kept for the ensembles further down.
ridge = prediction(X_train, y_train, X_test, y_test, Ridge(alpha=16))

**Ridge CV**

In [None]:
# Ridge regression that selects alpha from the listed candidates by cross-validation.
ridgecv = prediction(X_train, y_train, X_test, y_test, RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10]))

**RandomForest**

In [None]:
# Random forest of 800 trees without bootstrapping; random_state fixes the result.
randomforest = prediction(
    X_train,
    y_train,
    X_test,
    y_test,
    RandomForestRegressor(
        n_estimators=800,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        max_depth=50,
        bootstrap=False,
        random_state=2,
        n_jobs=-1,
    ),
)

**XGBoost**

In [None]:
# XGBoost regressor with library-default hyperparameters, using all CPU cores.
xgb = prediction(X_train, y_train, X_test, y_test, XGBRegressor(n_jobs=-1))

**Gradient Boost**

In [None]:
# Gradient boosting with 400 shallow (depth-2) trees; random_state fixes the result.
gradientboost = prediction(
    X_train,
    y_train,
    X_test,
    y_test,
    GradientBoostingRegressor(
        n_estimators=400,
        learning_rate=0.1,
        max_depth=2,
        subsample=1,
        random_state=1,
    ),
)

**Lasso CV**

In [None]:
# Lasso regression with its regularisation strength chosen by 10-fold cross-validation.
lassocv = prediction(X_train, y_train, X_test, y_test, LassoCV(cv=10, n_jobs=-1))

**StackingRegressor**

In [None]:
# Base learners are the pipelines fitted above; a random forest combines their
# cross-validated (cv=10) predictions.
estimators = [('ridge', ridge), ('ridgecv', ridgecv), ('lassocv', lassocv), ('randomforest', randomforest), ('xgb', xgb), ('gradientboost', gradientboost)]
sr = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=800, random_state=42, n_jobs=-1), n_jobs=-1, cv = 10)

sr.fit(X_train, y_train.ravel())
pred = sr.predict(X_test)

# `squared=False` was deprecated in scikit-learn 1.4 and later removed;
# taking the square root of the MSE works on every version.
rmse = np.sqrt(mean_squared_error(y_test, pred))
print("RMSE: %.4f" % rmse)
evs = explained_variance_score(y_test, pred)
print("EVS: %.4f" % evs)
score = sr.score(X_test, y_test)
print("Score: %.4f" % score)

# Plot only the first MAX samples so individual points stay readable.
MAX = 200
x = range(len(pred))[0:MAX]
plt.scatter(x,y_test[0:MAX],color='g', linewidths=5, label='Actual')
plt.scatter(x,pred[0:MAX],color='r', linewidths=3, label='Predicted')
plt.plot(pred[0:MAX], color = 'r', linewidth=2)
plt.ylabel('Sale Price', fontsize=15)
# Use the class name as the title: the full repr of the nested ensemble is a
# very long multi-line string that does not fit above the axes.
plt.title(type(sr).__name__, fontsize=20)
plt.legend(loc ="upper right", fontsize=20)
plt.show()

In [None]:
# Display the fitted stacking ensemble (shown as a diagram, per set_config above).
sr

**VotingRegressor**

In [None]:
# Average the predictions of every individual model plus the stacked ensemble.
estimators = [('ridge', ridge), ('ridgecv', ridgecv), ('lassocv', lassocv), ('stacked', sr), ('randomforest', randomforest), ('xgboost', xgb), ('gradientboost', gradientboost)]
# Equal weights are used on purpose: a manually weighted variant was tried and
# gave worse results than this unweighted average.
vr = VotingRegressor(estimators=estimators, n_jobs=-1)

vr.fit(X_train, y_train.ravel())
pred = vr.predict(X_test)
# `squared=False` was deprecated in scikit-learn 1.4 and later removed;
# taking the square root of the MSE works on every version.
rmse = np.sqrt(mean_squared_error(y_test, pred))
print("RMSE: %.4f" % rmse)
evs = explained_variance_score(y_test, pred)
print("EVS: %.4f" % evs)
score = vr.score(X_test, y_test)
print("Score: %.4f" % score)

# Plot only the first MAX samples so individual points stay readable.
MAX = 200
x = range(len(pred))[0:MAX]
plt.scatter(x,y_test[0:MAX],color='g', linewidths=5, label='Actual')
plt.scatter(x,pred[0:MAX],color='r', linewidths=3, label='Predicted')
plt.plot(pred[0:MAX], color = 'r', linewidth=2)
plt.ylabel('Sale Price', fontsize=15)
# Use the class name as the title: the full repr of the nested ensemble is a
# very long multi-line string that does not fit above the axes.
plt.title(type(vr).__name__, fontsize=20)
plt.legend(loc ="upper right", fontsize=20)
plt.show()

In [None]:
# Display the fitted voting ensemble (shown as a diagram, per set_config above).
vr

In [None]:
# Load the competition test set for the final submission.
# NOTE: this rebinds `data`, so from here on it no longer refers to the training frame.
data = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
# Keep the row identifiers; they are needed for the submission file.
id_values = data['Id'].to_numpy()

In [None]:
# Fill the gaps with the most frequent value seen in the TRAINING features (`X`),
# not in the test file itself: test rows must be preprocessed with statistics
# learned from the training data, exactly like the imputers inside the pipeline.
for col in ('MasVnrType', 'Electrical'):
    train_mode = X[col].value_counts().index[0]
    data[col] = data[col].fillna(train_mode)

In [None]:
# Predict on the test features (identifier column removed), then map the
# scaled predictions back to the original SalePrice scale.
submission_features = data.drop(columns=['Id'])
pred = vr.predict(submission_features)
pred = ScaleTarget.inverse_transform(pred.reshape(-1, 1))

In [None]:
# Start the submission table with the identifier column.
result = pd.DataFrame({'Id': id_values})

In [None]:
# `pred` is a single-column 2-D array (reshaped before inverse_transform);
# it becomes the SalePrice column of the submission.
result['SalePrice'] = pred

In [None]:
# Show the submission table as a final sanity check.
result

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
result.to_csv("Result_p.csv", index=False)