# Imports for the exercise

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor,StackingRegressor,VotingRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,make_scorer
from sklearn.model_selection import GridSearchCV,KFold,train_test_split,RepeatedKFold,cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import scipy.stats
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline

# Show up to 90 rows when a frame/series is displayed. The fully qualified
# option name is required: the bare 'max_rows' pattern matches more than one
# option in recent pandas releases and raises OptionError.
pd.set_option('display.max_rows', 90)


# * **#import train and test data**

In [None]:
# Load both competition splits from the Kaggle input directory.
house_train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
house_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Set aside the test ids and the label column before the frames are altered.
test_ids = house_test["Id"]
target = house_train['SalePrice']

# Preview the first training rows.
house_train.head()

# **# combine train+test sets**

In [None]:
# Stack the training rows on top of the test rows and renumber the index
# 0..n-1, so the cleaning steps below run once over both splits.
data1 = pd.concat([house_train, house_test], axis=0).reset_index(drop=True)
# Display the combined frame.
data1

# Drop Id (a unique row identifier) and SalePrice (the target variable we want to predict)

In [None]:
# Remove the identifier and the label from the feature frame (modifies data1 in place).
data1.drop(columns=['Id', 'SalePrice'], inplace=True)

In [None]:
# Work on a copy so data1 keeps the untouched combined features.
data2 = data1.copy()

# **Check missing values**

In [None]:
# Number of missing entries in each column.
data2.isna().sum()

# Visualize missing values

In [None]:
# Bar chart of the missing-value count per feature; features without any
# missing value are left out and the bars are sorted in ascending order.
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
sns.set_color_codes(palette='deep')

missing = data2.isnull().sum()
missing = missing.loc[missing.gt(0)].sort_values()
missing.plot.bar(color="b")

ax.xaxis.grid(False)
ax.set(ylabel="missing values", xlabel="Features", title="missing data by feature")
sns.despine(trim=True, left=True)

In [None]:
# Print the dtype and non-null count of every column.
data2.info()

# Use the right data type for MSSubClass: it is not an ordinal feature, so cast it to string and treat it as categorical


In [None]:
# Cast the numeric MSSubClass codes to strings so the column is handled as a
# categorical feature (it is one-hot encoded later) instead of a number.
data2['MSSubClass'] = data2['MSSubClass'].astype(str)

# lets visualize the features

In [None]:
# Collect the names of the numeric features. The excluded names are
# engineered/indicator columns; presumably none of them exists in data2 yet,
# so the guard only matters if this cell is re-run on an extended frame.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
excluded = {'TotalSF', 'Total_Bathrooms', 'Total_porch_sf', 'haspool',
            'hasgarage', 'hasbsmt', 'hasfireplace'}
numeric = [
    name for name in data2.columns
    if data2[name].dtype in numeric_dtypes and name not in excluded
]

# Scatter every numeric feature against SalePrice to spot outliers.
# A bare figure is created (instead of plt.subplots) because the axes are
# added one by one with plt.subplot below; axes pre-created by plt.subplots
# would never be drawn on and, on current Matplotlib, stay visible behind
# the grid.
fig = plt.figure(figsize=(12, 120))
plt.subplots_adjust(right=2, top=2)
for i, feature in enumerate(numeric, 1):
    # Stop at MiscVal: it and every numeric column after it are not plotted.
    if feature == 'MiscVal':
        break
    plt.subplot(len(numeric), 3, i)
    # SalePrice only exists in the training frame, so plot from house_train.
    sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice', palette='Blues', data=house_train)

    plt.xlabel(feature, size=15, labelpad=12.5)
    plt.ylabel('SalePrice', size=15, labelpad=12.5)

    plt.tick_params(axis='x', labelsize=12)
    plt.tick_params(axis='y', labelsize=12)

    plt.legend(loc='best', prop={'size': 10})

plt.show()

# **CLEANING**

# **fill categorial missing values**

In [None]:
# NaN carries meaning in these columns (e.g. a missing Alley means the house
# has no alley), so it becomes the explicit category "None".
absent_feature_columns = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
data2[absent_feature_columns] = data2[absent_feature_columns].fillna("None")

# NaN is a plain gap in these columns: fill it with the column's most
# frequent value.
mode_filled_columns = [
    'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Electrical', 'KitchenQual', 'Functional', 'SaleType',
]
for column in mode_filled_columns:
    most_frequent = data2[column].mode()[0]
    data2[column] = data2[column].fillna(most_frequent)

In [None]:
# Snapshot after the categorical fill; numeric imputation works on data3.
data3 = data2.copy()

In [None]:
# Total number of missing cells still left in the frame.
data3.isna().sum().sum()

# # filling missing numeric values using knn function

In [None]:
def knn_impute(df, na_target):
    """Fill the missing entries of one numeric column with k-NN predictions.

    A ``KNeighborsRegressor`` with default settings is fitted on the rows
    where ``na_target`` is present, using as predictors every numeric column
    of ``df`` that has no missing value at all. It then predicts
    ``na_target`` for the rows where it is missing.

    Args:
        df: Input frame; it is not modified.
        na_target: Name of the numeric column to impute.

    Returns:
        A copy of ``df`` in which the missing ``na_target`` values are
        replaced by the predictions. If the column has no missing value the
        copy is returned unchanged.
    """
    df = df.copy()

    missing_rows = df[na_target].isna()
    if not missing_rows.any():
        # Nothing to impute; asking the regressor to predict on zero rows
        # would raise instead of being a no-op.
        return df

    numeric_df = df.select_dtypes(np.number)
    # Predictors: numeric columns that are complete (this excludes na_target).
    complete_columns = numeric_df.loc[:, numeric_df.notna().all()].columns

    X_train = numeric_df.loc[~missing_rows, complete_columns]
    y_train = numeric_df.loc[~missing_rows, na_target]
    X_test = numeric_df.loc[missing_rows, complete_columns]

    knn = KNeighborsRegressor()
    knn.fit(X_train, y_train)

    df.loc[missing_rows, na_target] = knn.predict(X_test)

    return df

In [None]:
# Numeric columns with gaps. They are imputed one after another, so a column
# completed earlier in the sequence can serve as a predictor for later ones.
numeric_gap_columns = (
    'LotFrontage', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
)
for column in numeric_gap_columns:
    data3 = knn_impute(data3, column)

In [None]:
# Snapshot of the fully imputed data.
data4 = data3.copy()

In [None]:
# Total number of missing cells left after the numeric imputation.
data4.isna().sum().sum()

In [None]:
# Feature engineering and transformations are applied to this copy.
data5=data4.copy()

# feature engineering

In [None]:
# Above-ground living area divided by a room count that also includes
# bathrooms and kitchens.
data5["SqFtPerRoom"] = data5["GrLivArea"] / (data5["TotRmsAbvGrd"] +
                                             data5["FullBath"] +
                                             data5["HalfBath"] +
                                             data5["KitchenAbvGr"])

# Sum of the overall quality and overall condition ratings. Both terms are
# read from data5, like every other engineered feature in this cell.
data5['Total_Home_Quality'] = data5['OverallQual'] + data5['OverallCond']

# Bathroom total in which each half bath counts as 0.5.
data5['Total_Bathrooms'] = (data5['FullBath'] + (0.5 * data5['HalfBath']) +
                            data5['BsmtFullBath'] + (0.5 * data5['BsmtHalfBath']))

# Combined first- and second-floor area.
data5["HighQualSF"] = data5["1stFlrSF"] + data5["2ndFlrSF"]

# transformations for features

# Check the numeric features for skew: those with |skew| >= 0.8 get a log transform to bring them closer to a normal distribution

In [None]:
# One row per numeric feature: its sample skewness and whether the absolute
# skewness reaches the 0.8 threshold.
skew_df = pd.DataFrame(data5.select_dtypes(np.number).columns, columns=['Feature'])
skew_df['Skew'] = [scipy.stats.skew(data5[name]) for name in skew_df['Feature']]
skew_df['Skewed'] = skew_df['Skew'].abs() >= 0.8
skew_df

# We use log(x+1) because log(x) is not defined at zero

In [None]:
# Replace every feature flagged as skewed by log(1 + x).
for column in skew_df.loc[skew_df['Skewed'], 'Feature']:
    data5[column] = np.log1p(data5[column])

# our goal is to predict target  - lets see it 


In [None]:
# Distribution of the raw SalePrice target.
f, ax = plt.subplots(figsize=(8, 7))
sns.set_color_codes(palette='deep')
sns.distplot(target, color="b")

# Cosmetics: no vertical grid lines, axis labels, title, trimmed spines.
ax.xaxis.grid(False)
ax.set(ylabel="Frequency", xlabel="SalePrice", title="SalePrice distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Sample skewness of the raw target.
target_skw =  scipy.stats.skew(target)
target_skw

# We can see that the target is also highly skewed, so we will also use a log transform

In [None]:
# Natural log of the target (np.log, not np.log1p), so predictions made on
# this scale are mapped back to prices with np.exp.
lg_target=np.log(target)

# lets plot now target 

In [None]:
# Distribution of the log-transformed target.
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(lg_target, color="b")

# Cosmetics: no vertical grid lines, axis labels, title, trimmed spines.
ax.xaxis.grid(False)
ax.set(ylabel="Frequency", xlabel="SalePrice", title="SalePrice distribution")
sns.despine(trim=True, left=True)

plt.show()

# We will use cos(x) to encode the month, because months form a cycle

In [None]:
# Cyclical encoding of the sale month. 0.5236 is approximately pi/6, so
# (assuming MoSold still holds the month numbers 1-12 at this point - TODO
# confirm it was not log-transformed above) twelve months span one full
# cosine period.
data5['MoSold'] = (-np.cos(0.5236 * data5['MoSold']))

# Use dummy variables to encode the categorical variables

In [None]:
# One-hot encode the string/categorical columns; numeric columns are kept.
data5 = pd.get_dummies(data5)
# Display the encoded frame.
data5

In [None]:
# Scaling is applied to this copy, leaving data5 unscaled.
data6=data5.copy()

# Scaling

In [None]:
# Standardize every column, with the statistics learned on the combined
# train+test rows, and wrap the result back into a frame that keeps the
# original index and column names.
scaler = StandardScaler()
data6 = pd.DataFrame(
    scaler.fit_transform(data6),
    index=data6.index,
    columns=data6.columns,
)

In [None]:
# Display the scaled frame.
data6

# Splitting the data back into the original train and test sets

In [None]:
# The combined frame has a 0..n-1 index with the training rows first, so the
# first len(house_train) rows are the training set and the remainder is the
# test set (re-indexed from 0).
train_f = data6.iloc[:len(house_train)].copy()
test_f = data6.iloc[len(house_train):].reset_index(drop=True).copy()

In [None]:
# Display the training features.
train_f

In [None]:
# Display the test features.
test_f

In [None]:
class ModelTuner:
    """Tune, cross-validate and compare a named set of regressors.

    All models share one training set, one CV splitter and one scorer. The
    scorer is expected to return a negated RMSE (scikit-learn's "greater is
    better" convention), which is why every score is sign-flipped before it
    is stored or printed.
    """

    def __init__(self, models, X, y, cv, loss):
        """Store the models and copies of the training data.

        Args:
            models: Mapping of model name to estimator.
            X: Training features.
            y: Training target.
            cv: Cross-validation splitter handed to scikit-learn.
            loss: Value handed to scikit-learn as ``scoring``.
        """
        # Estimators as originally supplied (shallow copy of the mapping);
        # tune_model restarts from these.
        self.init_models = models.copy()
        # Current estimator per name; replaced by the best one after tuning.
        self.models = models.copy()
        # Use the data passed in by the caller rather than module globals.
        self.X_train = X.copy()
        self.y_train = y.copy()
        self.cv = cv
        self.loss = loss
        # Mean train / validation RMSE of the best candidate, per model name.
        self.train_scores = {}
        self.valid_scores = {}

    def tune_model(self, model_name, grid, verbose=1, n_jobs=-1):
        """Grid-search one model and keep its best estimator.

        Args:
            model_name: Key of the model in the mapping given at construction.
            grid: Parameter grid for ``GridSearchCV``.
            verbose: Verbosity forwarded to ``GridSearchCV``.
            n_jobs: Parallelism forwarded to ``GridSearchCV``.

        Returns:
            A frame with one row per candidate, holding the parameters that
            had more than one value in ``grid`` plus ``mean_train_RMSE`` and
            ``mean_valid_RMSE``, sorted by ascending validation RMSE.
        """
        # Always restart from the originally supplied estimator.
        self.models[model_name] = self.init_models[model_name]
        search = GridSearchCV(
            self.models[model_name],
            param_grid=grid,
            scoring=self.loss,
            cv=self.cv,
            return_train_score=True,
            verbose=verbose,
            n_jobs=n_jobs,
        )
        search.fit(self.X_train, self.y_train)
        self.models[model_name] = search.best_estimator_

        rename_metric = {
            "mean_train_score": "mean_train_RMSE",
            "mean_test_score": "mean_valid_RMSE",
        }
        metric_columns = list(rename_metric.values())

        cv_results = pd.DataFrame(search.cv_results_)
        # Only report parameters that were actually varied.
        columns = [f"param_{param}" for param, val in grid.items() if len(val) > 1]
        columns.extend(rename_metric.keys())
        cv_results = cv_results[columns].rename(columns=rename_metric)
        # The scorer yields negated RMSE; flip the sign back.
        cv_results[metric_columns] = -cv_results[metric_columns]
        cv_results = cv_results.sort_values(by="mean_valid_RMSE", ignore_index=True)

        # After sorting, row 0 is the best candidate.
        train_score = cv_results.mean_train_RMSE.iloc[0]
        valid_score = cv_results.mean_valid_RMSE.iloc[0]
        self.train_scores[model_name] = train_score
        self.valid_scores[model_name] = valid_score

        print("="*40)
        print(f"Model: {search.best_estimator_}")
        print(f"Train RMSE: {train_score}")
        print(f"Valid RMSE: {valid_score}")
        print("="*40)

        return cv_results

    def collate_results(self):
        """Return the stored train/validation RMSE of every tuned model.

        Returns:
            A frame indexed by model name with the columns
            ``mean_train_RMSE`` and ``mean_valid_RMSE``, sorted by ascending
            validation RMSE.
        """
        train_results = pd.DataFrame.from_dict(
            self.train_scores,
            orient="index",
            columns=["mean_train_RMSE"],
        )
        valid_results = pd.DataFrame.from_dict(
            self.valid_scores,
            orient="index",
            columns=["mean_valid_RMSE"],
        )
        results = pd.concat(
            [train_results, valid_results],
            axis=1,
        ).sort_values(by="mean_valid_RMSE")

        return results

    def get_models(self, model_names=None):
        """Return ``(name, estimator)`` pairs for the requested models.

        Args:
            model_names: Iterable of model names; when empty or ``None``
                every model is returned.

        Returns:
            A list of ``(name, estimator)`` tuples, in the order requested.
        """
        if not model_names:
            model_names = self.models.keys()

        return [(name, self.models[name]) for name in model_names]

    def run_cv(self, model, n_jobs=-1):
        """Cross-validate one model and print its mean train/validation RMSE.

        Args:
            model: A model name known to the tuner, or an estimator.
            n_jobs: Parallelism forwarded to ``cross_validate``.
        """
        model = self.models[model] if model in self.models else model
        scores = cross_validate(
            model,
            X=self.X_train,
            y=self.y_train,
            cv=self.cv,
            return_train_score=True,
            scoring=self.loss,
            n_jobs=n_jobs,
        )
        train_scores = scores["train_score"]
        valid_scores = scores["test_score"]

        print(f"Model: {model}")
        print(f"Train RMSE: {-np.mean(train_scores)}")
        print(f"Valid RMSE: {-np.mean(valid_scores)}")

    def model_predict(self, model, X_test, ids=None):
        """Fit a model on the full training data and predict ``X_test``.

        Predictions are made on the natural-log scale of the training target
        and are mapped back to prices with ``np.exp``.

        Args:
            model: A model name known to the tuner, or an estimator.
            X_test: Features to predict.
            ids: Optional identifiers for the ``Id`` column; by default the
                ids are numbered consecutively starting at 1461.

        Returns:
            A frame with the columns ``Id`` and ``SalePrice``.
        """
        model = self.models[model] if model in self.models else model
        model.fit(self.X_train, self.y_train)
        y_pred = model.predict(X_test)
        if ids is None:
            ids = range(1461, 1461 + len(y_pred))

        return pd.DataFrame({
            "Id": ids,
            # Inverse of the np.log applied to the training target.
            "SalePrice": np.exp(y_pred),
        })

    def perf_boxplot(self, add_models=None, cv=None, n_jobs=-1):
        """Draw a box plot of per-fold validation RMSE for every model.

        Args:
            add_models: Optional iterable of ``(name, estimator)`` pairs to
                plot in addition to the tuner's own models.
            cv: Optional CV splitter; defaults to the tuner's splitter.
            n_jobs: Parallelism forwarded to ``cross_validate``.
        """
        models = self.models.copy()
        if add_models:
            for name, model in add_models:
                models[name] = model

        # Collect one frame per model and concatenate once at the end.
        per_model = []
        for name, model in models.items():
            valid_scores = cross_validate(
                model,
                X=self.X_train,
                y=self.y_train,
                cv=cv if cv else self.cv,
                scoring=self.loss,
                n_jobs=n_jobs,
            )["test_score"]

            per_model.append(
                pd.DataFrame({"Model": name, "Validation RMSE": -valid_scores})
            )
        perf_df = pd.concat(per_model)

        # Lowest median RMSE across models, drawn as a reference line.
        min_median = perf_df.groupby("Model")["Validation RMSE"].median().min()

        plt.figure(figsize=(18, 8))

        sns.boxplot(
            x=perf_df["Validation RMSE"],
            y=perf_df["Model"],
            flierprops={"alpha": 0.5},
        )
        plt.axvline(x=min_median, ls="--", color="coral")
        plt.ylabel("")

In [None]:
SEED = 0
# 10-fold cross-validation with shuffling; the fixed seed keeps the folds
# the same between runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=SEED)
# Negated RMSE, selected through scikit-learn's built-in scorer name. It
# yields the same values as make_scorer(mean_squared_error, squared=False,
# greater_is_better=False) but does not rely on the `squared` argument,
# which recent scikit-learn releases no longer accept.
rmse_loss = "neg_root_mean_squared_error"

# Untuned base estimators, keyed by the names used in the tuning cells.
MODELS = {
    "Ridge": Ridge(random_state=SEED),
    "Kernel SVR": SVR(),
    "Random Forest": RandomForestRegressor(random_state=SEED),
    "Gradient Boosting": GradientBoostingRegressor(random_state=SEED),
}

tuner = ModelTuner(MODELS, X=train_f, y=lg_target, cv=kfold, loss=rmse_loss)

# RIDGE

In [None]:
# Regularization strengths to try, from 1e-4 to 10 in powers of ten.
RR_PARAM = {
    "alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1., 1e1],
}

# Grid-search Ridge; the returned table is sorted by validation RMSE.
rr_results = tuner.tune_model("Ridge", grid=RR_PARAM)

rr_results

# Kernel SVR.
# 

In [None]:
# 6 x 3 x 4 x 2 = 144 candidate settings for the kernel SVR.
KSVR_PARAM = {
    "C": [1e-4, 1e-3, 1e-2, 1e-1, 1., 1e1],
    "epsilon": [0., 1e-2, 1e-1],
    "gamma": [1e-4, 1e-3, 1e-2, 1e-1],
    "kernel": ["poly", "rbf"],
}

ksvr_results = tuner.tune_model("Kernel SVR", grid=KSVR_PARAM)

# Show the five candidates with the lowest validation RMSE.
ksvr_results.head()

# Random Forest

In [None]:
# Only max_depth and n_estimators vary (4 x 3 = 12 candidates); max_features
# and bootstrap are fixed, so they do not appear in the results table.
RF_PARAM = {
    "max_depth": [8, 16, 32, 64],
    "n_estimators": [100, 200, 400],
    "max_features": ["sqrt"],
    "bootstrap": [False],
}

rf_results = tuner.tune_model("Random Forest", grid=RF_PARAM)

rf_results

# Gradient Boosting
# 

In [None]:
# 3 x 2 x 2 x 2 = 24 candidates, each with 3200 boosting stages, evaluated
# on every CV fold - the most expensive search in this notebook.
GB_PARAM = {
    "max_depth": [3, 5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 4],
    "learning_rate": [1e-2, 1e-1],
    "n_estimators": [3200],
    "max_features": ["sqrt"],
}

gb_results = tuner.tune_model("Gradient Boosting", grid=GB_PARAM)

gb_results

In [None]:
# Names of the tuned models that are averaged in the ensemble.
models_selected = ["Ridge", "Kernel SVR", "Random Forest", "Gradient Boosting"]
# (name, estimator) pairs in the format VotingRegressor expects.
indv_models = tuner.get_models(model_names=models_selected)
ensemble = VotingRegressor(indv_models)

# Cross-validate the ensemble and print its mean train/validation RMSE.
tuner.run_cv(ensemble)

In [None]:
# 10-fold CV repeated 10 times gives 100 validation scores per model for the
# box plot.
rkfold = RepeatedKFold(n_splits=10, n_repeats=10, random_state=SEED)

# Compare every model in the tuner plus the ensemble.
tuner.perf_boxplot(add_models=[("Ensemble Average", ensemble)], cv=rkfold)

In [None]:
# Refit the ensemble on the full training data and predict the test set;
# the result is a frame with the columns Id and SalePrice.
submission = tuner.model_predict(ensemble, test_f)

# Preview the first rows.
submission.head()

In [None]:
# Write the submission file with a header row and without the frame index.
submission.to_csv('./submission.csv', index=False, header=True)
