### House Prices: Advanced Regression Techniques (data source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=test.csv)

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sb
import sidetable
%matplotlib inline

# Read the training file (rows keyed by 'Id') and separate the target from the features.
df_train = pd.read_csv('./train.csv', index_col='Id')
y_train = df_train['SalePrice']
X_train = df_train.drop('SalePrice', axis=1)

# The test features are read the same way, keyed by 'Id'.
X_test = pd.read_csv('./test.csv', index_col='Id')

# Remember which Ids belong to which split so the stacked frame can be taken apart again.
train_index, test_index = X_train.index, X_test.index

# Stack both feature frames (train rows first, then test rows).
train_test = pd.concat([X_train, X_test])

In [2]:
# Preview the top five rows of the stacked train/test feature frame
train_test.head(n=5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [3]:
# Keep only the columns whose missing-value count is below 30% of all rows.
threshold = 0.3 * len(train_test)
missing_counts = train_test.isnull().sum()
cols = missing_counts[missing_counts < threshold].index

# Narrow the stacked frame first, then rebuild the per-split frames from it by Id.
train_test = train_test[cols]
X_train = train_test.loc[train_index, cols]
X_test = train_test.loc[test_index, cols]

In [4]:
from sklearn.impute import SimpleImputer
from category_encoders.cat_boost import CatBoostEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# --- Preprocessing: impute, encode and scale the train/test feature frames ---
# Columns are split by dtype: numeric columns are imputed and scaled, every
# other column is treated as categorical, mode-imputed and target-encoded.
num_cols = train_test.select_dtypes(include='number').columns
cat_cols = train_test.select_dtypes(exclude='number').columns

# SimpleImputer only honours `fill_value` with strategy='constant', so the
# former fill_value='Unknown' had no effect; missing categories get the mode.
si = SimpleImputer(strategy='most_frequent')
ce = CatBoostEncoder()
ii = IterativeImputer()
sc = StandardScaler()

# Work on explicit copies so the column assignments below never write into
# frames derived from `train_test`.
X_train = X_train.copy()
X_test = X_test.copy()

# Remove duplicated training rows and keep the target aligned with what is
# left; dropping rows from X_train alone would desynchronise X_train/y_train.
X_train = X_train.drop_duplicates()
y_train = y_train.loc[X_train.index]
assert len(X_train) == len(y_train), "features and target must stay aligned"
# Test rows are deliberately NOT de-duplicated: every test Id needs a prediction.

# Categorical imputation: modes are learned on the stacked train+test frame.
si.fit(train_test[cat_cols])
X_train[cat_cols] = si.transform(X_train[cat_cols])
X_test[cat_cols] = si.transform(X_test[cat_cols])

# Target encoding: training rows must be encoded together with their target
# (fit_transform) so the encoder applies its ordered statistics; test rows
# are encoded without a target.
X_train[cat_cols] = ce.fit_transform(X_train[cat_cols], y_train)
X_test[cat_cols] = ce.transform(X_test[cat_cols])

# Numeric imputation, fitted on the stacked train+test frame.
ii.fit(train_test[num_cols])
X_train[num_cols] = ii.transform(X_train[num_cols])
X_test[num_cols] = ii.transform(X_test[num_cols])

# Standardisation statistics also come from the stacked train+test frame.
sc.fit(train_test[num_cols])
X_train[num_cols] = sc.transform(X_train[num_cols])
X_test[num_cols] = sc.transform(X_test[num_cols])

In [5]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold

# Recursive feature elimination scored by repeated 10-fold cross-validation.
# - RepeatedKFold takes keyword-only arguments in recent scikit-learn, so
#   n_splits is passed by name (n_repeats keeps its default).
# - random_state on both the forest and the splitter makes the selected
#   feature mask reproducible between runs.
# - n_jobs=-1 spreads the per-fold fits across all cores; this step is
#   expensive because every elimination round refits the forest on every fold.
selector = RFECV(
    RandomForestRegressor(random_state=1),
    cv=RepeatedKFold(n_splits=10, random_state=1),
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
selector = selector.fit(X_train, y_train)
# Boolean mask over X_train's columns: True for every feature that was kept.
selector.support_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

KeyboardInterrupt: 

In [None]:
# Translate the selector's boolean mask into column labels ...
cols = X_train.columns[selector.support_]

# ... and restrict both feature frames to those columns.
X_train, X_test = X_train[cols], X_test[cols]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation split from the training data; the fixed seed keeps
# the split identical from run to run.
(X_train, X_val,
 y_train, y_val) = train_test_split(
    X_train,
    y_train,
    random_state=1,
)

In [None]:
from tune_sklearn import TuneSearchCV
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: cross-validated mean squared error of an XGBRegressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one hyper-parameter configuration.

    Returns
    -------
    float
        Mean squared error averaged over every repeated 10-fold split of the
        global ``X_train`` / ``y_train``. Lower is better, so the study that
        drives this function must use ``direction='minimize'``.
    """
    # Only parameters XGBRegressor understands are sampled: the LightGBM-only
    # names 'num_leaves' and 'min_child_samples' were ignored by XGBoost and
    # just added noise dimensions to the search.
    param = {
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.1, 0.01, 0.001]),
        'min_child_weight': trial.suggest_categorical('min_child_weight', [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'subsample': trial.suggest_float('subsample', 0.2, 0.8, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.6, log=True),
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0, 1e-1, 1, 5, 10, 20, 50, 100]),
    }
    model = XGBRegressor(**param)

    # A seeded splitter gives every trial the same folds, so trial scores are
    # comparable; n_splits is passed by name because RepeatedKFold is
    # keyword-only in recent scikit-learn. cross_val_score fits its own clones
    # of `model`, so no separate fit is needed here.
    cv = RepeatedKFold(n_splits=10, random_state=1)
    neg_mse = cross_val_score(
        model, X_train, y_train,
        scoring='neg_mean_squared_error', n_jobs=-1, cv=cv,
    )

    # sklearn reports the *negated* MSE; flip the sign so that minimising the
    # returned value really minimises the error.
    return -neg_mse.mean()

In [None]:
# Run the hyper-parameter search and report the best trial.
# direction='minimize' asks Optuna for the lowest objective value.
# NOTE(review): this is only correct when `objective` returns a loss (lower is
# better); a negated score such as sklearn's 'neg_mean_squared_error' would
# need direction='maximize' instead.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1000)
trial = study.best_trial
# The objective is a regression error, not an accuracy, so label it neutrally.
print('Best objective value: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

In [None]:
from yellowbrick.model_selection import learning_curve

# Instantiate an XGBRegressor configured with the study's winning hyper-parameters
best_params = study.best_params
best = XGBRegressor(**best_params)

In [None]:
# Fit the tuned model on the training split and predict the test rows.
best.fit(X_train, y_train)
y_pred = best.predict(X_test)
# Predictions are truncated to whole numbers before being written out.
y_pred = y_pred.astype(int)
# The target of this dataset is 'SalePrice' (the column split off the training
# frame), so the prediction column must carry that name rather than 'Survived'.
y_pred = pd.DataFrame(y_pred, columns=['SalePrice'], index=X_test.index)
# Save the predictions, indexed like X_test, as a CSV file.
y_pred.to_csv('Predictions.csv')