In [1]:
# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tqdm as notebook_tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder, TargetEncoder, PowerTransformer
from sklearn.model_selection import GridSearchCV, KFold

# for model learning 
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

#models
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
#from category_encoders.cat_boost import CatBoostEncoder
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBRegressor

# Metrics
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error, mean_squared_error

# tunning hyperparamters model
import optuna

from sklearn.feature_selection import mutual_info_regression


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
train = pd.read_csv('data/train.csv', index_col="Id")
test = pd.read_csv('data/test.csv', index_col="Id")

In [3]:
X_train, y = train.drop('SalePrice', axis=1), train['SalePrice']

In [4]:
all_data = pd.concat([X_train, test])
y = np.log1p(y)

In [5]:
missing_val_count_by_column = (all_data.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType      1766
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
dtype: int64


In [6]:
all_data.drop(['Functional', 'LandSlope', 'RoofMatl', 'Condition2', 'RoofStyle', 'Heating', 'LotConfig'], axis=1, inplace=True)

In [7]:
all_data.drop(['LowQualFinSF', 'BsmtFinSF2', 'ScreenPorch', 'EnclosedPorch', 'MasVnrArea'], axis=1, inplace=True)

In [8]:
all_data.drop(['MoSold', 'Street', 'Utilities', 'MiscVal', 'PoolArea', 'PoolQC', '3SsnPorch', 'MiscFeature'], axis=1, inplace=True)

In [9]:
#NA_cols = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']

#all_data[NA_cols].isna().sum()

In [10]:
#all_data['MSSubClass'] = all_data['MSSubClass'].astype(str)
#all_data['YrSold'] = all_data['YrSold'].astype(str)
#all_data['MoSold'] = all_data['MoSold'].astype(str)
#all_data['GarageYrBlt'] = all_data['GarageYrBlt'].astype(str)

In [11]:
all_data['MSSubClass'] = all_data['MSSubClass'].astype(str)
all_data['YrSold'] = all_data['YrSold'].astype(str)
#all_data['MoSold'] = all_data['MoSold'].astype(str)
all_data['YearBuilt'] = all_data['YearBuilt'].astype(str)
all_data['GarageYrBlt'] = all_data['GarageYrBlt'].astype(str)
all_data['OverallQual'] = all_data['OverallQual'].astype(str)
all_data['OverallCond'] = all_data['OverallCond'].astype(str)

In [12]:
#for cols in numerical_cols:
#    miss = all_data[cols].isna().sum()
#    if miss > 0:
#        print(f'{cols}: {miss}')

In [13]:
categorical_cols = [col_name for col_name in all_data.columns if all_data[col_name].dtype == 'object']
numerical_cols = [col_name for col_name in all_data.columns if all_data[col_name].dtype != 'object']

In [14]:
imputer = ColumnTransformer(
    transformers = [
        ("num_imputer", SimpleImputer(strategy="constant", fill_value=0), numerical_cols),
        ("cat_imputer", SimpleImputer(strategy="constant", fill_value='NA'), categorical_cols)
    ],
    verbose_feature_names_out = False,
    remainder = "passthrough"
)

In [15]:
all_data = imputer.fit_transform(all_data)

In [16]:
len(all_data.columns)

59

In [17]:
# New features
all_data['Average_Qual'] = all_data['OverallQual'].astype(int) + all_data['OverallCond'].astype(int)
#drop
all_data.drop(['OverallQual', 'OverallCond'], axis=1, inplace=True)

all_data['CompAge'] = all_data['YrSold'].astype(int) - all_data['YearBuilt'].astype(int)
all_data['ReNew'] = all_data['YearRemodAdd'].astype(int) - all_data['YearBuilt'].astype(int)
#drop
all_data.drop(['YrSold', 'YearBuilt', 'YearRemodAdd'], axis=1, inplace=True)

all_data['Baths'] = all_data['BsmtFullBath'] + all_data['FullBath'] + (0.5 * all_data['HalfBath']) + (0.5 * all_data['BsmtHalfBath'])
#drop
all_data.drop(['BsmtFullBath', 'FullBath', 'HalfBath', 'BsmtHalfBath'], axis=1, inplace=True)

all_data['HQArea'] = all_data['GrLivArea'] + all_data['1stFlrSF'] + all_data['2ndFlrSF'] + 0.5 * all_data['GarageArea'] + 0.5 * all_data['TotalBsmtSF']
#drop
all_data.drop(['GrLivArea', '1stFlrSF', '2ndFlrSF', 'GarageArea', 'TotalBsmtSF'], axis=1, inplace=True)

In [18]:
categorical_cols = [col_name for col_name in all_data.columns if all_data[col_name].dtype == 'object']
numerical_cols = [col_name for col_name in all_data.columns if all_data[col_name].dtype != 'object']
# columns for encoding
ordinal_low_encoding_cols = [col_name for col_name in all_data[categorical_cols] if all_data[col_name].nunique() < 3]
ordinal_high_encoding_cols = [col_name for col_name in all_data[categorical_cols] if all_data[col_name].nunique() > 9]
one_hot_encoding_cols = [col_name for col_name in all_data[categorical_cols] if all_data[col_name].nunique() > 2
                         and all_data[col_name].nunique() < 10]

# columns for scaling
scaler_cols = [col_name for col_name in all_data[numerical_cols] if (all_data[col_name] > 10).any()]

In [19]:
# encoders
ordinal_encoder = OrdinalEncoder()
one_hot_encoder = OneHotEncoder(sparse_output=False)

#scaler
s_scaler = StandardScaler()

# pipeline for encoder and scaler
encoder_scaler = ColumnTransformer(
    [
        ("ordinal_encoding", ordinal_encoder, ordinal_low_encoding_cols + ordinal_high_encoding_cols),
        ("one_hot_encoding", one_hot_encoder, one_hot_encoding_cols),
        ("scaling_num_cols", s_scaler, scaler_cols)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough'
)

In [20]:
# encoders
ordinal_encoder = OrdinalEncoder()
one_hot_encoder = OneHotEncoder(sparse_output=False)

#scaler
s_scaler = StandardScaler()

# pipeline for encoder and scaler
scaler = ColumnTransformer(
    [
        #("ordinal_encoding", ordinal_encoder, categorical_cols),#ordinal_low_encoding_cols + ordinal_high_encoding_cols),
        #("one_hot_encoding", one_hot_encoder, one_hot_encoding_cols),
        ("scaling_num_cols", s_scaler, scaler_cols)
    ],
    verbose_feature_names_out = False
)

In [21]:
# encoders
ordinal_encoder = OrdinalEncoder()
one_hot_encoder = OneHotEncoder(sparse_output=False)

#scaler
s_scaler = StandardScaler()

# pipeline for encoder and scaler
encoder = ColumnTransformer(
    [
        ("ordinal_encoding", ordinal_encoder, categorical_cols),#ordinal_low_encoding_cols + ordinal_high_encoding_cols),
        #("one_hot_encoding", one_hot_encoder, one_hot_encoding_cols),
        #("scaling_num_cols", s_scaler, scaler_cols)
    ],
    verbose_feature_names_out = False
)

In [22]:
all_data = encoder_scaler.fit_transform(all_data)

In [23]:
encoder_scaler_ord = ColumnTransformer(
    [
        ("scaling_enc_cols", s_scaler, categorical_cols)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough'
)

In [24]:
#all_data = encoder_scaler_ord.fit_transform(all_data)

In [25]:
X, test = all_data.iloc[:y.shape[0]], all_data.iloc[y.shape[0]:]

In [26]:
X.shape, test.shape

((1460, 169), (1459, 169))

In [27]:
X.columns

Index(['CentralAir', 'MSSubClass', 'Neighborhood', 'Exterior1st',
       'Exterior2nd', 'GarageYrBlt', 'SaleType', 'MSZoning_C (all)',
       'MSZoning_FV', 'MSZoning_NA',
       ...
       'OpenPorchSF', 'Average_Qual', 'CompAge', 'ReNew', 'HQArea',
       'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'Baths'],
      dtype='object', length=169)

In [28]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
rfc = RandomForestRegressor(n_estimators=250, random_state=1)

rfc.fit(X_train, y_train)

y_train_rfc = rfc.predict(X_train)
y_valid_rfc = rfc.predict(X_valid)

In [30]:
display(root_mean_squared_error((y_train), (y_train_rfc)))
display(root_mean_squared_error(y_valid, (y_valid_rfc)))

0.04968984582063331

0.14609079045982418

In [31]:
xgbc = XGBRegressor(n_estimators=2000, learning_rate=0.05, device="cuda")

xgbc.fit(X_train, y_train)

y_train_xgbc = xgbc.predict(X_train)
y_valid_xgbc = xgbc.predict(X_valid)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [32]:
display(root_mean_squared_error((y_train), (y_train_xgbc)))
display(root_mean_squared_error((y_valid), (y_valid_xgbc)))

0.0018864627286890661

0.1411553257490111

In [33]:
best_params = {'max_leaves': 8,
          'depth': 3,
          'od_wait': 200,
          'l2_leaf_reg': 3,
          'iterations': 200000,
          'model_size_reg': 0.7,
          'learning_rate': 0.05,
          'random_seed': 42 }
final_model = CatBoostRegressor(**best_params)

In [35]:
final_model.fit(X, y_train, eval_set=(X_valid, y_valid), verbose=False)

<catboost.core.CatBoostRegressor at 0x76466758e480>

In [36]:
final_pred = final_model.predict(X_valid)

In [37]:
def dumb_test_():
    
    for i in range(0, len(test), 100):
        output = all_data.iloc[y.shape[0]:]
        tset_pred = final_model.predict(dimb_test)
        output = pd.DataFrame({'Id': dimb_test.index, 'SalePrice': tset_pred})
        X_dumb_test = pd.concat([X_train, dimb_test])

In [38]:
dimb_test = test

In [39]:
tset_pred = final_model.predict(dimb_test)

In [40]:
output = pd.DataFrame({'Id': dimb_test.index, 'SalePrice': tset_pred})

In [41]:
X_dumb_test = pd.concat([X_train, dimb_test])

In [42]:
output.set_index('Id', inplace=True)
output = output["SalePrice"]

In [43]:
y_dumb_test = pd.concat([y_train, output])

In [44]:
final_model.fit(X_dumb_test, y_dumb_test, eval_set=(X_valid, y_valid), verbose=False)

<catboost.core.CatBoostRegressor at 0x76466758e480>

In [45]:
final_pred = final_model.predict(X_valid)

In [46]:
display(root_mean_squared_error((y_train), (y_train_xgbc)))
display(root_mean_squared_error((y_valid), (final_pred)))

0.0018864627286890661

0.13043341392340324

In [48]:
preds = final_model.predict(test)
output = pd.DataFrame({'Id': test.index,
                       'SalePrice': np.exp(preds)})
output.to_csv('submission_dumb.csv', index=False)

In [None]:
#final_score = rmse(y_valid, final_pred)
#final_score

In [None]:
pd.set_option('display.max_rows', 250)
mi_scores = mutual_info_regression(X, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)
print(mi_scores)

In [None]:
HQArea           0.641368
Neighborhood     0.501150
GrLivArea        0.458812
GarageCars       0.372359
GarageArea       0.365512
TotalBsmtSF      0.358823
Average_Qual     0.358722
Baths            0.344430
CompAge          0.333822
ExterQual        0.330519
KitchenQual      0.328393
BsmtQual         0.326441
GarageYrBlt      0.299632
1stFlrSF         0.298508
MSSubClass       0.281000
GarageFinish     0.265372
TotRmsAbvGrd     0.217212
LotFrontage      0.208478
GarageType       0.207147
FireplaceQu      0.205385
2ndFlrSF         0.200384
Foundation       0.198323
LotArea          0.167144
HeatingQC        0.165700
Fireplaces       0.165019
Exterior2nd      0.163077
OpenPorchSF      0.162937
BsmtFinType1     0.151554
BsmtFinSF1       0.143730
Exterior1st      0.132918
MSZoning         0.129958
BsmtUnfSF        0.126058
MasVnrType       0.104167
WoodDeckSF       0.102232
MasVnrArea       0.096051
LotShape         0.090287
HouseStyle       0.085631
ReNew            0.083815
BsmtExposure     0.078226
GarageCond       0.076158
SaleCondition    0.075143
SaleType         0.073050
BedroomAbvGr     0.068213
GarageQual       0.068207
CentralAir       0.064137
Electrical       0.052449
PavedDrive       0.051687
BsmtCond         0.048573
BldgType         0.043102
Fence            0.039260
LandContour      0.026813
BsmtFinType2     0.026585
EnclosedPorch    0.026282
Alley            0.026269
KitchenAbvGr     0.020910
LotConfig        0.017051
ScreenPorch      0.016916
ExterCond        0.016535
Condition1       0.014328
Heating          0.012645
Functional       0.008974
RoofStyle        0.007753
RoofMatl         0.006883
BsmtFinSF2       0.005521
LowQualFinSF     0.005492
LandSlope        0.001887
Street           0.000534
Condition2       0.000484
3SsnPorch        0.000000
PoolArea         0.000000
MiscVal          0.000000
MiscFeature      0.000000
Utilities        0.000000
PoolQC           0.000000
MoSold           0.000000
