In [349]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
plt.rcParams['figure.dpi'] = 100
plt.style.use('ggplot')
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import xgboost

In [350]:
df = pd.read_csv("data/train.csv")
test = pd.read_csv('data/test.csv')
subm_df=pd.concat([df,test],axis=0)
y_train=df['SalePrice']

In [351]:
df.shape, test.shape, subm_df.shape

((1460, 81), (1459, 80), (2919, 81))

In [352]:
subm_df.drop('SalePrice', axis=1, inplace=True)

In [353]:
subm_df.shape

(2919, 80)

In [354]:
subm_cols = subm_df.columns.to_list()
subm_cols[40:]
# len(subm_cols)

['HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition']

In [355]:
subm_df.shape

(2919, 80)

In [356]:
def col_analytics(col):
    print(f'Число NaN: {subm_df[col].isna().sum()}')
    print(f'Число уникальный значений: {subm_df[col].nunique()}')
    print(subm_df[col].value_counts())
    sns.histplot(subm_df[col])

#### Форматирование колонок

In [357]:
# col_analytics('FireplaceQu')

In [358]:
col_2_encoding = ['GarageType', 'GarageFinish', 'MoSold', 'YrSold']
# col_2_reshape_2encoding = ['Fireplaces', 'FireplaceQu', 'HeatingQC', 'BsmtFullBath', 'KitchenQual', 
# 'GarageCars', 'PavedDrive', 'OpenPorchSF', 'WoodDeckSF', 'HalfBath'] # done
col_2_normalize = ['FullBath', '1stFlrSF', 'GrLivArea', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea',
                   'BedroomAbvGr']
col_2_delete = ['LowQualFinSF', '2ndFlrSF', 'BsmtHalfBath', 'Functional', 'WoodDeckSF', 
                'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'CentralAir', 'Electrical', 'GarageQual', 
                'EnclosedPorch','3SsnPorch', 'ScreenPorch', 'KitchenAbvGr', 'SaleType']

In [359]:
# Перекодирование колонки 'Fireplaces' на 1 или 0
subm_df['Fireplaces'] = subm_df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
# Перекодирование колонки 'FireplaceQu' на 1 или 0
subm_df['FireplaceQu'] = subm_df['FireplaceQu'].apply(lambda x: 1 if x in ['Gd', 'Ex'] else 0)
# Перекодирование колонки 'HeatingQC' на 1 или 0
subm_df['HeatingQC'] = subm_df['HeatingQC'].apply(lambda x: 1 if x in ['Ex'] else 0)
# Перекодирование колонки 'HeatingQC' на 1 или 0
subm_df['BsmtFullBath'] = subm_df['BsmtFullBath'].apply(lambda x: 1 if x in [1, 2] else 0)
# Перекодирование колонки 'KitchenQual' на 1 или 0
subm_df['KitchenQual'] = subm_df['KitchenQual'].apply(lambda x: 1 if x in ['Gd', 'Ex'] else 0)


# Оставить только значения 'Attchd' и 'Detchd', остальные заменить на 'Other'
garage_mapping = {'Attchd': 'Attchd', 'Detchd': 'Detchd', 'BuiltIn': 'Other', 'Basment': 'Other',
    '2Types': 'Other', 'CarPort': 'Other'}
subm_df['GarageType'] = subm_df['GarageType'].map(garage_mapping)

# Перекодирование колонки 'GarageCars' на 1 или 0
subm_df['GarageCars'] = subm_df['GarageCars'].apply(lambda x: 1 if x == 1 else 2 if x == 2 else 3 if x > 2 else 0)
# Перекодирование колонки 'OpenPorchSF' на 1 или 0
subm_df['OpenPorchSF'] = subm_df['OpenPorchSF'].apply(lambda x: 0 if x == 0 else 1)
# Перекодирование колонки 'WoodDeckSF' на 1 или 0
subm_df['WoodDeckSF'] = subm_df['WoodDeckSF'].apply(lambda x: 0 if x == 0 else 1)


In [360]:
# удаляем колонки
subm_df = subm_df.drop(columns=col_2_delete)
# one-hot encoding
subm_df = pd.get_dummies(subm_df, columns=col_2_encoding)

# Применяем StandartScaler: после его применения у всех колонок среднее станет равно 0, стандартное отклонение 1
ss_scaler = StandardScaler()
subm_df[col_2_normalize] = ss_scaler.fit_transform(subm_df[col_2_normalize])

In [361]:
bsmt_qual_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
subm_df['BsmtQual'] = subm_df['BsmtQual'].map(bsmt_qual_mapping)
exter_qual_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
subm_df['ExterQual'] = subm_df['ExterQual'].map(exter_qual_mapping)
exter_cond_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
subm_df['ExterCond'] = subm_df['ExterCond'].map(exter_cond_mapping)
heating_qc_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
subm_df['HeatingQC'] = subm_df['HeatingQC'].map(heating_qc_mapping)
subm_df['LotFrontage']=subm_df['LotFrontage'].fillna(subm_df['LotFrontage'].mean())
subm_df['BsmtQual']=subm_df['BsmtQual'].fillna(subm_df['BsmtQual'].mode()[0])
subm_df.drop(['Street', 'Alley', 'Utilities', 'LandSlope', 'Condition2', 'RoofMatl', 'Neighborhood', 'Heating'], axis=1, inplace=True)


columns_to_transform = ['MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Condition1', 'BldgType',
                        'RoofStyle', 'MasVnrArea', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'BsmtFinSF2']
for col in columns_to_transform:
    most_frequent_value = subm_df[col].mode()[0]
    subm_df[col] = subm_df[col].apply(lambda x: 1 if x == most_frequent_value else 0)



columns_to_normalize = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
                        'YearBuilt', 'YearRemodAdd', 'ExterQual', 'ExterCond', 'BsmtFinSF1',
                        'BsmtUnfSF', 'TotalBsmtSF','BsmtQual','HeatingQC']

for col in columns_to_normalize:
    subm_df[col] = (subm_df[col] - subm_df[col].min()) / (subm_df[col].max() - subm_df[col].min())

columns_to_transform = ['HouseStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtFinType1']

for column in columns_to_transform:
    value_counts = df[column].value_counts()
    most_popular_value = value_counts.index[0]
    second_popular_value = value_counts.index[1]
    subm_df[column] = subm_df[column].apply(lambda x: 2 if x == most_popular_value else (1 if x == second_popular_value else 0))

In [362]:
subm_df.shape

(2919, 73)

In [363]:
X_train=subm_df.iloc[:1460,:]
X_test=subm_df.iloc[1460:,:]
# df_Test.drop(['SalePrice'],axis=1,inplace=True)
# X_train=df_Train.drop(['SalePrice'],axis=1)
# y_train=df_Train['SalePrice']

In [364]:
X_train.shape, X_test.shape

((1460, 73), (1459, 73))

### Модель

In [365]:
regressor=xgboost.XGBRegressor()
booster=['gbtree','gblinear']
base_score=[0.25,0.5,0.75,1]

In [366]:
n_estimators = [100, 200, 300]
max_depth = [2, 3, 5, 7]
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.15,0.20]
# min_child_weight=[1,2,3,4]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    # 'min_child_weight':min_child_weight,
    'booster':booster,
    'base_score':base_score
    }

In [367]:
from sklearn.model_selection import RandomizedSearchCV
random_cv = RandomizedSearchCV(estimator=regressor,
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=50,
            scoring = 'neg_mean_squared_error',
            n_jobs = -1,
            verbose = 5,
            return_train_score = True,
            random_state=42)

In [379]:
X_train.dtypes

Id               int64
MSSubClass     float64
MSZoning         int64
LotFrontage    float64
LotArea        float64
                ...   
YrSold_2006       bool
YrSold_2007       bool
YrSold_2008       bool
YrSold_2009       bool
YrSold_2010       bool
Length: 73, dtype: object

In [383]:
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64'), dtype('O'), dtype('bool')],
      dtype=object)

In [386]:
# надо разобраться, так делать нельзя
X_train = X_train.select_dtypes(include = ['float64', 'int64', 'bool'])

In [387]:
random_cv.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "max_depth" } are not used.

[CV 2/5] END base_score=0.75, booster=gblinear, learning_rate=0.15, max_depth=3, n_estimators=200;, score=(train=-1045539959.217, test=-1189062945.406) total time=   1.8s
Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

[CV 4/5] END base_score=0.75, booster=gblinear, learning_rate=0.15, max_depth=3, n_estimators=200;, score=(train=-1135447671.711, test=-802723989.687) total time=   0.8s
[CV 3/5] END base_score=0.75, booster=gblinear, learning_rate=0.15, max_depth=3, n_estimators=200;, score=(train=-1020588178.880, test=-1352317323.666) total time=   0.9s
Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

[CV 5/5] END base_score=0.75, booster=gblinear, learning_rate=0.05, max_depth=7, n_estimators=200;, score=(train=-905331131.857, test=-2471629947.844) total time=   0.8s
Parameters: { "max_depth" } are not used.

[CV 4/5] END base

In [388]:
from sklearn.metrics import mean_squared_error
def get_score(model, X, y):
    model.fit(X, y.ravel())

    # Получаем предсказания на тестовых данных
    y_pred = model.predict(X)

    # Вычисляем среднеквадратическую ошибку (MSE)
    mse = mean_squared_error(y, y_pred)

    # Получаем корень из MSE, чтобы получить RMSE
    rmse = np.sqrt(mse)

    return f'Значение метрики RMSE: {rmse}'

In [390]:
get_score(random_cv, X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "max_depth" } are not used.

[CV 1/5] END base_score=0.75, booster=gblinear, learning_rate=0.15, max_depth=3, n_estimators=200;, score=(train=-1152251896.967, test=-750821673.378) total time=   0.7s
Parameters: { "max_depth" } are not used.

[CV 5/5] END base_score=0.75, booster=gblinear, learning_rate=0.15, max_depth=3, n_estimators=200;, score=(train=-846296241.560, test=-2396137685.313) total time=   0.8s
Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

[CV 2/5] END base_score=0.75, booster=gblinear, learning_rate=0.05, max_depth=7, n_estimators=200;, score=(train=-1125553233.843, test=-1183858266.040) total time=   1.2s
Parameters: { "max_depth" } are not used.

[CV 4/5] END base_score=0.75, booster=gblinear, learning_rate=0.15, max_depth=3, n_estimators=200;, score=(train=-1135447671.711, test=-802723989.687) total time=   0.7s
Parameters: { "max_depth" } are not used.

[CV 1/5] END base_score=1, booster=gblinear, learning_rate=0.

'Значение метрики RMSE: 16191.099143244117'

In [371]:
# df_last_40 = subm_df[subm_cols[40:]].copy()
# # удаляем колонки
# df_last_40 = df_last_40.drop(columns=col_2_delete)

# # one-hot encoding
# df_last_40 = pd.get_dummies(df_last_40, columns=col_2_encoding)

# # Применяем StandartScaler: после его применения у всех колонок среднее станет равно 0, стандартное отклонение 1
# ss_scaler = StandardScaler()
# df_last_40[col_2_normalize] = ss_scaler.fit_transform(df_last_40[col_2_normalize])

In [372]:
# df_last_40.shape

In [373]:
# df_last_40.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);

In [374]:
# Сохраняем DataFrame в CSV-файл
# df_last_40.to_csv('subm_df_last_40.csv', index=False)

In [375]:
# df_num = df.select_dtypes(include = ['float64', 'int64'])
# df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);

In [376]:
# def res_2_csv(id_column, predictions_column, file_name='submission.csv'):
#     # Создаем DataFrame из переданных столбцов
#     df = pd.DataFrame({ 'Id': id_column, 'SalePrice': predictions_column })
#     # Сохраняем DataFrame в CSV-файл
#     df.to_csv(file_name, index=False)

# # res_2_csv(test['Id'], y_hat)