In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import tqdm
from sklearn.model_selection import ParameterGrid

# Load the Kaggle-style train/test tables.
# NOTE: despite the "_path" suffix, both names hold DataFrames, not file paths;
# the names are kept unchanged so other cells that use them keep working.
train_data_path = pd.read_csv('train.csv')
test_data_path = pd.read_csv('test.csv')

# Predictor columns: everything except the 'Id' row identifier and the
# 'SalePrice' target.
features = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']

y_tr = train_data_path['SalePrice']

# One-hot encode train and test TOGETHER. Encoding them separately and
# concatenating afterwards leaves NaN in every dummy column whose category
# occurs in only one of the two tables; those NaNs would then be "imputed"
# by KNN instead of being the 0 they really are.
n_train = len(train_data_path)
combined_features = pd.concat(
    [train_data_path[features], test_data_path[features]],
    ignore_index=True,
)
merged_data = pd.get_dummies(combined_features)

# Make sure every column label is a string (scikit-learn requires uniform
# feature-name types).
merged_data.columns = merged_data.columns.astype(str)

X_tr = merged_data.iloc[:n_train]
X_test = merged_data.iloc[n_train:]

# Fit the imputer on the training rows only and apply it to the test rows,
# mirroring how the scaler below is fit: test data must not influence
# preprocessing statistics.
imputer = KNNImputer(n_neighbors=5, weights='distance')
X_tr_imputed = pd.DataFrame(
    imputer.fit_transform(X_tr), columns=X_tr.columns, index=X_tr.index
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test), columns=X_test.columns, index=X_test.index
)
merged_data_imputed = pd.concat([X_tr_imputed, X_test_imputed])

# Target rows aligned with the (unfiltered) training feature rows.
y_tr_filtered = y_tr.loc[X_tr_imputed.index]

scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# 3^6 = 729 hyper-parameter combinations.
param_grid = {
    'n_estimators': [50, 250, 500],
    'max_depth': [3, 8, 13],
    'learning_rate': [0.1, 0.05, 0.01],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0]
}

# Select hyper-parameters by 5-fold cross-validated MSE. Picking the
# configuration with the lowest *training* MSE always rewards the most
# overfit model (deepest trees, most estimators), so it is not a valid
# selection criterion. This is an expensive search (729 x 5 fits), hence
# n_jobs=-1 to use all cores and verbose=1 for progress reporting.
model = GradientBoostingRegressor(random_state=1)
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_tr_scaled, y_tr_filtered)

best_params = grid_search.best_params_
best_mse = -grid_search.best_score_  # scoring is negated MSE
print('Best parameters:', best_params)
print('Cross-validated MSE:', best_mse)

# With the default refit=True, best_estimator_ is a clone of `model` (so it
# keeps random_state=1) refit on the full training set with best_params.
best_model = grid_search.best_estimator_

# Training error is reported for reference only; it is optimistic.
y_pred = best_model.predict(X_tr_scaled)
mse = mean_squared_error(y_tr_filtered, y_pred)
print('Training MSE:', mse)

# Predict sale prices for the test set with the selected model and write
# them out in submission format (Id, SalePrice).
predictions = best_model.predict(X_test_scaled)
output = pd.DataFrame({
    'Id': test_data_path.Id,
    'SalePrice': predictions
})

output.to_csv('predictions.csv', index=False)

 48%|▍| 350/729 [20:16<37:07,  5.88s

In [None]:
# features = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Baseline: ordinary least squares on the one-hot-encoded columns that are
# complete (no NaN) in both tables.

# Load the data. NOTE: despite the "_path" suffix, both names hold
# DataFrames, not file paths.
train_data_path = pd.read_csv('train.csv')
test_data_path = pd.read_csv('test.csv')

# Split the target from the predictors. 'Id' is a row identifier, not a
# property of the house, so it is excluded from the predictors; leaving it
# in lets the regression fit (and extrapolate on) an arbitrary row number.
X_train = train_data_path.drop(columns=['Id', 'SalePrice'])
y_train = train_data_path['SalePrice']

# One-hot encode the categorical (non-numeric) columns.
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(test_data_path.drop(columns='Id'))

# Drop every column that contains at least one NaN.
X_train_dropna = X_train_encoded.dropna(axis=1)
X_test_dropna = X_test_encoded.dropna(axis=1)

# Keep only the columns present in both frames, in the same order, so the
# model sees identical features at fit and predict time.
X_train_dropna, X_test_dropna = X_train_dropna.align(
    X_test_dropna,
    join='inner',
    axis=1
)

# Fit the linear regression model.
model = LinearRegression()
model.fit(X_train_dropna, y_train)

# Predict sale prices for the rows of test.csv.
y_predictions = model.predict(X_test_dropna)

# Save the results in submission format (Id, SalePrice). This overwrites any
# existing 'predictions.csv'.
id_column = test_data_path['Id']
predictions = pd.DataFrame({
    'Id': id_column,
    'SalePrice': y_predictions
})
predictions.to_csv(
    'predictions.csv', index = False
)