# In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

# Chained regression imputation of the three columns that contain gaps.
#
# The columns are filled in a fixed order -- house age (X2), then distance to
# the nearest MRT station (X3), then number of convenience stores (X4) -- so
# that each later model can use the already-imputed earlier columns as inputs.
# NOTE(review): the target 'Y house price of unit area' is used as a predictor
# in all three models; that only works for data where Y is known -- confirm
# this is intended.

# Load the training data.
train = pd.read_csv('prices_train.csv')

# Columns that get imputed, in imputation order.
AGE_COL = 'X2 house age'
MRT_COL = 'X3 distance to the nearest MRT station'
STORES_COL = 'X4 number of convenience stores'

# Rows with no missing value in any column; used to train the X2 model.
train_data = train.dropna()

# Base predictors for each imputed column.
X2_features = ['X1 transaction date', 'X5 latitude', 'X6 longitude', 'Y house price of unit area']
X3_features = ['X1 transaction date', 'X2 house age', 'X5 latitude', 'X6 longitude', 'Y house price of unit area']
X4_features = ['X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station',
               'X5 latitude', 'X6 longitude', 'Y house price of unit area']


def _build_x4_features(frame):
    """Return the design matrix of the X4 model for the rows of *frame*.

    The matrix holds the base ``X4_features`` columns plus two engineered
    ones: ``lat_lon`` (latitude * longitude) and ``dist_stores``
    (``log1p`` of the MRT distance). Training and prediction both go through
    this helper so the column names and their order always agree.
    """
    features = frame[X4_features].copy()
    features['lat_lon'] = frame['X5 latitude'] * frame['X6 longitude']
    # Despite its name this column is the log-distance to the MRT station; the
    # name is kept because the fitted model records its input column names.
    features['dist_stores'] = np.log1p(frame[MRT_COL])
    return features


# Degree-2 polynomial expansions for the X2 and X3 models.
poly_X2 = PolynomialFeatures(degree=2, include_bias=False)
poly_X3 = PolynomialFeatures(degree=2, include_bias=False)

# X2: fit the expansion on the fully observed rows.
X2_poly_features = poly_X2.fit_transform(train_data[X2_features])

# X3: use the rows where both the X2 predictor and the X3 target are observed.
X3_train_data = train.dropna(subset=[AGE_COL, MRT_COL])
X3_poly_features = poly_X3.fit_transform(X3_train_data[X3_features])

# X4: linear predictors plus the engineered columns, on rows where X2, X3 and
# X4 are all observed.
X4_train_data = train.dropna(subset=[AGE_COL, MRT_COL, STORES_COL])
X4_train_features = _build_x4_features(X4_train_data)

# Ridge-regularised model for X2.
X2_ridge_model = Ridge(alpha=10.0)
X2_ridge_model.fit(X2_poly_features, train_data[AGE_COL])

# Ridge-regularised model for X3, fitted on log1p(distance); predictions are
# mapped back with expm1 below.
X3_ridge_model = Ridge(alpha=1.0)
X3_ridge_model.fit(X3_poly_features, np.log1p(X3_train_data[MRT_COL]))

# Ridge-regularised model for X4.
X4_model = Ridge(alpha=0.5)
X4_model.fit(X4_train_features, X4_train_data[STORES_COL])

# Masks of the rows to fill, taken from the ORIGINAL frame so that values
# imputed below are never mistaken for observed ones.
missing_X2 = train[AGE_COL].isna()
missing_X3 = train[MRT_COL].isna()
missing_X4 = train[STORES_COL].isna()

train_filled = train.copy()

# Each step is skipped when its column has no gaps: scikit-learn refuses to
# transform/predict on an input with zero rows.

# Fill X2.
if missing_X2.any():
    X2_missing_poly = poly_X2.transform(train.loc[missing_X2, X2_features])
    # A house age cannot be negative, so clip the linear prediction at 0.
    train_filled.loc[missing_X2, AGE_COL] = X2_ridge_model.predict(X2_missing_poly).clip(0)

# Fill X3, reading predictors from train_filled so the imputed X2 is used.
if missing_X3.any():
    X3_missing_data = train_filled.loc[missing_X3, X3_features].copy()
    X3_missing_poly = poly_X3.transform(X3_missing_data)
    # expm1 undoes the log1p applied to the target; a negative log-prediction
    # would give a distance in (-1, 0), hence the clip at 0.
    train_filled.loc[missing_X3, MRT_COL] = np.expm1(
        X3_ridge_model.predict(X3_missing_poly)
    ).clip(0)

# Fill X4, reading predictors from train_filled so the imputed X2 and X3 are used.
if missing_X4.any():
    X4_missing_features = _build_x4_features(train_filled.loc[missing_X4])
    # Store counts are non-negative integers: round, then clip at 0.
    train_filled.loc[missing_X4, STORES_COL] = np.round(
        X4_model.predict(X4_missing_features)
    ).clip(0)

# Save the imputed dataset.
train_filled.to_csv('prices_train_filled.csv', index=False)