In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

def create_features(df):
    """Return a copy of ``df`` with engineered housing features appended.

    The input frame is not modified. It must provide the columns
    ``longitude``, ``latitude``, ``total_rooms``, ``total_bedrooms``,
    ``population``, ``households`` and ``median_income``.

    Added columns:

    * ``location_proximity`` -- Euclidean norm of (longitude, latitude).
    * ``rooms_per_household`` -- ``total_rooms / households``.
    * ``population_density`` -- ``population / households``.
    * ``bedrooms_ratio`` -- ``total_bedrooms / total_rooms``.
    * ``income_per_capita`` -- ``median_income / population``.

    A zero denominator would produce ``+/-inf`` in the ratio columns; those
    values are replaced by NaN so that a downstream imputer can treat them
    like any other missing value instead of failing on non-finite input.
    """
    df = df.copy()

    # Location-based features
    df['location_proximity'] = np.sqrt(df['longitude']**2 + df['latitude']**2)

    # Housing density features
    df['rooms_per_household'] = df['total_rooms'] / df['households']
    df['population_density'] = df['population'] / df['households']
    df['bedrooms_ratio'] = df['total_bedrooms'] / df['total_rooms']

    # Economic indicators
    df['income_per_capita'] = df['median_income'] / df['population']

    # x / 0 yields +/-inf in pandas (0 / 0 already yields NaN); normalise the
    # infinities to NaN so every undefined ratio is represented the same way.
    ratio_columns = [
        'rooms_per_household',
        'population_density',
        'bedrooms_ratio',
        'income_per_capita',
    ]
    df[ratio_columns] = df[ratio_columns].replace([np.inf, -np.inf], np.nan)

    return df

# Read the training and validation splits from disk.
train_data = pd.read_csv('../ynov-data/train_housing_train.csv')
valid_data = pd.read_csv('../ynov-data/train_housing_valid.csv')

# Pull out the regression target, then build the predictor matrix from the
# remaining columns (minus the row identifier) plus the engineered features.
y_train = train_data['median_house_value']
X_train = create_features(
    train_data.drop(columns=['median_house_value', 'id'])
)

# Partition the predictor columns by dtype: float64/int64 columns take the
# numeric route, object columns take the categorical route.
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Numeric route: median imputation, then robust scaling, then a Yeo-Johnson
# power transform.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
    ('power', PowerTransformer(method='yeo-johnson')),
])

# Categorical route: target encoding with the encoder's default settings.
categorical_transformer = TargetEncoder()

# Apply each route to its column group; any column selected by neither
# dtype filter is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

# Full estimator: preprocessing, pairwise interaction terms (no bias column,
# no pure powers), then an ordinary least-squares fit.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    (
        'poly',
        PolynomialFeatures(
            degree=2,
            interaction_only=True,
            include_bias=False,
        ),
    ),
    ('regressor', LinearRegression()),
])

# Fit every stage of the pipeline on the training predictors and target.
model.fit(X_train, y_train)

def _rmse_and_r2(y_true, y_pred):
    """Return ``(rmse, r2)`` for the given true targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred)), r2_score(y_true, y_pred)


# Score the fitted model on the same rows it was trained on.
train_predictions = model.predict(X_train)
train_rmse, train_r2 = _rmse_and_r2(y_train, train_predictions)

print(f'Training RMSE: {train_rmse:.2f}')
print(f'Training R²: {train_r2:.2f}')

# Score on the validation split, using the same feature engineering as for
# the training predictors.
X_valid = create_features(
    valid_data.drop(columns=['median_house_value', 'id'])
)
y_valid = valid_data['median_house_value']
valid_predictions = model.predict(X_valid)
rmse, r2 = _rmse_and_r2(y_valid, valid_predictions)

print(f'RMSE: {rmse:.2f}')
print(f'R²: {r2:.2f}')

# Predict on the test file, which is read without dropping a target column;
# only the row identifier is removed before feature engineering.
test_data = pd.read_csv('../ynov-data/test_housing.csv')
X_test = create_features(test_data.drop(columns='id'))
test_predictions = model.predict(X_test)

# Pair each test id with its prediction and write the result without the
# DataFrame index.
submission = test_data[['id']].assign(median_house_value=test_predictions)
submission.to_csv('../ynov-data/submission.csv', index=False)