In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
import numpy as np

# Load the datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Extracting the target variable
y = train_data['price_doc']
train_data.drop(['price_doc'], axis=1, inplace=True)

# Preprocessing
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)
train_data, test_data = train_data.align(test_data, join='inner', axis=1)

# Parameter grid
poly_degrees = [1, 2, 3]
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]
pca_components = [0.85, 0.90, 0.95]

best_rmse = float('inf')
best_config = None
best_n_features = None

for scaler in scalers:
    scaled_train_data = scaler.fit_transform(train_data)
    scaled_test_data = scaler.transform(test_data)

    for degree in poly_degrees:
        poly = PolynomialFeatures(degree=degree, interaction_only=False, include_bias=False)
        X_poly = poly.fit_transform(scaled_train_data)

        for comp in pca_components:
            pca = PCA(n_components=comp)
            X_pca = pca.fit_transform(X_poly)

            X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

            model = LinearRegression()
            model.fit(X_train, y_train)

            y_pred = model.predict(X_val)
            rmse = np.sqrt(mean_squared_error(y_val, y_pred))

            if rmse < best_rmse:
                best_rmse = rmse
                best_config = {'scaler': scaler, 'poly_degree': degree, 'pca_component': comp}
                best_n_features = X_train.shape[1]

print(f"Best RMSE: {best_rmse}")
print(f"Best Configuration: {best_config}")
print(f"Number of features used in the best model: {best_n_features}")

# Preprocess the test data with the best configuration
best_scaler = best_config['scaler']
best_poly = PolynomialFeatures(degree=best_config['poly_degree'], interaction_only=False, include_bias=False)
best_pca = PCA(n_components=best_config['pca_component'])

scaled_train_data = best_scaler.fit_transform(train_data)
scaled_test_data = best_scaler.transform(test_data)
X_poly_train = best_poly.fit_transform(scaled_train_data)
X_poly_test = best_poly.transform(scaled_test_data)
X_pca_train = best_pca.fit_transform(X_poly_train)
X_pca_test = best_pca.transform(X_poly_test)

# Train the final model
final_model = LinearRegression()
final_model.fit(X_pca_train, y)

# Generate predictions for the test set
final_predictions = final_model.predict(X_pca_test)

# Preparing the submission file
submission = pd.read_csv('sample_submission.csv')
submission['price_doc'] = final_predictions
submission.to_csv('submission.csv', index=False)


Number of features used: 5150
RMSE: 14574684.979716906
