In [1]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pandas as pd

In [2]:
# Load the Datasets
train = pd.read_csv("train_simp.csv")
test = pd.read_csv("test_simp.csv")
sample_submission = pd.read_csv("../home-data-for-ml-course/sample_submission.csv")

# Define Features (X) and Target (y)
y_train = train["SalePrice"]  # Target variable
X_train = train.drop(columns=["SalePrice"])  # Remove target from train data
X_test = test.copy()  # Keep test dataset separate
y_test = sample_submission["SalePrice"]  # Ground Truth for Test Set

# Hyperparameter Grid for SVM
param_grid = {
    "C": [0.1, 1, 10],
    "epsilon": [0.01, 0.05, 0.1],  # Fine-tune epsilon
    "kernel": ["linear", "rbf", "poly"]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(
    SVR(),
    param_grid,
    cv=5,  # 5-fold cross-validation
    scoring="r2",
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Get best hyperparameters
best_params = grid_search.best_params_
print("Best Parameters Found:", best_params)

# Train Model with Best Parameters
best_svm_model = SVR(
    C=best_params["C"],
    epsilon=best_params["epsilon"],
    kernel=best_params["kernel"]
)
best_svm_model.fit(X_train, y_train)

# Cross-Validation Score
cv_r2 = cross_val_score(best_svm_model, X_train, y_train, cv=5, scoring="r2").mean()
print(f"Cross-Validation R² Score: {cv_r2:.4f}")

# Evaluate Model on Training Data
y_train_pred = best_svm_model.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)

print("\nModel Performance on Training Set:")
print(f"Training R² Score: {train_r2:.4f}")
print(f"Training MAE: {train_mae:.2f}")
print(f"Training RMSE: {train_rmse:.2f}")

# Evaluate Model on Test Set
y_test_pred = best_svm_model.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)

print("\nModel Performance on Test Set:")
print(f"Test R² Score: {test_r2:.4f}")
print(f"Test MAE: {test_mae:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")

# Generalization Gap Check
gap = abs(train_r2 - test_r2)
print(f"\nGeneralization Gap: {gap:.4f} (Difference between Train & Test R²)")

# Save Predictions
submission = pd.DataFrame({"Id": sample_submission["Id"], "SalePrice": y_test_pred})
submission.to_csv("svr_simplified.csv", index=False)
print("Predictions saved to svr_simplified.csv")

Best Parameters Found: {'C': 10, 'epsilon': 0.1, 'kernel': 'linear'}
Cross-Validation R² Score: 0.5912


TypeError: got an unexpected keyword argument 'squared'