In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split

In [2]:
def load_and_preprocess_data(
    file_path: str,
    target: str = "price",
    train_size: float = 0.85,
    test_size: float = 0.15,
    random_state: int = 1234,
):
    """Load a CSV file, split it into train/test sets and one-hot encode the categorical features.

    The one-hot encoder is fitted on the training features only and then applied
    to both splits, so categories that appear only in the test split are encoded
    as all-zero rows (``handle_unknown='ignore'``) instead of raising.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    target : str, default "price"
        Name of the column to predict. Every other column is used as a feature.
    train_size, test_size : float, default 0.85 / 0.15
        Passed straight through to ``train_test_split``.
    random_state : int, default 1234
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature frames with the object-dtype columns replaced by their one-hot
        columns (appended after the remaining columns). They keep the row index
        produced by the split, so they stay aligned with ``y_train`` / ``y_test``.
    y_train, y_test : pandas.Series
        Target values for the corresponding split.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the CSV file.
    """
    # Load the data
    df = pd.read_csv(file_path)

    # Separate features from the target (raises KeyError if the target is missing)
    features = df.columns.drop(target)

    # Categorical features are the object-dtype columns among the *features* only,
    # so an object-dtype target can never end up in the encoder's column list.
    categorical_features = [col for col in features if df[col].dtype == 'object']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        df[features],
        df[target],
        train_size=train_size,
        test_size=test_size,
        random_state=random_state,
    )

    # Nothing to encode: fitting the encoder on a zero-column frame would fail.
    if not categorical_features:
        return X_train, X_test, y_train, y_test

    # One Hot Encoding (fitted on the training split only)
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    ohe.fit(X_train[categorical_features])

    def apply_ohe(ohe: OneHotEncoder, X: pd.DataFrame, cat_features: list):
        """Return a new frame where ``cat_features`` are replaced by their one-hot columns."""
        encoded_df = pd.DataFrame(
            ohe.transform(X[cat_features]),
            columns=ohe.get_feature_names_out(cat_features),
            # Reuse X's index so the rows stay aligned with the target series.
            index=X.index,
        )
        # drop() without inplace leaves the caller's frame untouched.
        return pd.concat([X.drop(columns=cat_features), encoded_df], axis=1)

    X_train = apply_ohe(ohe, X_train, categorical_features)
    X_test = apply_ohe(ohe, X_test, categorical_features)

    return X_train, X_test, y_train, y_test

#### Cross-Validation Example for House Pricing Model

Notes:
- Cross-validation is a statistical method used to estimate the performance of machine learning models.
- It involves splitting the dataset into multiple subsets (folds) and training/testing the model on different combinations of these subsets.
- This gives an estimate of how well the model performs on unseen data and makes overfitting easier to detect.

Why Cross-Validation is Important:
1. Provides a more reliable estimate of model performance compared to a single train-test split.
2. Helps identify overfitting or underfitting issues.
3. Shows how well the model generalizes to unseen data, because every observation is used for validation exactly once.

In [3]:
# Load the housing data; only the training split is used below, the test split is left untouched.
X_train, X_test, y_train, y_test = load_and_preprocess_data("./data/Housing.csv")

# Initialize the model
model = LinearRegression()

# Perform 5-fold cross-validation on the training split.
# cross_val_score returns one score per fold.
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')

# Convert negative RMSE to positive RMSE
# Explanation:
# In scikit-learn, scorers follow the convention that higher values indicate better performance.
# RMSE is an error metric where lower is better, so the scorer returns its negative value
# ('neg_root_mean_squared_error') to fit that convention.
# Flipping the sign here recovers the usual, positive RMSE for reporting.
rmse_scores = -cv_scores

# Print results (the standard deviation is NumPy's default population value, ddof=0)
print("Cross-Validation Scores (RMSE):", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Standard Deviation of RMSE:", rmse_scores.std())

# Notes:
# In this example, we used 5-fold cross-validation to evaluate the LinearRegression model.
# The training data was split into 5 folds, and the model was trained on 4 folds while being tested on the remaining fold.
# This process was repeated 5 times, and the root mean squared error (RMSE) was calculated for each fold.
# The mean and standard deviation of the RMSE provide insights into the model's performance and stability.

Cross-Validation Scores (RMSE): [1160388.30145678  925978.19039612 1055761.77560161 1139526.75245282
 1154993.4021494 ]
Mean RMSE: 1087329.6844113462
Standard Deviation of RMSE: 89074.47627959249
