In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Load preprocessed data from CSV.
# NOTE: the variable name must match the one used below; the previous spelling
# ('pre_proccessed_data') left 'pre_processed_data' undefined on a fresh kernel.
pre_processed_data = pd.read_csv('CSV FILE NAME.csv')  # Replace with your CSV file name

# Separate features (X) and target (y)
X = pre_processed_data.drop(columns=['price'])  # Assuming 'price' is the target variable
y = pre_processed_data['price']

# Split data into training and testing sets (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 5-fold cross-validation splitter (shuffled, fixed seed)
kf = KFold(shuffle=True, random_state=42, n_splits=5)

# Build each estimator and fit it on the training split in one step;
# fit() returns the estimator itself, so the names are bound to the fitted models.
linear_model = LinearRegression().fit(X_train, y_train)
random_forest_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

In [None]:
# Function to evaluate model
def _select_rows(data, positions):
  """Return the rows of ``data`` at the given integer positions.

  pandas objects (anything exposing ``.iloc``) are indexed positionally;
  everything else (e.g. a NumPy array) is indexed directly.
  """
  if hasattr(data, "iloc"):
    return data.iloc[positions]
  return data[positions]


def evaluate_model(model, X, y, kf):
  """Cross-validate ``model`` and collect per-fold R² and MSE.

  Args:
    model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is re-fitted
      in place on every fold, so after the call it holds the last fold's fit.
    X: Feature matrix (pandas DataFrame or NumPy array).
    y: Target values (pandas Series or NumPy array), aligned row-wise with ``X``.
    kf: Splitter whose ``split(X)`` yields ``(train_index, test_index)`` pairs
      of integer row positions (e.g. ``KFold``).

  Returns:
    Tuple ``(r2_scores, mse_scores)``: two lists with one entry per fold.
  """
  r2_scores = []
  mse_scores = []

  for train_index, test_index in kf.split(X):
    # The splitter yields row *positions*. Plain ``X[train_index]`` on a
    # DataFrame would look those integers up as column labels, and
    # ``y[train_index]`` on a Series would treat them as index labels,
    # so rows are selected positionally instead.
    X_train, X_test = _select_rows(X, train_index), _select_rows(X, test_index)
    y_train, y_test = _select_rows(y, train_index), _select_rows(y, test_index)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2_scores.append(r2_score(y_test, y_pred))
    mse_scores.append(mean_squared_error(y_test, y_pred))

  return r2_scores, mse_scores

In [None]:
# Cross-validated R² / MSE for the linear regression model
lr_r2_scores, lr_mse_scores = evaluate_model(model=linear_model, X=X, y=y, kf=kf)

# Cross-validated R² / MSE for the random forest model
rf_r2_scores, rf_mse_scores = evaluate_model(model=random_forest_model, X=X, y=y, kf=kf)

In [None]:
# Report per-fold scores and their averages, one print call per model
print(
    "Linear Regression:",
    f"R² Scores: {lr_r2_scores}",
    f"Mean R²: {sum(lr_r2_scores) / len(lr_r2_scores)}",
    f"MSE Scores: {lr_mse_scores}",
    f"Mean MSE: {sum(lr_mse_scores) / len(lr_mse_scores)}",
    sep="\n",
)

# The leading "\n" in the heading leaves a blank line between the two reports
print(
    "\nRandom Forest Regressor:",
    f"R² Scores: {rf_r2_scores}",
    f"Mean R²: {sum(rf_r2_scores) / len(rf_r2_scores)}",
    f"MSE Scores: {rf_mse_scores}",
    f"Mean MSE: {sum(rf_mse_scores) / len(rf_mse_scores)}",
    sep="\n",
)

### Explanation of R² and MSE

**R² (Coefficient of Determination):**  
R² measures how well the model's predictions match the actual data. It represents the proportion of the variance in the dependent variable that is predictable from the independent variables.  
- **Range:** at most 1. Values typically fall between 0 and 1, but R² can be negative when the model performs worse than always predicting the mean of the target.  
- **Interpretation:** A higher R² indicates a better fit of the model to the data.  

**MSE (Mean Squared Error):**  
MSE measures the average squared difference between the predicted and actual values. It quantifies the error of the model's predictions.  
- **Range:** 0 to ∞ (lower is better).  
- **Interpretation:** A smaller MSE indicates that the model's predictions are closer to the actual values.  

### Why We Use Them
- **R²** helps us understand the proportion of variance explained by the model, providing a measure of goodness-of-fit.  
- **MSE** gives a direct measure of prediction error in squared units of the target, penalizing large errors more heavily; taking its square root (RMSE) expresses the typical error in the target's own units.  

By using both metrics, we gain a comprehensive understanding of the model's performance in terms of both accuracy and explanatory power.