In [3]:
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [4]:
# Load preprocessed data from CSV
linear_regressor_data = pd.read_csv('../processed_data/linear_data.csv')
random_forest_data = pd.read_csv('../processed_data/linear_data.csv')

# Separate features (X) and target (y)
lr_X = linear_regressor_data.drop(columns=['price']) # Assuming 'price' is the target variable
lr_y = linear_regressor_data['price']

rf_X = random_forest_data.drop(columns=['price']) # Assuming 'price' is the target variable
rf_y = random_forest_data['price']

# Split data into training and testing sets
lr_X_train, lr_X_test, lr_y_train, lr_y_test = train_test_split(lr_X, lr_y, test_size=0.2, random_state=42)
rf_X_train, rf_X_test, rf_y_train, rf_y_test = train_test_split(lr_X, lr_y, test_size=0.2, random_state=42)

In [5]:
# Initialize models
linear_model = LinearRegression()
random_forest_model = RandomForestRegressor(random_state=42)

# Fit Models
linear_model.fit(lr_X_train, lr_y_train)
random_forest_model.fit(rf_X_train, rf_y_train)

# Define 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
# Evaluate Linear Regression on test data
lr_y_pred = linear_model.predict(lr_X_test)
lr_test_r2 = r2_score(lr_y_test, lr_y_pred)
lr_test_mse = mean_squared_error(lr_y_test, lr_y_pred)

# Evaluate Random Forest Regressor on test data
rf_y_pred = random_forest_model.predict(rf_X_test)
rf_test_r2 = r2_score(rf_y_test, rf_y_pred)
rf_test_mse = mean_squared_error(rf_y_test, rf_y_pred)

# Print results
print("Linear Regression on Test Data:")
print(f"Predictions: {lr_y_pred}")
print(f"Avg Predictions: {lr_y_pred.mean()}")
print(f"R²: {lr_test_r2}")
print(f"MSE: {lr_test_mse}")

print("\nRandom Forest Regressor on Test Data:")
print(f"Predictions: {rf_y_pred}")
print(f"Avg Predictions: {rf_y_pred.mean()}")
print(f"R²: {rf_test_r2}")
print(f"MSE: {rf_test_mse}")

Linear Regression on Test Data:
Predictions: [12.7431303  11.42058298  5.98541763 ... 18.25645164  8.24065532
 12.12793595]
Avg Predictions: 16.5467784013855
R²: 0.9284076244114665
MSE: 6.241453558623612

Random Forest Regressor on Test Data:
Predictions: [ 7.76428725 10.7824511   7.54506381 ... 16.19674329  9.09728283
 10.66570347]
Avg Predictions: 16.541862340153997
R²: 0.9698302209967633
MSE: 2.630214083199093


In [7]:
# Perform K-Fold cross-validation for Linear Regression
lr_r2_scores = cross_val_score(linear_model, lr_X, lr_y, scoring='r2')
lr_mse_scores = -cross_val_score(linear_model, lr_X, lr_y, scoring='neg_mean_squared_error')

In [8]:
# Perform K-Fold cross-validation for Random Forest Regressor
rf_r2_scores = cross_val_score(random_forest_model, rf_X, rf_y, cv=kf, scoring='r2')
rf_mse_scores = -cross_val_score(random_forest_model, rf_X, rf_y, cv=kf, scoring='neg_mean_squared_error')

In [10]:
# Print results
print("Linear Regression:")
print(f"R² Scores: {lr_r2_scores}")
print(f"Mean R²: {sum(lr_r2_scores) / len(lr_r2_scores)}")
print(f"MSE Scores: {lr_mse_scores}")
print(f"Mean MSE: {sum(lr_mse_scores) / len(lr_mse_scores)}")

print("\nRandom Forest Regressor:")
print(f"R² Scores: {rf_r2_scores}")
print(f"Mean R²: {sum(rf_r2_scores) / len(rf_r2_scores)}")
print(f"MSE Scores: {rf_mse_scores}")
print(f"Mean MSE: {sum(rf_mse_scores) / len(rf_mse_scores)}")

Linear Regression:
R² Scores: [0.92807045 0.92908097 0.92748786 0.92885245 0.92796395]
Mean R²: 0.9282911379560372
MSE Scores: [6.31092157 6.14750265 6.34246457 6.14178321 6.23093449]
Mean MSE: 6.234721297091413

Random Forest Regressor:
R² Scores: [0.96983177 0.96971116 0.96964991 0.9687842  0.9683056 ]
Mean R²: 0.9692565277360965
MSE Scores: [2.63007928 2.63242562 2.63194557 2.69928566 2.77118487]
Mean MSE: 2.672984200502959


### Explanation of R² and MSE

**R² (Coefficient of Determination):**  
R² measures how well the model's predictions match the observed data. It represents the proportion of the variance in the dependent variable that is explained by the independent variables.  
- **Range:** 0 to 1 (sometimes negative if the model performs worse than a simple mean prediction).  
- **Interpretation:** A higher R² indicates a better fit of the model to the data.  

**MSE (Mean Squared Error):**  
MSE measures the average squared difference between the predicted and actual values. It quantifies the error of the model's predictions.  
- **Range:** 0 to ∞ (lower is better).  
- **Interpretation:** A smaller MSE when compared to another model indicates that the model's predictions are closer to the actual values.  

### Why We Use Them
- **R²** helps us understand the proportion of variance explained by the model, providing a measure of goodness-of-fit.  
- **MSE** gives a direct measure of prediction error, helping us evaluate how far off the predictions are on average.  

By using both metrics, we gain a comprehensive understanding of the model's performance in terms of both accuracy and explanatory power.

### Result Analysis

**R² Results:**  
Both models achieved strong R² scores, indicating a good fit to the data. However, the **Random Forest Regression model slightly outperformed the Linear Regression model**, suggesting it captures more of the variance in fare prices.  
- **Linear Regression Mean R²:** 0.928  
- **Random Forest Regression Mean R²:** 0.969  

This indicates that while Linear Regression explains about 92.8% of the variance in fare prices, Random Forest explains approximately 96.9%, making it a better fit for this dataset.

**MSE Results:**  
When comparing prediction error, Random Forest also outperformed Linear Regression by a noticeable margin.  
- **Linear Regression Mean MSE:** 6.235  
- **Random Forest Regression Mean MSE:** 2.673  

This shows that, on average, Random Forest's predictions are significantly closer to the actual fare prices, with an error reduction of roughly **57%** compared to Linear Regression. The lower MSE confirms that Random Forest provides more accurate predictions for this problem.
