In [None]:
import requests
import zipfile
import io
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
# Location of the zipped Bike Sharing dataset on the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"

# Download the archive. The timeout keeps the cell from hanging indefinitely on
# a stalled connection, and raise_for_status() turns an HTTP error response
# into an immediate exception instead of a confusing BadZipFile further down.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Wrap the downloaded bytes in an in-memory ZipFile (nothing is written to disk)
zip_file = zipfile.ZipFile(io.BytesIO(response.content))

# Open the 'day.csv' member as a file-like object for pandas to read
csv_file = zip_file.open('day.csv')

In [None]:
# Load the CSV into a DataFrame and discard the 'dteday' column in one step
data = pd.read_csv(csv_file).drop(columns='dteday')

# Target is 'cnt'; predictors are every remaining column except 'cnt' and
# the 'casual' / 'registered' columns
y = data['cnt']
X = data.drop(columns=['cnt', 'casual', 'registered'])

## The Validation Set Approach

In [None]:
from sklearn.model_selection import train_test_split

# How many different validation/test partitions to try
num_iterations = 20

# One validation-set MSE is appended here per iteration
mse_scores = []

# Hold out 40% of the rows (to be divided into validation and test later);
# the remaining 60% is the training set
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)


In [None]:
# Start from an empty list so that re-running this cell does not append a
# second batch of scores to the results of a previous run
mse_scores = []

# The training set is identical in every iteration, so fit the model once
# instead of refitting it inside the loop
regressor = LinearRegression()
regressor.fit(X_train, y_train)

for i in range(num_iterations):
    # Re-partition the held-out 40% into validation and test halves;
    # the seed changes with i so every iteration sees a different partition
    X_val, X_test, y_val, y_test = train_test_split(
        X_val_test, y_val_test, test_size=0.5, random_state=i
    )

    # Score the fixed model on this iteration's validation half
    y_pred_val = regressor.predict(X_val)
    mse = mean_squared_error(y_val, y_pred_val)
    mse_scores.append(mse)

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt

# The plot title reports the variance of the scores, so it has to be computed
# in this cell; it was previously only defined in a later cell, which made
# this cell fail with a NameError on a fresh kernel.
variance = np.var(mse_scores)

# savefig does not create missing directories, so make sure the target exists
Path('pictures').mkdir(parents=True, exist_ok=True)

# Plotting the MSE scores as a bar plot, one bar per validation split
plt.figure(figsize=(10, 6))
plt.bar(range(1, len(mse_scores)+1), mse_scores, color='blue')
plt.xlabel('Iteration')
plt.ylabel('Mean Squared Error')
plt.title(f'MSE Scores for Different Validation Splits (Variance: {variance:.2f})')
plt.xticks(range(1, len(mse_scores)+1))
plt.savefig('pictures/validation_set_variance.pdf')
plt.show()

In [None]:
# Spread of the validation MSE across the different splits
variance = np.asarray(mse_scores).var()

print(f'MSE Scores: {mse_scores}')
print(f'Variance of MSE Scores: {variance}')

## Leave-One-Out

In [None]:
from sklearn.model_selection import LeaveOneOut

# Model that is refitted on every leave-one-out training set
regressor = LinearRegression()

# Cross-validator used to generate the leave-one-out train/test index pairs
loo = LeaveOneOut()

In [None]:
# One squared-error value is collected per held-out split
mse_scores = []

# Leave-one-out cross-validation over the full dataset
for train_index, test_index in loo.split(X):
    # Slice predictors and target for this split
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Fit on the training rows, then predict the held-out rows
    y_pred = regressor.fit(X_train, y_train).predict(X_test)

    # Record the error of this split
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

In [None]:
# Average the collected errors into a single cross-validation estimate
average_mse = np.asarray(mse_scores).mean()
print(f'Average Mean Squared Error: {average_mse}')

## K-Fold

In [None]:
from sklearn.model_selection import KFold

# Fresh model; it is refitted on each fold's training portion
regressor = LinearRegression()

# 10 shuffled folds, with a fixed seed so the partition is the same on every run
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# One MSE is collected per fold
mse_scores = []

# 10-fold cross-validation over the full dataset
for train_index, test_index in kf.split(X):
    # Slice predictors and target for this fold
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Fit on the training portion, then predict the held-out fold
    y_pred = regressor.fit(X_train, y_train).predict(X_test)

    # Record the error of this fold
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

# Average the fold-level errors into a single estimate
average_mse = np.asarray(mse_scores).mean()
print(f'Average Mean Squared Error: {average_mse}')

## Bootstrap

In [None]:
# The 'windspeed' column as a NumPy array: the sample that gets bootstrapped
data_windspeed = data['windspeed'].to_numpy()
data_windspeed.mean()

In [None]:
# Display the shape of the wind-speed array (i.e. how many observations it has)
data_windspeed.shape

In [None]:
from sklearn.utils import resample

# Function to perform bootstrapping
def bootstrap_confidence_interval(data, n_bootstraps=5000, ci=95, random_state=None):
    """Estimate a percentile-bootstrap confidence interval for the mean.

    Each bootstrap replicate draws ``len(data)`` observations from ``data``
    with replacement (along the first axis) and records the mean of the draw.
    The interval bounds are the ``(100 - ci) / 2`` and ``100 - (100 - ci) / 2``
    percentiles of those replicate means.

    Parameters
    ----------
    data : array-like
        Observations to resample. Must contain at least one element.
    n_bootstraps : int, default 5000
        Number of bootstrap replicates to draw.
    ci : float, default 95
        Confidence level in percent, between 0 and 100.
    random_state : None, int or numpy.random.Generator, default None
        Seed (or generator) for the resampling. Pass an int to make the
        interval reproducible; ``None`` draws fresh entropy on every call.

    Returns
    -------
    confidence_interval : numpy.ndarray
        Two-element array ``[lower, upper]``.
    bootstrap_means : list
        The ``n_bootstraps`` replicate means, in the order they were drawn.

    Raises
    ------
    ValueError
        If ``data`` is empty, ``n_bootstraps`` is smaller than 1, or ``ci`` is
        outside ``[0, 100]``.
    """
    # atleast_1d lets a bare scalar be treated as a one-observation sample
    values = np.atleast_1d(np.asarray(data))
    n_samples = values.shape[0]

    if n_samples == 0:
        raise ValueError("data must contain at least one observation")
    if n_bootstraps < 1:
        raise ValueError("n_bootstraps must be a positive integer")
    if not 0 <= ci <= 100:
        raise ValueError("ci must be between 0 and 100")

    rng = np.random.default_rng(random_state)

    # Draw one replicate at a time so memory stays proportional to len(data)
    # rather than to n_bootstraps * len(data)
    bootstrap_means = [
        np.mean(values[rng.integers(0, n_samples, size=n_samples)])
        for _ in range(n_bootstraps)
    ]

    # Split the excluded probability mass evenly between the two tails
    lower_percentile = (100 - ci) / 2
    upper_percentile = 100 - lower_percentile
    confidence_interval = np.percentile(bootstrap_means, [lower_percentile, upper_percentile])

    return confidence_interval, bootstrap_means


# Confidence level, in percent, used for the interval below
confidence_level = 95

# Bootstrap the mean of the wind-speed sample at that confidence level
confidence_interval, bootstrap_means = bootstrap_confidence_interval(
    data_windspeed,
    ci=confidence_level,
)

print(f"Estimated {confidence_level}% confidence interval for the mean: {confidence_interval}")


In [None]:
from pathlib import Path

# savefig does not create missing directories, so make sure the target exists
Path('pictures').mkdir(parents=True, exist_ok=True)

# Plotting the results
plt.figure(figsize=(6, 4))

# Histogram of bootstrap means
plt.hist(bootstrap_means, bins=30, color='blue', alpha=0.7, label='Bootstrap Means')

# Confidence interval; the labels follow confidence_level so they stay correct
# if the level used to compute the interval is changed
plt.axvline(confidence_interval[0], color='red', linestyle='--', label=f'{confidence_level}% CI Lower Bound')
plt.axvline(confidence_interval[1], color='green', linestyle='--', label=f'{confidence_level}% CI Upper Bound')

# Original sample mean
plt.axvline(data_windspeed.mean(), color='yellow', linestyle='-', label='Original Sample Mean')

plt.title('Bootstrap Means and Confidence Interval for the Mean')
plt.xlabel('Mean Value')
plt.ylabel('Frequency')
plt.legend(fontsize=12, loc='upper right', bbox_to_anchor=(1.6, 1))
# The legend is anchored outside the axes; bbox_inches='tight' grows the saved
# canvas to include it, otherwise it is cropped out of the PDF
plt.savefig('pictures/bootstrap_mean.pdf', bbox_inches='tight')
plt.show()

## Bootstrap for Model Evaluation

In [None]:
# Function to perform bootstrap for model evaluation
def bootstrap_model_evaluation(X_train, y_train, model, B=100, test_size=0.2,
                               random_state=None, ci=95):
    """Estimate a model's validation MSE and its spread via bootstrap refits.

    The supplied data is split once into a training part and a validation
    part. The model is then fitted ``B`` times, each time on a bootstrap
    sample (drawn with replacement) of the training part, and scored on the
    same fixed validation part.

    Parameters
    ----------
    X_train, y_train : array-like
        Predictors and target available for the evaluation.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the fit from the last bootstrap
        sample.
    B : int, default 100
        Number of bootstrap refits.
    test_size : float or int, default 0.2
        Passed to ``train_test_split`` to size the validation part.
    random_state : None, int or numpy.random.RandomState, default None
        Seed for both the train/validation split and the bootstrap draws.
        Pass an int for reproducible results; ``None`` keeps the previous
        behaviour of using NumPy's global random state.
    ci : float, default 95
        Width in percent of the percentile interval reported for the MSE.

    Returns
    -------
    mean_mse : float
        Mean of the ``B`` validation MSE values.
    mse_ci_lower, mse_ci_upper : float
        Lower and upper percentile bounds of the ``B`` validation MSE values.

    Raises
    ------
    ValueError
        If ``B`` is smaller than 1 or ``ci`` is outside ``[0, 100]``.
    """
    from sklearn.utils import check_random_state

    if B < 1:
        raise ValueError("B must be a positive integer")
    if not 0 <= ci <= 100:
        raise ValueError("ci must be between 0 and 100")

    # A single RandomState drives the split and every bootstrap draw, so one
    # seed makes the whole evaluation reproducible
    rng = check_random_state(random_state)

    # Split the data into training and validation sets
    X_training, X_validation, y_training, y_validation = train_test_split(
        X_train, y_train, test_size=test_size, random_state=rng
    )

    mse_scores = []
    for _ in range(B):
        # Prepare the bootstrap sample (rows drawn with replacement)
        X_sample, y_sample = resample(X_training, y_training, random_state=rng)

        # Fit the model to the bootstrap sample
        model.fit(X_sample, y_sample)

        # Evaluate the model on the fixed validation set
        y_pred = model.predict(X_validation)
        mse_scores.append(mean_squared_error(y_validation, y_pred))

    # Calculate bootstrap statistics; the excluded mass is split evenly
    # between the two tails
    lower_percentile = (100 - ci) / 2
    upper_percentile = 100 - lower_percentile
    mean_mse = np.mean(mse_scores)
    mse_ci_lower = np.percentile(mse_scores, lower_percentile)
    mse_ci_upper = np.percentile(mse_scores, upper_percentile)

    return mean_mse, mse_ci_lower, mse_ci_upper

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Rebuild the 60% training split explicitly. The names X_train / y_train are
# reassigned by the leave-one-out and K-fold loops earlier in the notebook, so
# by this point they hold the last K-fold training subset rather than the
# original training set. Same test_size and seed as the initial split, so
# this reproduces exactly those rows.
X_boot_train, X_boot_holdout, y_boot_train, y_boot_holdout = train_test_split(
    X, y, test_size=0.4, random_state=0
)

# Instantiate the models (the tree is seeded so its fits are repeatable)
linear_model = LinearRegression()
decision_tree_model = DecisionTreeRegressor(random_state=0)

# Evaluate Linear Regression model using bootstrap
lr_mean_mse, lr_mse_ci_lower, lr_mse_ci_upper = bootstrap_model_evaluation(X_boot_train, y_boot_train, linear_model)

# Evaluate Decision Tree model using bootstrap
dt_mean_mse, dt_mse_ci_lower, dt_mse_ci_upper = bootstrap_model_evaluation(X_boot_train, y_boot_train, decision_tree_model)

# Print results for Linear Regression
print(f"Linear Regression - Bootstrap Mean MSE: {lr_mean_mse:.2f}")
print(f"Linear Regression - 95% Confidence interval for the MSE: [{lr_mse_ci_lower:.2f}, {lr_mse_ci_upper:.2f}]")

# Print results for Decision Tree
print(f"Decision Tree - Bootstrap Mean MSE: {dt_mean_mse:.2f}")
print(f"Decision Tree - 95% Confidence interval for the MSE: [{dt_mse_ci_lower:.2f}, {dt_mse_ci_upper:.2f}]")
