<a href="https://colab.research.google.com/github/tejasri2005/Machine-Learning/blob/main/bias%2Cvariance%2Ccross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Step 1: Read and Preprocess Dataset
def load_dataset():
    """Build the toy purchase dataset, print it, and return it without duplicates.

    The frame has the columns ``Age``, ``Salary`` and ``Purchased``. It is
    printed once as constructed and once after duplicate rows are dropped.

    Returns:
        The de-duplicated ``pandas.DataFrame``.
    """
    # Four distinct rows, each listed twice, so half of the raw frame is duplicates.
    raw = pd.DataFrame({
        'Age': [25, 30, 35, 40] * 2,
        'Salary': [20000, 25000, 30000, 35000] * 2,
        'Purchased': [0, 1, 0, 1] * 2,
    })
    print("Original Dataset:\n", raw)

    # drop_duplicates keeps the first occurrence of each repeated row.
    deduplicated = raw.drop_duplicates()
    print("\nDataset after Removing Duplicates:\n", deduplicated)
    return deduplicated

# Step 2: Bias and Variance Calculation
def calculate_bias_variance(model, X_train, X_test, y_train, y_test):
    """Fit *model* and report its training and testing mean squared error.

    The training MSE is printed under the "Bias (Training Error)" label and
    the testing MSE under the "Variance (Testing Error)" label.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Held-out features.
        y_train: Training targets.
        y_test: Held-out targets.

    Returns:
        A ``(train_error, test_error)`` tuple of floats.
    """
    model.fit(X_train, y_train)

    # np.asarray drops any pandas index so the subtraction is purely positional.
    train_residuals = np.asarray(y_train) - np.asarray(model.predict(X_train))
    train_bias = float(np.mean(train_residuals ** 2))

    # The testing error has to be measured against y_test; the spread of the
    # predictions alone (np.var of the predictions) is not an error and
    # ignores the true targets entirely.
    test_residuals = np.asarray(y_test) - np.asarray(model.predict(X_test))
    variance = float(np.mean(test_residuals ** 2))

    print("\nBias (Training Error):", train_bias)
    print("Variance (Testing Error):", variance)
    return train_bias, variance

# Step 3: Cross Validation
def perform_cross_validation(model, X, y):
    """Print the mean K-fold cross-validated MSE of *model* on ``(X, y)``.

    Uses shuffled K-fold splitting with a fixed seed. The number of folds is
    5, or the number of samples when fewer than 5 are available.
    """
    # Never ask for more folds than there are samples.
    n_splits = min(len(X), 5)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    neg_mse_per_fold = cross_val_score(
        model, X, y, scoring='neg_mean_squared_error', cv=splitter
    )
    # The scorer yields negated MSE values, so flip the sign of the mean.
    mean_mse = -np.mean(neg_mse_per_fold)
    print("\nAverage Cross Validation Score (MSE):", mean_mse)

# Main Function
if __name__ == "__main__":
    # Build (and print) the de-duplicated toy dataset.
    dataset = load_dataset()

    # Target is the Purchased flag; features are Age and Salary.
    y = dataset['Purchased']
    X = dataset[['Age', 'Salary']]

    # Hold out 30% of the rows for testing, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
    )

    # One plain linear regressor is reused for both experiments.
    model = LinearRegression()

    # Fit on the training split and print the bias/variance figures.
    calculate_bias_variance(model, X_train, X_test, y_train, y_test)

    # Print the K-fold cross-validated MSE over the whole dataset.
    perform_cross_validation(model, X, y)


Original Dataset:
    Age  Salary  Purchased
0   25   20000          0
1   30   25000          1
2   35   30000          0
3   40   35000          1
4   25   20000          0
5   30   25000          1
6   35   30000          0
7   40   35000          1

Dataset after Removing Duplicates:
    Age  Salary  Purchased
0   25   20000          0
1   30   25000          1
2   35   30000          0
3   40   35000          1

Bias (Training Error): 0.0
Variance (Testing Error): 0.0

Average Cross Validation Score (MSE): 0.5895691609977323
