# Lab 2

In [1]:
%pip install plotly


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import plotly.graph_objects as go

## Utility Functions

In [5]:
def draw_plot(X, y, title=""):
    """Show a 2-D scatter plot of the samples, coloured by label.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, 0]`` / ``X[:, 1]``
        Column 0 is drawn on the horizontal axis ("X1"), column 1 on the
        vertical axis ("X2").
    y : array-like
        Passed to plotly as the marker colour of each point.
    title : str
        Figure title (empty by default).

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    points = go.Scatter(
        x=X[:, 0],
        y=X[:, 1],
        mode='markers',
        marker=dict(color=y),
    )
    layout_options = dict(
        title=title,
        xaxis_title="X1",
        yaxis_title="X2",
        showlegend=False,
    )

    fig = go.Figure(data=points)
    fig.update_layout(**layout_options)
    fig.show()

## Task 1

## Task 2

### Synthetic Data for Naive Bayes

In [45]:
def synthetic_dataset_nb(n_samples=500, noise_scale=0.5, random_state=None):
    """
    Generate a synthetic 4-class, 2-feature dataset for Naive Bayes.

    Each class is a Gaussian blob centred on one corner of a square
    (means at (+/-2, +/-2)). Every point is the class mean plus two
    independent normal perturbations (std 0.5 and std ``noise_scale``),
    drawn independently per feature.

    Description:
    - Classes are well separated along both axes (small overlap is possible)
    - No redundant features
    - No correlation between features

    Parameters
    ----------
    n_samples : int
        Number of points generated *per class* (the result has
        ``4 * n_samples`` rows).
    noise_scale : float
        Standard deviation of the second perturbation added to each point.
        Larger values make the classes overlap more.
    random_state : None, int, or object with a ``normal`` method
        ``None`` (default) draws from NumPy's global random state, so an
        earlier ``np.random.seed(...)`` call still controls the output.
        An int seeds a private ``np.random.RandomState``. An existing
        ``RandomState`` / ``Generator`` instance is used as-is.

    Returns
    -------
    X : ndarray of shape (4 * n_samples, 2)
        Feature matrix, rows grouped by class in label order.
    y : ndarray of shape (4 * n_samples,)
        Integer class labels 0..3.
    """
    # Resolve the random source. Falling back to the np.random module keeps
    # the exact draw sequence of the global-state implementation.
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, "normal"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Class means (arranged in a square)
    class_means = np.array([
        [2, 2],     # Class 0: top-right
        [-2, 2],    # Class 1: top-left
        [-2, -2],   # Class 2: bottom-left
        [2, -2]     # Class 3: bottom-right
    ])

    X_list = []
    y_list = []

    for class_idx, mean in enumerate(class_means):
        # Points scattered around the class mean
        X_class = rng.normal(loc=mean, scale=0.5, size=(n_samples, 2))
        # Extra noise widens each blob, i.e. makes the classes LESS separable
        X_class += rng.normal(scale=noise_scale, size=X_class.shape)
        X_list.append(X_class)
        y_list.append(np.full(n_samples, class_idx))

    # Combine all classes
    X = np.vstack(X_list)
    y = np.concatenate(y_list)

    return X, y
    

### Synthetic Data for Logistic Regression

In [1]:
def synthetic_dataset_lr(n_samples=500, correlation=0.3, noise_scale=0.5, random_state=None):
    """Generate a synthetic dataset for Logistic Regression.

    Four Gaussian blobs are generated around the corners of a square
    (means at (+/-2, +/-2)) and the whole point cloud is then multiplied by
    the Cholesky factor of ``[[1, rho], [rho, 1]]``. Because the untransformed
    cloud is symmetric and uncorrelated, the two features of the returned
    data have a Pearson correlation close to ``correlation``. The transform
    is applied to the class centres too, so they are sheared along with the
    noise.

    Description:
    - No feature independence assumption
    - Linear separability

    Parameters
    ----------
    n_samples : int
        Number of points generated *per class* (the result has
        ``4 * n_samples`` rows).
    correlation : float
        Target correlation (rho) between the two features; must lie strictly
        between -1 and 1.
    noise_scale : float
        Standard deviation of the extra noise added to every point before
        the correlation transform. Larger values make classes overlap more.
    random_state : None, int, or object with a ``normal`` method
        ``None`` (default) draws from NumPy's global random state, so an
        earlier ``np.random.seed(...)`` call still controls the output.
        An int seeds a private ``np.random.RandomState``. An existing
        ``RandomState`` / ``Generator`` instance is used as-is.

    Returns
    -------
    X_correlated : ndarray of shape (4 * n_samples, 2)
        Correlated feature matrix, rows grouped by class in label order.
    y : ndarray of shape (4 * n_samples,)
        Integer class labels 0..3.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``correlation`` is not strictly between -1 and 1 (the matrix
        would not be positive definite, so no Cholesky factor exists).
    """
    # Fail early with a readable message; same exception type that
    # np.linalg.cholesky raises for a non-positive-definite matrix.
    if not -1 < correlation < 1:
        raise np.linalg.LinAlgError(
            f"correlation must lie strictly between -1 and 1, got {correlation!r}"
        )

    # Resolve the random source. Falling back to the np.random module keeps
    # the exact draw sequence of the global-state implementation.
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, "normal"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Class means (arranged in a square)
    class_means = np.array([
        [2, 2],     # Class 0: top-right
        [-2, 2],    # Class 1: top-left
        [-2, -2],   # Class 2: bottom-left
        [2, -2]     # Class 3: bottom-right
    ])

    X_list = []
    y_list = []

    for class_idx, mean in enumerate(class_means):
        # Points scattered around the class mean
        X_class = rng.normal(loc=mean, scale=0.5, size=(n_samples, 2))
        # Add some noise to make the classes less separable
        X_class += rng.normal(scale=noise_scale, size=X_class.shape)
        X_list.append(X_class)
        y_list.append(np.full(n_samples, class_idx))

    # Combine all classes
    X = np.vstack(X_list)
    y = np.concatenate(y_list)

    # Target correlation structure (unit diagonal, rho off-diagonal). It only
    # fixes the correlation; the actual feature variances are not 1.
    cov_matrix = np.array([
        [1.0, correlation],
        [correlation, 1.0]
    ])

    # Lower-triangular factor with chol @ chol.T == cov_matrix
    chol = np.linalg.cholesky(cov_matrix)

    # Mix the features: x1 stays, x2 becomes rho * x1 + sqrt(1 - rho^2) * x2
    X_correlated = X @ chol.T

    return X_correlated, y

### Synthetic Data for Support Vector Machine (Linear Kernel)

In [9]:
def synthetic_dataset_svm(n_samples=500, correlation=0.5, noise_scale=0.3, random_state=None):
    """Generate a synthetic dataset for Support Vector Machine (Linear Kernel).

    Four Gaussian blobs are generated around the corners of a square
    (means at (+/-2, +/-2)) and the whole point cloud is then multiplied by
    the Cholesky factor of ``[[1, rho], [rho, 1]]``, which gives the two
    features a Pearson correlation close to ``correlation``. The transform is
    applied to the class centres too, so they are sheared along with the
    noise.

    Description:
    - No feature independence assumption
    - Linear separability
    - A linear SVM does not handle overlapping classes well, so the default
      noise scale is kept small (0.3).

    Parameters
    ----------
    n_samples : int
        Number of points generated *per class* (the result has
        ``4 * n_samples`` rows).
    correlation : float
        Target correlation (rho) between the two features; must lie strictly
        between -1 and 1.
    noise_scale : float
        Standard deviation of the extra noise added to every point before
        the correlation transform. Larger values make classes overlap more.
    random_state : None, int, or object with a ``normal`` method
        ``None`` (default) draws from NumPy's global random state, so an
        earlier ``np.random.seed(...)`` call still controls the output.
        An int seeds a private ``np.random.RandomState``. An existing
        ``RandomState`` / ``Generator`` instance is used as-is.

    Returns
    -------
    X_correlated : ndarray of shape (4 * n_samples, 2)
        Correlated feature matrix, rows grouped by class in label order.
    y : ndarray of shape (4 * n_samples,)
        Integer class labels 0..3.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``correlation`` is not strictly between -1 and 1 (the matrix
        would not be positive definite, so no Cholesky factor exists).
    """
    # Fail early with a readable message; same exception type that
    # np.linalg.cholesky raises for a non-positive-definite matrix.
    if not -1 < correlation < 1:
        raise np.linalg.LinAlgError(
            f"correlation must lie strictly between -1 and 1, got {correlation!r}"
        )

    # Resolve the random source. Falling back to the np.random module keeps
    # the exact draw sequence of the global-state implementation.
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, "normal"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Class means (arranged in a square)
    class_means = np.array([
        [2, 2],     # Class 0: top-right
        [-2, 2],    # Class 1: top-left
        [-2, -2],   # Class 2: bottom-left
        [2, -2]     # Class 3: bottom-right
    ])

    X_list = []
    y_list = []

    for class_idx, mean in enumerate(class_means):
        # Points scattered around the class mean
        X_class = rng.normal(loc=mean, scale=0.5, size=(n_samples, 2))
        # Add some noise to make the classes less separable
        X_class += rng.normal(scale=noise_scale, size=X_class.shape)
        X_list.append(X_class)
        y_list.append(np.full(n_samples, class_idx))

    # Combine all classes
    X = np.vstack(X_list)
    y = np.concatenate(y_list)

    # Target correlation structure (unit diagonal, rho off-diagonal). It only
    # fixes the correlation; the actual feature variances are not 1.
    cov_matrix = np.array([
        [1.0, correlation],
        [correlation, 1.0]
    ])

    # Lower-triangular factor with chol @ chol.T == cov_matrix
    chol = np.linalg.cholesky(cov_matrix)

    # Mix the features: x1 stays, x2 becomes rho * x1 + sqrt(1 - rho^2) * x2
    X_correlated = X @ chol.T

    return X_correlated, y

## Drawing data

In [13]:
# Requires the cell defining synthetic_dataset_nb (and draw_plot) to have been
# run first in this kernel; otherwise this cell fails with a NameError.
# Generate the Naive Bayes dataset with 200 points per class.
X,y = synthetic_dataset_nb(n_samples=200)
# Pearson correlation between the two feature columns over the whole dataset
print("Correlation:", np.corrcoef(X[:,0], X[:,1])[0,1])
# Scatter plot of the two features, coloured by class label
draw_plot(X,y,"Synthetic dataset for Naive Bayes")


NameError: name 'synthetic_dataset_nb' is not defined

In [12]:
# Generate the Logistic Regression dataset: 200 points per class, negatively
# correlated features (rho = -0.4) and a larger noise scale (0.7).
# Note: this overwrites the notebook-level X and y.
X,y = synthetic_dataset_lr(n_samples=200, correlation=-0.4, noise_scale=0.7)
# Empirical Pearson correlation between the two feature columns
correlation = np.corrcoef(X[:,0], X[:,1])[0,1]
print("Correlation:", correlation)
# Scatter plot of the two features, coloured by class label
draw_plot(X,y,"Synthetic dataset for Logistic Regression")


Correlation: -0.39235141941328955


In [11]:
# Generate the linear-SVM dataset: 200 points per class with strongly
# correlated features (rho = 0.8).
# Note: this overwrites the notebook-level X and y.
X, y = synthetic_dataset_svm(n_samples=200, correlation=0.8)
# Empirical Pearson correlation between the two feature columns
correlation = np.corrcoef(X[:,0], X[:,1])[0,1]
print("Correlation:", correlation)
# Scatter plot of the two features, coloured by class label
draw_plot(X,y,"Synthetic dataset for SVM (Linear Kernel)")

Correlation: 0.7963796782540807
