# Lab 2

In [None]:
%pip install plotly

In [18]:
import numpy as np
import plotly.graph_objects as go
from sklearn.datasets import make_classification, make_circles, make_moons, make_blobs

## Utility Functions

In [6]:
def draw_plot(X, y, title=""):
    """Show a 2-D scatter plot of the first two columns of ``X``.

    Args:
        X: Array indexed as ``X[:, 0]`` / ``X[:, 1]``; column 0 is drawn on
            the horizontal axis ("X1") and column 1 on the vertical axis ("X2").
        y: Per-point values handed to Plotly as the marker colour.
        title: Text shown as the figure title.
    """
    # Build the trace first so the figure construction stays on one short line.
    points = go.Scatter(
        x=X[:, 0],
        y=X[:, 1],
        mode='markers',
        marker={"color": y},
    )
    layout_options = {
        "title": title,
        "xaxis_title": "X1",
        "yaxis_title": "X2",
        "showlegend": False,
    }

    fig = go.Figure(data=points)
    fig.update_layout(**layout_options)
    fig.show()

## Task 1

## Task 2

### Synthetic Data for Naive Bayes

In [8]:
def synthetic_dataset_nb(n_samples=500):
    """
    Build a four-class, two-feature synthetic dataset for Naive Bayes.
    Description:
    - One point cloud per class, centred on a corner of the square (+-2, +-2)
    - Both features are drawn independently of each other inside a class
    - No redundant features

    Returns:
    - X: array of shape (4 * n_samples, 2), classes stacked in label order
    - y: integer labels 0..3, shape (4 * n_samples,)
    """
    # Corner k of the square is the centre of class k.
    corners = np.array([
        [2, 2],     # Class 0: top-right
        [-2, 2],    # Class 1: top-left
        [-2, -2],   # Class 2: bottom-left
        [2, -2]     # Class 3: bottom-right
    ])
    cloud_shape = (n_samples, 2)

    clouds = []
    for corner in corners:
        # Base cloud first, jitter second: the draw order fixes the random stream.
        base = np.random.normal(loc=corner, scale=0.5, size=cloud_shape)
        # The extra zero-mean jitter widens each cloud (more spread, not less).
        jitter = np.random.normal(scale=0.5, size=cloud_shape)
        clouds.append(base + jitter)

    X = np.vstack(clouds)
    # Labels follow the stacking order: n_samples zeros, then ones, ...
    y = np.repeat(np.arange(len(corners)), n_samples)
    return X, y
    

### Synthetic Data for Logistic Regression

In [None]:
def synthetic_dataset_lr(n_samples=500, correlation=0.3, noise_scale=0.5):
    """Build a four-class, two-feature synthetic dataset for Logistic Regression.
    Description:
    - One point cloud per class around the corners of the square (+-2, +-2)
    - The two features are mixed linearly afterwards, so they are not independent

    Parameters:
    - n_samples: number of points generated per class
    - correlation: off-diagonal entry of the 2x2 matrix whose Cholesky factor
      is used to mix the features (np.linalg.cholesky raises LinAlgError when
      that matrix is not positive definite)
    - noise_scale: standard deviation of the extra jitter added to every point

    Returns:
    - X: array of shape (4 * n_samples, 2) after the linear mixing
    - y: integer labels 0..3, shape (4 * n_samples,)
    """
    centers = np.array([
        [2, 2],     # Class 0: top-right
        [-2, 2],    # Class 1: top-left
        [-2, -2],   # Class 2: bottom-left
        [2, -2]     # Class 3: bottom-right
    ])
    block_shape = (n_samples, 2)

    # Per class: base cloud (std 0.5) plus jitter (std noise_scale), drawn in
    # that order; the classes are then stacked in label order.
    X = np.vstack([
        np.random.normal(loc=center, scale=0.5, size=block_shape)
        + np.random.normal(scale=noise_scale, size=block_shape)
        for center in centers
    ])
    y = np.repeat(np.arange(len(centers)), n_samples)

    # Lower-triangular factor of [[1, rho], [rho, 1]].
    target = np.array([
        [1.0, correlation],
        [correlation, 1.0]
    ])
    mixing = np.linalg.cholesky(target)

    # The mixing is applied to the full points, so the class centres are
    # moved by the same linear map as the noise.
    return X @ mixing.T, y

### Synthetic Data for Support Vector Machine (Linear Kernel)

In [None]:
def synthetic_dataset_svm(n_samples=500, correlation=0.5, noise_scale=0.3):
    """Generate a synthetic dataset for Support Vector Machine (Linear Kernel).
    Description:
    - No feature independence assumption (features are mixed linearly)
    - One point cloud per class around the corners of the square (+-2, +-2)
    - A linear SVM does not handle overlapping classes well, so the default
      noise scale is kept small.

    Parameters:
    - n_samples: number of points generated per class
    - correlation: off-diagonal entry of the 2x2 matrix whose Cholesky factor
      mixes the two features (np.linalg.cholesky raises LinAlgError when that
      matrix is not positive definite)
    - noise_scale: standard deviation of the extra jitter added to each point;
      the default 0.3 is the value that used to be hard-coded

    Returns:
    - X: array of shape (4 * n_samples, 2) after the linear mixing
    - y: integer labels 0..3, shape (4 * n_samples,)
    """

    # Create class means (arranged in a square)
    class_means = np.array([
        [2, 2],     # Class 0: top-right
        [-2, 2],    # Class 1: top-left
        [-2, -2],   # Class 2: bottom-left
        [2, -2]     # Class 3: bottom-right
    ])

    # Create data for each class
    X_list = []
    y_list = []

    for class_idx, mean in enumerate(class_means):
        # Generate points around each class mean with small variance
        X_class = np.random.normal(loc=mean, scale=0.5, size=(n_samples, 2))
        # Extra jitter widens the cloud; larger noise_scale means more overlap
        X_class += np.random.normal(scale=noise_scale, size=X_class.shape)
        X_list.append(X_class)
        y_list.append(np.full(n_samples, class_idx))

    # Combine all classes (stacked in label order)
    X = np.vstack(X_list)
    y = np.concatenate(y_list)

    # Matrix whose Cholesky factor is used as the mixing transform
    cov_matrix = np.array([
        [1.0, correlation],
        [correlation, 1.0]
    ])

    # Lower-triangular factor L with L @ L.T == cov_matrix
    L = np.linalg.cholesky(cov_matrix)

    # The transform acts on the full points, so class centres move as well
    X_correlated = X @ L.T

    return X_correlated, y

### Synthetic Data for Decision Trees

In [None]:
def synthetic_dataset_dt(n: int = 600, noise: float = 0.2) -> tuple[np.ndarray, np.ndarray]:
    """
    Generate a synthetic dataset for Decision Trees.
    Description:
    Four Gaussian blobs, one per quadrant, centred at (+-1.5, +-1.5).
    The centres are listed in an XOR-like order (the two diagonal pairs), but
    each blob receives its own label 0..3 rather than a paired binary label.

    Parameters:
    - n: total number of points; when n is not a multiple of 4 the leftover
      points go to the first classes so that exactly n points are returned
    - noise: standard deviation of every blob (same for both features)

    Returns:
    - X: array of shape (n, 2), blobs stacked in label order
    - y: integer labels 0..3, shape (n,)
    """
    # Blob centres; index in this list is the class label
    class_means = [(-1.5, -1.5), (1.5, 1.5), (-1.5, 1.5), (1.5, -1.5)]

    # Split n as evenly as possible: the first `extra` classes get one more point
    per_class, extra = divmod(n, len(class_means))

    data = []
    labels = []
    for i, center in enumerate(class_means):
        count = per_class + (1 if i < extra else 0)
        # generate 2D points around this centre
        data.append(np.random.normal(loc=center, scale=noise, size=(count, 2)))
        labels.append(np.full(count, i))  # class 0..3

    X = np.vstack(data)
    y = np.hstack(labels)
    return X, y

### Synthetic Data for Decision Rules

In [15]:
def synthetic_dataset_dr(n: int = 600, noise: float = 0.05) -> tuple[np.ndarray, np.ndarray]:
    """
    Generate a synthetic dataset for Decision Rules.
    Axis-aligned rectangular rules:
      - Class 0: x1 <= 0 and x2 <= 0
      - Class 1: x1 > 0 and x2 <= 0
      - Class 2: x1 <= 0 and x2 > 0
      - Class 3: x1 > 0 and x2 > 0
    Shallow trees (decision rules) capture rectangles cleanly.

    Parameters:
      - n: number of points to generate
      - noise: fraction of points whose label is redrawn at random

    Returns:
      - X: array of shape (n, 2), uniform on [-2, 2) in both features
      - y: integer labels 0..3, shape (n,)
    """
    # Uniform points in 2D space
    X = np.random.uniform(low=-2.0, high=2.0, size=(n, 2))

    # Quadrant label: +1 for the right half-plane, +2 for the upper half-plane
    right = X[:, 0] > 0
    upper = X[:, 1] > 0
    y = right.astype(int) + 2 * upper.astype(int)

    # Label noise: pick int(noise * n) distinct points and redraw their labels
    # uniformly from 0..3 (a redrawn label may equal the original one)
    idx = np.random.choice(n, size=int(noise * n), replace=False)
    y[idx] = np.random.randint(0, 4, size=len(idx))

    return X, y

### Synthetic Data for Nearest Neighbor

In [None]:
def synthetic_dataset_knn(n: int = 600, std: float = 0.6, random_state: int = 42) -> tuple[np.ndarray, np.ndarray]:
    """
    Synthetic dataset intended to favor kNN.
    - Structure:
    Eight blobs are generated and merged pairwise, so each of the four
    classes is made of two neighbouring blobs.
    - Rationale (author's intent, not verified here):
    class membership depends on local clusters, which kNN can exploit, while
    the moderate overlap between blobs makes margins fuzzy for an SVM.

    Parameters:
    - n: total number of points, passed to make_blobs as n_samples
    - std: standard deviation of every blob (make_blobs cluster_std)
    - random_state: seed forwarded to make_blobs for reproducible output

    Returns:
    - X: feature array produced by make_blobs (two columns, one per coordinate)
    - y: integer class labels 0..3, one per row of X
    """
    centers = [
        (-3, -2), (-2, -3),   # class 0 (two nearby blobs)
        ( 3, -2), ( 2, -3),   # class 1
        (-3,  2), (-2,  3),   # class 2
        ( 3,  2), ( 2,  3)    # class 3
    ]

    # Generate 8 blobs total; blob index k corresponds to centers[k]
    X, y_blobs = make_blobs(n_samples=n, centers=centers, cluster_std=std, random_state=random_state)

    # Consecutive blob pairs (0,1), (2,3), (4,5), (6,7) share a class, so the
    # class label is simply the blob index divided by two.
    y = y_blobs // 2

    return X, y

## Drawing data

In [None]:
# Naive Bayes dataset: 200 points per class (800 in total).
X,y = synthetic_dataset_nb(n_samples=200)
# Pearson correlation between the two feature columns, pooled over all classes
print("Correlation:", np.corrcoef(X[:,0], X[:,1])[0,1])
# Plotting the data
draw_plot(X,y,"Synthetic dataset for Naive Bayes")


In [None]:
# Logistic Regression dataset: 200 points per class, negative feature mixing
# (correlation=-0.4) and a jitter standard deviation of 0.7.
X,y = synthetic_dataset_lr(n_samples=200, correlation=-0.4, noise_scale=0.7)
# Pearson correlation between the two feature columns, pooled over all classes
correlation = np.corrcoef(X[:,0], X[:,1])[0,1]
print("Correlation:", correlation)
draw_plot(X,y,"Synthetic dataset for Logistic Regression")


In [None]:
# Linear-kernel SVM dataset: 200 points per class with strong positive
# feature mixing (correlation=0.8).
X, y = synthetic_dataset_svm(n_samples=200, correlation=0.8)
# Pearson correlation between the two feature columns, pooled over all classes
correlation = np.corrcoef(X[:,0], X[:,1])[0,1]
print("Correlation:", correlation)
draw_plot(X,y,"Synthetic dataset for SVM (Linear Kernel)")

In [None]:
# Decision Tree dataset: 800 points in total; noise=0.7 is the standard
# deviation of each quadrant blob.
X, y = synthetic_dataset_dt(n=800, noise=0.7)
# Pearson correlation between the two feature columns, pooled over all classes
correlation = np.corrcoef(X[:,0], X[:,1])[0,1]
print("Correlation:", correlation)
draw_plot(X,y,"Synthetic dataset for Decision Tree")

In [None]:
# Decision Rules dataset: 800 uniform points; noise=0.01 is the fraction of
# points whose label is redrawn at random.
X, y = synthetic_dataset_dr(n=800, noise=0.01)
# Pearson correlation between the two feature columns, pooled over all classes
correlation = np.corrcoef(X[:,0], X[:,1])[0,1]
print("Correlation:", correlation)
draw_plot(X,y,"Synthetic dataset for Decision Rules")

In [22]:
# Nearest Neighbor dataset: 800 points in total, blob standard deviation 0.8,
# fixed seed so the cell output is reproducible.
X, y = synthetic_dataset_knn(n=800, std=0.8, random_state=42)
# Pearson correlation between the two feature columns, pooled over all classes
correlation = np.corrcoef(X[:,0], X[:,1])[0,1]
print("Correlation:", correlation)
draw_plot(X,y,"Synthetic dataset for Nearest Neighbor")

Correlation: 0.013106208642306693
