In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import classification_report, confusion_matrix
from fastai.tabular.all import *
from sklearn.model_selection import train_test_split

# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("moonDataset.csv")


In [None]:
# Summarise the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Show the column labels as loaded from the CSV (before the rename further down).
df.columns

In [None]:
# Peek at 10 random rows. The fixed random_state keeps the displayed rows the
# same on every "Restart & Run All" instead of changing with each execution.
df.sample(10, random_state=42)

In [None]:
# Count the rows for each distinct value of the 'label' column.
df['label'].value_counts()

In [None]:
# Flag every row that exactly repeats an earlier row, then count the flags.
is_repeated_row = df.duplicated()
duplicated_rows_count = is_repeated_row.sum()
print('Duplicated rows (row by row):', duplicated_rows_count)

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Swap the terse CSV headers for the descriptive names the rest of the notebook uses.
readable_column_names = dict(
    X1="First moons feature",
    X2="Second moons feature",
    X3="Vertical displacement",
    label="class",
)
df = df.rename(columns=readable_column_names)

In [None]:
# Same descriptive statistics as before, now shown under the renamed columns.
df.describe()

In [None]:
# Show readable class labels ("No"/"Yes") in legends and on axes.
# The mapping is guarded on dtype: once the column is textual, mapping it again
# with {0: ..., 1: ...} would turn every value into NaN, so a re-run of this
# cell must leave the column alone.
if pd.api.types.is_numeric_dtype(df["class"]):
    df["class"] = df["class"].map({0: "No", 1: "Yes"})
custom_palette = {"Yes": "salmon", "No": "skyblue"}

num_cols = [
    "First moons feature",
    "Second moons feature",
    "Vertical displacement"
]
cat_col = "class"

# One row per numeric feature, four panels per row:
#   overall histogram | overall boxplot | histogram by class | boxplot by class.
# squeeze=False keeps `axes` two-dimensional even if num_cols has a single entry,
# so the axes[i, j] indexing below always works.
fig, axes = plt.subplots(
    len(num_cols), 4, figsize=(28, 12 * len(num_cols)), squeeze=False
)

for i, col in enumerate(num_cols):
    col_mean = df[col].mean()

    # Panel 0: overall distribution with the overall mean marked.
    sns.histplot(df[col], kde=True, ax=axes[i, 0], bins=10, color="lightblue")
    axes[i, 0].axvline(
        col_mean,
        color="red",
        linestyle="--",
        label=f"Mean {col}: {col_mean:.2f}",
    )
    axes[i, 0].legend()
    axes[i, 0].set_title(f"Histogram of {col} with Mean")

    # Panel 1: overall boxplot.
    sns.boxplot(x=df[col], ax=axes[i, 1], color="lightblue")
    axes[i, 1].set_title(f"Boxplot of {col}")

    # Panel 2: distribution split by class, with one mean line per class.
    sns.histplot(
        data=df,
        x=col,
        hue=cat_col,
        kde=True,
        ax=axes[i, 2],
        bins=10,
        palette=custom_palette,
    )

    for grp, color in zip(["No", "Yes"], ["blue", "red"]):
        grp_values = df.loc[df[cat_col] == grp, col]
        if grp_values.empty:
            continue  # class not present in the data: nothing to mark
        grp_mean = grp_values.mean()
        axes[i, 2].axvline(
            grp_mean,
            color=color,
            linestyle="--",
            label=f"Mean {col} ({grp}): {grp_mean:.2f}",
        )
    axes[i, 2].legend()
    axes[i, 2].set_title(f"Histogram of {col} by {cat_col}")

    # Panel 3: boxplot per class.
    sns.boxplot(x=cat_col, y=col, hue=cat_col, data=df, ax=axes[i, 3], palette=custom_palette, legend=False)
    axes[i, 3].set_title(f"Boxplot of {col} by {cat_col}")

plt.tight_layout()
plt.show()

In [None]:
# Restore the numeric 0/1 target needed by df.corr() and by the models below.
# Guarded on dtype: if the column is already numeric (e.g. this cell is run a
# second time), mapping it with {"No": ..., "Yes": ...} would yield all-NaN.
if not pd.api.types.is_numeric_dtype(df["class"]):
    df["class"] = df["class"].map({"No": 0, "Yes": 1})
correlation_matrix = df.corr()

plt.figure(figsize=(12, 6))
num_features = len(correlation_matrix)
# Shrink annotation/tick text as the matrix grows, but never below 5pt.
font_size = max(5, 40 // num_features)

sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    cbar=True,
    annot_kws={"size": font_size},
)

plt.title("Correlation Matrix", fontsize=20)
plt.xticks(fontsize=font_size)
plt.yticks(fontsize=font_size)
plt.show()

In [None]:
# Separate the predictors from the target column.
target_column = "class"
y = df[target_column]
X = df.drop(columns=[target_column])

# Both stages share the same seed so the split is repeatable.
split_options = {"random_state": 42}

# Stage 1: keep 60% of the rows for training, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, train_size=0.6, stratify=y, **split_options
)

# Stage 2: halve the remaining rows into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, **split_options
)

In [None]:
class TabularDataset(Dataset):
    """In-memory dataset pairing float32 feature rows with int64 labels.

    ``X`` and ``y`` may be objects exposing a ``.values`` attribute (such as
    pandas DataFrame/Series) or anything ``torch.tensor`` accepts directly.
    """

    def __init__(self, X, y):
        # Unwrap pandas-style containers; pass everything else through untouched.
        raw_features = getattr(X, 'values', X)
        raw_labels = getattr(y, 'values', y)
        self.X = torch.tensor(raw_features, dtype=torch.float32)
        self.y = torch.tensor(raw_labels, dtype=torch.long)

    def __len__(self):
        """Number of samples (length of the first dimension of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label

# Wrap each split in a TabularDataset.
train_dataset, val_dataset, test_dataset = (
    TabularDataset(split_X, split_y)
    for split_X, split_y in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Only the training loader shuffles; validation and test keep their row order.
loader_batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=loader_batch_size)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size)

In [None]:
class ClassificationModel(nn.Module):
    """Feed-forward classifier producing ``output_dim`` raw logits per row.

    Layout: Linear(input_dim, 64) -> ReLU -> BatchNorm1d -> Dropout(0.3)
            -> Linear(64, 32) -> ReLU -> Dropout(0.2) -> Linear(32, output_dim).
    No softmax is applied; the output is meant for a logit-based loss.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        first_hidden = [
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),
        ]
        second_hidden = [
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]
        output_head = [nn.Linear(32, output_dim)]
        # Layers are registered in this exact order so state_dict keys stay net.0 ... net.7.
        self.net = nn.Sequential(*first_hidden, *second_hidden, *output_head)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the logits."""
        logits = self.net(x)
        return logits

In [None]:
def train_model(model, train_loader, val_loader, epochs=20, lr=0.001):
    """Train ``model`` with Adam and cross-entropy, printing one progress line per epoch.

    Args:
        model: module mapping a feature batch to per-class logits.
        train_loader: iterable of ``(X_batch, y_batch)`` pairs used for the updates.
        val_loader: iterable of ``(X_batch, y_batch)`` pairs passed to ``evaluate``
            after each epoch.
        epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        None. The model is updated in place.

    The printed loss is the mean per-sample training loss of the epoch. Summing
    the per-batch means instead would make the number grow with the number of
    batches and hide the real loss level.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # evaluate() switches the model to eval mode, so re-enable training
        # behaviour (dropout, batch-norm updates) at the start of every epoch.
        model.train()
        loss_sum = 0.0
        samples_seen = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so a
            # smaller final batch does not skew the epoch average.
            batch_size = y_batch.size(0)
            loss_sum += loss.item() * batch_size
            samples_seen += batch_size

        # An empty loader reports NaN rather than raising ZeroDivisionError.
        epoch_loss = loss_sum / samples_seen if samples_seen else float("nan")
        val_accuracy = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f} - Val Acc: {val_accuracy:.4f}")

def evaluate(model, loader):
    """Return the accuracy of ``model`` over every batch yielded by ``loader``.

    The model is switched to eval mode and left there; gradients are disabled
    while predicting. The predicted class is the index of the largest logit.
    """
    model.eval()
    targets, guesses = [], []

    with torch.no_grad():
        for X_batch, y_batch in loader:
            predicted = model(X_batch).max(dim=1).indices
            targets += y_batch.tolist()
            guesses += predicted.tolist()

    return accuracy_score(targets, guesses)

In [None]:
# Seed PyTorch's global RNG so weight initialisation, dropout masks and the
# training loader's shuffle order (drawn from the global RNG because the loader
# was built without its own generator) repeat on every "Restart & Run All".
torch.manual_seed(42)

input_dim = X_train.shape[1]              # number of feature columns
output_dim = len(np.unique(y_train))      # one logit per distinct class label

model = ClassificationModel(input_dim, output_dim)
train_model(model, train_loader, val_loader, epochs=30, lr=0.001)

# Final check on the held-out test split. The raw labels and predictions are
# collected here (rather than only an accuracy value) because the
# classification report below needs both lists.
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(y_batch.tolist())
        y_pred.extend(predicted.tolist())

print("Test Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:")
print(classification_report(y_true, y_pred))

In [None]:
# NumPy versions of the test split (objects exposing `.values` are unwrapped).
X_test_np = getattr(X_test, 'values', X_test)
y_test_np = np.array(y_test) if not isinstance(y_test, np.ndarray) else y_test

# Plot range for the first two features, padded slightly beyond the data.
margin = 0.1
first_feature = X_test_np[:, 0]
second_feature = X_test_np[:, 1]
f0_min, f0_max = first_feature.min() - margin, first_feature.max() + margin
f1_min, f1_max = second_feature.min() - margin, second_feature.max() + margin

# 300 x 300 lattice over that range.
f0_ticks = np.linspace(f0_min, f0_max, 300)
f1_ticks = np.linspace(f1_min, f1_max, 300)
xx, yy = np.meshgrid(f0_ticks, f1_ticks)

# The third feature is held constant at its test-set mean for every grid point.
fixed_feature2 = np.mean(X_test_np[:, 2])

flat_xx = xx.ravel()
flat_yy = yy.ravel()
grid_input = np.column_stack(
    [flat_xx, flat_yy, np.full(flat_xx.shape, fixed_feature2)]
)

In [None]:
# Classify every grid point with the trained network (inference only, no gradients).
model.eval()
with torch.no_grad():
    grid_tensor = torch.tensor(grid_input, dtype=torch.float32)
    outputs = model(grid_tensor)
    predictions = outputs.max(dim=1).indices
    # Back to the lattice shape so the result can be drawn as filled contours.
    Z = predictions.numpy().reshape(xx.shape)

In [None]:
# Predicted regions as a filled contour, with the true test points drawn on top.
boundary_fig, boundary_ax = plt.subplots(figsize=(10, 8))

boundary_ax.contourf(xx, yy, Z, cmap='Pastel1', alpha=0.6)

scatter = boundary_ax.scatter(
    X_test_np[:, 0],
    X_test_np[:, 1],
    c=y_test_np,
    cmap='tab10',
    edgecolor='k',
)
boundary_ax.set_xlabel("Feature 0")
boundary_ax.set_ylabel("Feature 1")
boundary_ax.set_title("Decision Boundaries (Feature 2 fixed at mean)")

# One legend entry per distinct class value in the scatter colours.
legend_handles, legend_labels = scatter.legend_elements()
boundary_ax.legend(legend_handles, legend_labels, title="True Class")
boundary_ax.grid(True)
plt.show()

In [None]:
# Re-check the column labels (now the renamed ones) before naming them in the fastai section.
df.columns

In [None]:
# Seed the Python, NumPy and PyTorch RNGs through fastai's helper so batch
# shuffling, weight initialisation and dropout repeat on every run.
set_seed(42)

features = ['First moons feature', 'Second moons feature', 'Vertical displacement']
target = 'class'

# 80/20 stratified split into training and validation rows.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[target])

# The two parts are stacked with the training rows first, and valid_idx names
# the trailing positions of the stacked frame as the validation set.
n_train, n_valid = len(train_df), len(valid_df)
dls = TabularDataLoaders.from_df(
    df=pd.concat([train_df, valid_df]),
    path='.',
    procs=[Normalize],
    cat_names=[],
    cont_names=features,
    y_names=target,
    y_block=CategoryBlock(),
    valid_idx=list(range(n_train, n_train + n_valid)),
    bs=16
)

learn = tabular_learner(
    dls,
    layers=[256, 128, 64, 32],
    config=tabular_config(ps=[0.4, 0.3, 0.2, 0.1]),
    metrics=accuracy
)

# Take the "valley" suggestion of the learning-rate finder as the one-cycle peak rate.
lr_min = learn.lr_find().valley
learn.fit_one_cycle(20, lr_max=lr_min)

# get_preds() is called without a dataloader, i.e. on the learner's default
# (validation) set, so the report below describes validation performance.
preds, targs = learn.get_preds()
pred_labels = preds.argmax(dim=1)

print(confusion_matrix(targs, pred_labels))
print(classification_report(targs, pred_labels))

In [None]:
features = ['First moons feature', 'Second moons feature', 'Vertical displacement']
f0, f1 = features[:2]
# The third feature is held at its overall mean for every grid point.
fixed_f2 = df['Vertical displacement'].mean()

# 300 x 300 lattice over the first two features, padded slightly beyond the data.
margin = 0.1
x_min, x_max = df[f0].min() - margin, df[f0].max() + margin
y_min, y_max = df[f1].min() - margin, df[f1].max() + margin
x_ticks = np.linspace(x_min, x_max, 300)
y_ticks = np.linspace(y_min, y_max, 300)
xx, yy = np.meshgrid(x_ticks, y_ticks)

flat_xx = xx.ravel()
flat_yy = yy.ravel()
grid = np.column_stack([flat_xx, flat_yy, np.full(flat_xx.shape, fixed_f2)])
grid_df = pd.DataFrame(grid, columns=features)

# Run the grid through the learner's own test dataloader, then take the most
# probable class for each point.
test_dl = learn.dls.test_dl(grid_df)
preds, _ = learn.get_preds(dl=test_dl)
Z = preds.argmax(dim=1).reshape(xx.shape)

fastai_fig, fastai_ax = plt.subplots(figsize=(10, 8))
fastai_ax.contourf(xx, yy, Z, cmap='Pastel1', alpha=0.6)

# Every row of df is drawn on top, coloured by its true class.
scatter = fastai_ax.scatter(df[f0], df[f1], c=df['class'], cmap='tab10', edgecolor='k')
fastai_ax.set_xlabel(f0)
fastai_ax.set_ylabel(f1)
fastai_ax.set_title("Decision Boundary (Fastai Learner)")
legend_handles, legend_labels = scatter.legend_elements()
fastai_ax.legend(legend_handles, legend_labels, title="True Class")
fastai_ax.grid(True)
plt.tight_layout()
plt.show()

# PP5/6 Deep Learning on a Non-Linearly Separable Moons Dataset

## Project Overview

This project explores deep learning for binary classification using a **non-linearly separable dataset**. The dataset was designed to simulate a challenging decision boundary and was modeled using **two deep learning frameworks**: **PyTorch** and **fastai**.

The main objective was to train a neural network to distinguish between two classes using three non-linearly correlated features. The task includes full data preprocessing, visualization, neural network implementation and training, evaluation, and visual confirmation of decision boundaries.

---

## Dataset Description

The dataset used is based on the classic `make_moons` structure, extended with a third feature to introduce further vertical displacement. It consists of the following columns:

- **X1**: First moons feature  
- **X2**: Second moons feature  
- **X3**: Vertical displacement (added noise/offset to increase complexity)  
- **label**: Target class (0 or 1), where 1 indicates "Yes" and 0 indicates "No"

### Key Characteristics:

- 3 numerical input features  
- Binary classification target  
- Non-linear decision boundary  
- 1,000+ samples with no duplicates  
- Well-balanced class distribution

The dataset was thoroughly analyzed using boxplots, histograms, and a correlation matrix to understand the relationships between features and to confirm class separability is non-trivial.

---

## Methodology

Two separate deep learning pipelines were built:

### 1. PyTorch
- Manual dataset and dataloader creation
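- Custom `nn.Module` with 2 hidden layers (64 and 32 units, ReLU activations), batch normalization after the first hidden layer, and dropout
- Cross-entropy loss over two class logits (`nn.CrossEntropyLoss`)
- Trained using Adam optimizer
- Model performance monitored via accuracy and a classification report
- 2D decision boundary over the first two features (third feature fixed at its test-set mean) plotted using `matplotlib`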

### 2. fastai
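- Used `TabularDataLoaders.from_df` with the `Normalize` proc to preprocess the data
- Created a `tabular_learner` (hidden layers 256-128-64-32 with dropout) for a `CategoryBlock` target, which trains with fastai's default `CrossEntropyLossFlat`
- Trained with built-in training loop (`fit_one_cycle`) at the learning rate suggested by `lr_find`
- Visualized classification performance and generated predictions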

The two pipelines use different stratified splits:  
- PyTorch: 60% training, 20% validation, 20% testing  
- fastai: 80% training, 20% validation (no separate test set)

---

## Key Findings

- The dataset poses a non-trivial classification challenge due to the curved and overlapping feature space.
- Both models achieved high accuracy (>95%): the PyTorch model on its held-out test set and the fastai model on its validation set.
- PyTorch provided more control and transparency during training, especially for manual inspection of weights and predictions.
- fastai enabled rapid prototyping with minimal code and auto-handled much of the boilerplate preprocessing and training logic.
- Visual decision boundaries showed clear learned separation even in the presence of noise.

---

## Framework Comparison

Between the two frameworks, **fastai** was the most efficient for quick experimentation and high-level abstraction. Its API allowed for compact, readable code and fast training. I especially appreciated how fastai handled data preprocessing, splitting, and training loops with just a few lines of code, making it ideal for rapid iteration and model comparison.

On the other hand, **PyTorch** gave full control over model architecture, weight updates, and custom metrics. It was an excellent framework for learning and debugging deep learning logic from the ground up. It required more code, but it made every training step explicit and transparent.

That said, for real-world tasks where productivity and speed matter, I found **fastai** to be the better choice overall—especially for tabular or structured data classification tasks like this one. However, when deep customization or flexibility is required, **PyTorch** remains unmatched. Each framework has its place, and using both has helped me appreciate their strengths in different scenarios.

---

### Requirements

**requirements.txt** :
```
# Python 3.10.18
pandas == 2.3.1
matplotlib == 3.10.0
seaborn == 0.13.2
numpy == 1.26.4
scikit-learn == 1.7.1
torch == 2.2.2
fastai == 2.7.17
```