In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.datasets import make_blobs
import numpy as np

  from . import _distributor_init


In [8]:
#make syntetic data

# Parameters for data generation
n_samples = 100  # Number of data points
n_features = 768  # Dimensionality of each data point
centers = 3       # Number of clusters
cluster_std = 5.0 # Standard deviation to add noise to the clusters
noise_level = 30 # Adjust this to control the amount of noise

# Generate synthetic data
X, y = make_blobs(n_samples=n_samples, 
                  n_features=n_features, 
                  centers=centers, 
                  cluster_std=cluster_std, 
                  random_state=42)

# Adding noise to the data
noise = np.random.normal(0, noise_level, X.shape)
X_noisy = X + noise


In [9]:

# Define the neural network architecture
class TreeClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TreeClassifier, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.softmax(self.layer2(x))
        return x

# Custom dataset class
class TreeDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# Function to train the model
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    for embeddings, labels in train_loader:
        embeddings, labels = embeddings.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Function to evaluate the model
def evaluate_model(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for embeddings, labels in test_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

# Main function to run the training and evaluation
def main():
    # Set random seed for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)

    # Hyperparameters
    input_dim = 768  # Dimension of your embedding space
    hidden_dim = 64  # Number of neurons in the hidden layer
    output_dim = 3   # Number of classes (good tree, ill tree, garbage)
    learning_rate = 0.001
    batch_size = 16
    num_epochs = 50
    n_splits = 5  # Number of folds for cross-validation

    # Assuming you have your data in numpy arrays
    # embeddings: shape (n_samples, 768)
    # labels: shape (n_samples,) with values 0 (good tree), 1 (ill tree), 2 (garbage)
    embeddings = X_noisy  # Replace with your actual embeddings
    labels = y   # Replace with your actual labels

    # Convert data to PyTorch tensors
    embeddings_tensor = torch.FloatTensor(embeddings)
    labels_tensor = torch.LongTensor(labels)

    # Prepare cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Cross-validation loop
    fold_accuracies = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(embeddings_tensor)):
        print(f"Fold {fold + 1}/{n_splits}")

        # Create train and validation datasets
        train_dataset = TreeDataset(embeddings_tensor[train_idx], labels_tensor[train_idx])
        val_dataset = TreeDataset(embeddings_tensor[val_idx], labels_tensor[val_idx])

        # Create data loaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Initialize the model, loss function, and optimizer
        model = TreeClassifier(input_dim, hidden_dim, output_dim).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Training loop
        for epoch in range(num_epochs):
            train_model(model, train_loader, criterion, optimizer, device)

        # Evaluate the model
        accuracy = evaluate_model(model, val_loader, device)
        fold_accuracies.append(accuracy)
        print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")

    # Print final results
    print(f"Cross-validation complete. Average accuracy: {np.mean(fold_accuracies):.4f} (+/- {np.std(fold_accuracies):.4f})")

if __name__ == "__main__":
    main()

Fold 1/5
Fold 1 Accuracy: 0.9000
Fold 2/5
Fold 2 Accuracy: 0.9000
Fold 3/5
Fold 3 Accuracy: 0.7500
Fold 4/5
Fold 4 Accuracy: 0.9500
Fold 5/5
Fold 5 Accuracy: 0.8000
Cross-validation complete. Average accuracy: 0.8600 (+/- 0.0735)
