In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.datasets import make_blobs
import numpy as np
import os

In [2]:
# Make synthetic data

# Parameters for data generation
n_samples = 100    # Number of data points
n_features = 768   # Dimensionality of each data point
centers = 3        # Number of clusters
cluster_std = 5.0  # Standard deviation of the points around each cluster center
noise_level = 30   # Standard deviation of the extra Gaussian noise added below
noise_seed = 42    # Seed of the noise generator, so every run produces the same X_noisy

# Generate synthetic data: X has shape (n_samples, n_features), y holds the cluster index
X, y = make_blobs(n_samples=n_samples,
                  n_features=n_features,
                  centers=centers,
                  cluster_std=cluster_std,
                  random_state=42)

# Adding noise to the data.
# A dedicated seeded generator is used instead of the global NumPy RNG: this cell
# runs before any np.random.seed(...) call, so drawing from the global state would
# give a different data set after every kernel restart.
noise_rng = np.random.default_rng(noise_seed)
noise = noise_rng.normal(0, noise_level, X.shape)
X_noisy = X + noise


In [5]:
# Define the neural network architecture
class TreeClassifier(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The network is Linear -> ReLU -> Linear -> Softmax. Because the last step
    is a softmax over dimension 1, ``forward`` returns per-class probabilities
    (each row sums to 1), not raw scores. Loss functions that expect
    unnormalized scores should therefore not be fed this output directly.

    Args:
        input_dim: Size of each input vector.
        hidden_dim: Number of units in the hidden layer.
        output_dim: Number of classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # The attribute names below become the keys of the state_dict
        # ("layer1.weight", ...), so they must stay stable for saved weights.
        self.layer1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.layer2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Map a 2-D batch ``x`` to class probabilities, one row per sample."""
        hidden = self.layer1(x)
        activated = self.relu(hidden)
        scores = self.layer2(activated)
        return self.softmax(scores)

# Custom dataset class
class TreeDataset(Dataset):
    """Dataset that pairs the i-th embedding with the i-th label.

    Args:
        embeddings: Indexable container of input vectors.
        labels: Indexable container of targets; its length defines the
            length of the dataset.
    """

    def __init__(self, embeddings, labels):
        self.embeddings, self.labels = embeddings, labels

    def __len__(self):
        # The number of samples is taken from the label container.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` tuple stored at position ``idx``."""
        sample = self.embeddings[idx]
        target = self.labels[idx]
        return sample, target

# Function to train the model
def train_model(model, train_loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``train_loader``.

    The model is switched to training mode, then for every batch the gradients
    are reset, the loss is computed with ``criterion(outputs, labels)``,
    back-propagated, and one optimizer step is taken. Nothing is returned;
    the model and the optimizer are updated in place.

    Args:
        model: Module to train.
        train_loader: Iterable yielding ``(embeddings, labels)`` batches.
        criterion: Callable taking ``(outputs, labels)`` and returning the loss.
        optimizer: Optimizer bound to the parameters of ``model``.
        device: Device the batches are moved to before the forward pass.
    """
    model.train()
    for batch in train_loader:
        inputs, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

# Function to evaluate the model
def evaluate_model(model, test_loader, device):
    """Compute the classification accuracy of ``model`` on ``test_loader``.

    The model is switched to evaluation mode and run without gradient
    tracking. For each batch the predicted class is the index of the largest
    output along dimension 1.

    Args:
        model: Module whose outputs have one column per class.
        test_loader: Iterable yielding ``(embeddings, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        float: Fraction of samples whose predicted class equals the label,
        or ``0.0`` when the loader yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for embeddings, labels in test_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            # Gradients are already disabled here, so the legacy ``.data``
            # accessor is not needed to detach the outputs.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        # An empty loader would otherwise raise ZeroDivisionError.
        return 0.0
    return correct / total

class TreeClassifierInterface:
    """High-level wrapper around ``TreeClassifier``: train, save, load, predict.

    Args:
        input_dim (int): Size of each input embedding.
        hidden_dim (int): Number of hidden units of the classifier.
        output_dim (int): Number of classes.
    """

    def __init__(self, input_dim=768, hidden_dim=64, output_dim=3):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = TreeClassifier(input_dim, hidden_dim, output_dim).to(self.device)
        # Human-readable names, indexed by predicted class id.
        self.class_names = ['good tree', 'ill tree', 'garbage']

    @staticmethod
    def _probability_nll(probabilities, targets):
        """Negative log-likelihood loss for outputs that are already probabilities.

        The classifier ends in a softmax, so its outputs must not be passed to
        ``nn.CrossEntropyLoss`` (which applies log-softmax itself and expects
        unnormalized scores). Taking the log of the probabilities and using
        the NLL loss gives the intended cross-entropy.

        Args:
            probabilities (torch.Tensor): Per-class probabilities, one row per sample.
            targets (torch.Tensor): Integer class index of each sample.

        Returns:
            torch.Tensor: Scalar mean loss over the batch.
        """
        # Clamp away exact zeros so the logarithm stays finite.
        log_probabilities = torch.log(probabilities.clamp_min(1e-12))
        return nn.functional.nll_loss(log_probabilities, targets)

    def train_and_save_model(self, embeddings, labels, model_path, n_splits=5, batch_size=16, num_epochs=50, learning_rate=0.001):
        """
        Train the model using cross-validation and save the best model.

        A fresh classifier is trained on every fold; the weights of the fold
        with the highest validation accuracy are loaded into ``self.model``
        and written to ``model_path``.

        Args:
        embeddings (numpy.ndarray): The input embeddings.
        labels (numpy.ndarray): The corresponding labels.
        model_path (str): Path to save the best model.
        n_splits (int): Number of splits for cross-validation.
        batch_size (int): Batch size for training.
        num_epochs (int): Number of epochs for training.
        learning_rate (float): Learning rate for the optimizer.
        """
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

        embeddings_tensor = torch.FloatTensor(embeddings)
        labels_tensor = torch.LongTensor(labels)

        best_accuracy = 0
        best_model = None
        fold_accuracies = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(embeddings_tensor)):
            print(f"Fold {fold + 1}/{n_splits}")

            train_dataset = TreeDataset(embeddings_tensor[train_idx], labels_tensor[train_idx])
            val_dataset = TreeDataset(embeddings_tensor[val_idx], labels_tensor[val_idx])

            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)

            model = TreeClassifier(self.input_dim, self.hidden_dim, self.output_dim).to(self.device)
            # The model outputs probabilities, so use the NLL of their log
            # rather than nn.CrossEntropyLoss (which would apply softmax twice).
            criterion = self._probability_nll
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)

            for epoch in range(num_epochs):
                train_model(model, train_loader, criterion, optimizer, self.device)

            accuracy = evaluate_model(model, val_loader, self.device)
            fold_accuracies.append(accuracy)
            print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")

            # Always keep the first fold so that a run in which every fold
            # scores 0.0 still has weights to load and save.
            if best_model is None or accuracy > best_accuracy:
                best_accuracy = accuracy
                # Clone the tensors so the snapshot does not alias the fold model.
                best_model = {name: tensor.detach().clone()
                              for name, tensor in model.state_dict().items()}

        print(f"Cross-validation complete. Average accuracy: {np.mean(fold_accuracies):.4f} (+/- {np.std(fold_accuracies):.4f})")
        print()
        print(f"Best accuracy: {best_accuracy:.4f}")
        self.model.load_state_dict(best_model)
        self.save_model(model_path)

    def save_model(self, path):
        """Save the model weights to a file."""
        torch.save(self.model.state_dict(), path)
        print(f"Model saved to {path}")

    def load_model(self, path):
        """Load the model weights from a file.

        If ``path`` does not exist a message is printed and the current
        weights are left untouched.
        """
        if os.path.exists(path):
            # NOTE(review): torch.load unpickles the file; only load checkpoints
            # from trusted sources (consider weights_only=True where supported).
            self.model.load_state_dict(torch.load(path, map_location=self.device))
            self.model.eval()
            print(f"Model loaded from {path}")
        else:
            print(f"No saved model found at {path}")

    def predict(self, embeddings):
        """Predict the class of new unseen data.

        Args:
            embeddings: Batch of input vectors, one row per sample.

        Returns:
            tuple: ``(predictions, probabilities)`` - two lists with, for each
            sample, the predicted class index and the model output for that class.
        """
        self.model.eval()
        with torch.no_grad():
            embeddings_tensor = torch.FloatTensor(embeddings).to(self.device)
            outputs = self.model(embeddings_tensor)
            probabilities, predicted = torch.max(outputs, 1)
            predictions = predicted.cpu().numpy().tolist()
            probabilities = probabilities.cpu().numpy().tolist()
        return predictions, probabilities

    def predict_with_class_names(self, embeddings):
        """Predict the class of new unseen data and return class names.

        Returns:
            list: One ``(class_name, probability)`` tuple per sample.
        """
        predictions, probabilities = self.predict(embeddings)
        return [(self.class_names[pred], prob) for pred, prob in zip(predictions, probabilities)]

In [None]:
def main():
    """Train, save, reload and query the tree classifier on the synthetic data.

    Uses the module-level ``X_noisy`` / ``y`` arrays as training data, writes
    the best cross-validated model to disk, loads it back and prints the
    predictions for a few random embeddings.
    """
    # Set random seed for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)

    # Use the synthetic data generated earlier in the notebook (replace with your actual data)
    embeddings = X_noisy
    labels = y

    # Derive the embedding width from the data so the model and the dummy
    # prediction inputs below cannot drift apart from it.
    input_dim = embeddings.shape[1]
    model_path = 'best_tree_classifier_model.pth'

    # Initialize the classifier interface
    classifier = TreeClassifierInterface(input_dim=input_dim)

    # Train and save the model
    classifier.train_and_save_model(embeddings, labels, model_path)

    # Load the saved model (in a real scenario, you might do this in a separate script or session)
    classifier.load_model(model_path)

    # Generate new dummy embeddings for prediction (replace with your actual new data)
    new_embeddings = np.random.randn(5, input_dim)  # 5 new samples

    # Make predictions
    predictions = classifier.predict_with_class_names(new_embeddings)

    # Print results
    print("\nPredictions for new data:")
    for i, (class_name, probability) in enumerate(predictions):
        print(f"Sample {i + 1}: Predicted class: {class_name}, Probability: {probability:.4f}")

if __name__ == "__main__":
    main()