In [2]:
#r "nuget: System.Text.Json"
#r "nuget: Microsoft.ML"
#r "nuget: Azure.AI.OpenAI, 1.0.0-beta.12"
#r "nuget: DotNetEnv, 2.5.0"

In [3]:
using Microsoft.ML;
using Microsoft.ML.Data;
using System.Text.Json;
using System.IO;
using System.Text.Json.Serialization;
using System.Linq;
using Azure; 
using Azure.AI.OpenAI;
using DotNetEnv;

In [4]:
// Load the feedback record and service cluster class definitions from their source files
# load "./FeedbackRecord.cs"
# load "./ServiceCluster.cs"

In [None]:
/// <summary>
/// Computes the cosine similarity between two vectors of equal length.
/// </summary>
/// <param name="vectorA">First vector.</param>
/// <param name="vectorB">Second vector; must have the same length as <paramref name="vectorA"/>.</param>
/// <returns>
/// The dot product divided by the product of the two magnitudes, or 0 when either vector has
/// zero magnitude (which includes empty vectors), because the ratio is undefined in that case.
/// </returns>
/// <exception cref="ArgumentNullException">Either vector is null.</exception>
/// <exception cref="ArgumentException">The vectors have different lengths.</exception>
public static double CalculateCosineSimilarity(float[] vectorA, float[] vectorB)
{
    if (vectorA is null)
    {
        throw new ArgumentNullException(nameof(vectorA));
    }

    if (vectorB is null)
    {
        throw new ArgumentNullException(nameof(vectorB));
    }

    if (vectorA.Length != vectorB.Length)
    {
        // Comparing vectors of different dimensionality would silently drop components.
        throw new ArgumentException(
            $"Vectors must have the same length ({vectorA.Length} vs {vectorB.Length}).",
            nameof(vectorB));
    }

    double dotProduct = 0;
    double sumSquaresA = 0;
    double sumSquaresB = 0;

    for (int i = 0; i < vectorA.Length; i++)
    {
        // Widen to double before multiplying so the products are not rounded to float precision.
        double a = vectorA[i];
        double b = vectorB[i];

        dotProduct += a * b;
        sumSquaresA += a * a;
        sumSquaresB += b * b;
    }

    double denominator = Math.Sqrt(sumSquaresA) * Math.Sqrt(sumSquaresB);

    // A zero-magnitude vector has no direction; report "no similarity" instead of NaN from 0/0.
    return denominator == 0 ? 0 : dotProduct / denominator;
}

In [None]:
/// <summary>
/// Splits feedback records into sub-clusters by training an ML.NET K-Means model on their
/// embeddings and grouping the records by the cluster id predicted for each one.
/// </summary>
/// <param name="feedbackRecords">Records whose <c>Embedding</c> values are fed to the trainer.</param>
/// <param name="subClusterCount">Number of clusters requested from the K-Means trainer.</param>
/// <returns>One list of records for every predicted cluster id that occurs.</returns>
public List<List<FeedbackRecord>> CreateSubClustersWithKMeans(List<FeedbackRecord> feedbackRecords, int subClusterCount)
{
    var context = new MLContext();

    // Wrap every embedding in the row type that exposes the "Embedding" feature column.
    var rows = feedbackRecords
        .Select(record => new EmbeddingData { Embedding = record.Embedding })
        .ToList();
    var trainingData = context.Data.LoadFromEnumerable(rows);

    // Train K-Means on the rows, then score those same rows with the fitted model.
    var trainer = context.Clustering.Trainers.KMeans(featureColumnName: "Embedding", numberOfClusters: subClusterCount);
    var fitted = trainer.Fit(trainingData);
    var scored = fitted.Transform(trainingData);
    var assignments = context.Data
        .CreateEnumerable<ClusterPrediction>(scored, reuseRowObject: false)
        .ToList();

    // Pair record i with prediction i, then bucket the records by predicted cluster id.
    return feedbackRecords
        .Zip(assignments, (record, prediction) => new { Record = record, prediction.PredictedCluster })
        .GroupBy(pair => pair.PredictedCluster, pair => pair.Record)
        .Select(bucket => bucket.ToList())
        .ToList();
}

In [None]:
/// <summary>
/// Groups feedback records bottom-up: every record starts as its own cluster, and the pair of
/// clusters with the highest average cosine similarity is merged repeatedly until the best
/// remaining pair scores below <paramref name="similarityThreshold"/>.
/// </summary>
/// <param name="feedbackRecords">Records to cluster.</param>
/// <param name="similarityThreshold">
/// Merging stops as soon as the best pair's average similarity is below this value.
/// </param>
/// <returns>The clusters that remain once merging has stopped.</returns>
/// <remarks>
/// Every pass re-evaluates all cluster pairs through <c>CalculateAverageCosineSimilarity</c>,
/// so the cost grows quickly with the number of records.
/// </remarks>
public List<List<FeedbackRecord>> CreateSubClustersWithHAC(List<FeedbackRecord> feedbackRecords, double similarityThreshold)
{
    // Initially, each feedback record is its own cluster
    var clusters = feedbackRecords.Select(r => new List<FeedbackRecord> { r }).ToList();

    // With fewer than two clusters there is nothing left to merge.
    while (clusters.Count > 1)
    {
        double maxSimilarity = double.MinValue;
        int mergeIndex1 = -1;
        int mergeIndex2 = -1;

        // Find the most similar pair of clusters
        for (int i = 0; i < clusters.Count; i++)
        {
            for (int j = i + 1; j < clusters.Count; j++)
            {
                double similarity = CalculateAverageCosineSimilarity(clusters[i], clusters[j]);
                if (similarity > maxSimilarity)
                {
                    maxSimilarity = similarity;
                    mergeIndex1 = i;
                    mergeIndex2 = j;
                }
            }
        }

        // Stop when no pair qualified at all (e.g. every similarity was NaN, so the indices are
        // still -1 and must not be used) or when the best pair is below the threshold.
        if (mergeIndex1 < 0 || maxSimilarity < similarityThreshold)
        {
            break;
        }

        // Merge the two most similar clusters; mergeIndex2 > mergeIndex1, so removing it
        // does not shift the cluster that received the records.
        clusters[mergeIndex1].AddRange(clusters[mergeIndex2]);
        clusters.RemoveAt(mergeIndex2);
    }

    return clusters;
}

/// <summary>
/// Averages the cosine similarity over every pair formed by one record from
/// <paramref name="cluster1"/> and one record from <paramref name="cluster2"/>.
/// </summary>
/// <param name="cluster1">First group of records.</param>
/// <param name="cluster2">Second group of records.</param>
/// <returns>
/// The mean pairwise similarity, or 0 when either cluster is empty and there are no pairs to average.
/// </returns>
private double CalculateAverageCosineSimilarity(List<FeedbackRecord> cluster1, List<FeedbackRecord> cluster2)
{
    double totalSimilarity = 0;
    int comparisons = 0;

    foreach (var record1 in cluster1)
    {
        foreach (var record2 in cluster2)
        {
            totalSimilarity += CalculateCosineSimilarity(record1.Embedding, record2.Embedding);
            comparisons++;
        }
    }

    // Without this guard an empty cluster would produce 0/0 = NaN.
    return comparisons > 0 ? totalSimilarity / comparisons : 0;
}

In [None]:
/// <summary>
/// Greedy seed-based sub-clustering. While records remain unassigned, the first one the set
/// enumerates becomes the seed of a new sub-cluster, and every other unassigned record whose
/// cosine similarity to that seed is at least <paramref name="threshold"/> joins it.
/// </summary>
/// <param name="feedbackRecords">Records to partition.</param>
/// <param name="threshold">Minimum similarity to the seed required to join its sub-cluster.</param>
/// <returns>The sub-clusters in the order they were created; each starts with its seed.</returns>
public List<List<FeedbackRecord>> CreateSubClusters(List<FeedbackRecord> feedbackRecords, double threshold)
{
    var result = new List<List<FeedbackRecord>>();
    var remaining = new HashSet<FeedbackRecord>(feedbackRecords);

    while (remaining.Count > 0)
    {
        // Take the next unassigned record as the seed of a new sub-cluster.
        var seed = remaining.First();
        remaining.Remove(seed);

        // Members are compared against the seed only, never against each other.
        var members = remaining
            .Where(candidate => CalculateCosineSimilarity(seed.Embedding, candidate.Embedding) >= threshold)
            .ToList();
        remaining.ExceptWith(members);

        var group = new List<FeedbackRecord> { seed };
        group.AddRange(members);
        result.Add(group);
    }

    return result;
}

In [None]:
// Cosine similarity threshold handed to the sub-clustering routine below.
double cosineThreshold = 0.862;

// Sub-cluster every main cluster in clusterList and store the result on the cluster itself.
// HAC is the strategy in use; CreateSubClusters (greedy) and CreateSubClustersWithKMeans are
// the alternative routines defined in this notebook.
foreach (var mainCluster in clusterList)
{
    var records = mainCluster.FeedbackRecords;
    Console.WriteLine($"Processing sub-clustering for main cluster {mainCluster.ClusterId} with {records.Count} items");

    var subClusters = CreateSubClustersWithHAC(records, cosineThreshold);
    Console.WriteLine($"Main Cluster {mainCluster.ClusterId} has  {subClusters.Count} HAC based clusters .");

    // Attach the sub-clusters to their main cluster.
    mainCluster.SubClusters = subClusters;
}

In [None]:
/// <summary>
/// Averages the cosine similarity over every unordered pair of records in the list.
/// </summary>
/// <param name="feedbackRecords">Records whose embeddings are compared pairwise.</param>
/// <returns>The mean pairwise similarity, or 0 when the list holds fewer than two records.</returns>
private double CalculateAverageSimilarity(List<FeedbackRecord> feedbackRecords)
{
    int total = feedbackRecords.Count;

    // Fewer than two records means there is no pair to compare.
    if (total < 2)
    {
        return 0;
    }

    double sum = 0;
    int pairs = 0;

    for (int first = 0; first < total - 1; first++)
    {
        var anchor = feedbackRecords[first].Embedding;

        // Only look at later records so each pair is counted exactly once.
        for (int second = first + 1; second < total; second++)
        {
            sum += CalculateCosineSimilarity(anchor, feedbackRecords[second].Embedding);
            pairs++;
        }
    }

    return sum / pairs;
}

In [None]:
// Keep only the main clusters that have sub-clusters assigned and were split into more than one.
var clustersWithMultipleSubClusters =
    (from candidate in clusterList
     where candidate.SubClusters?.Count > 1
     select candidate).ToList();

In [None]:
// Instruction text passed to CallOpenAI together with a cluster summary to obtain a
// one-sentence theme. The stray, unmatched opening typographic quote was removed from the prompt.
string thematicMessage = @"
Given the following summary, create one concise, overarching statement that captures the main theme or purpose described. 
Focus on summarizing the core idea in a single short sentence.";

In [None]:
// For every main cluster that was split, send its summary with the thematic prompt to
// CallOpenAI and print the reply next to the cluster id and the stored summary.
foreach (var cluster in clustersWithMultipleSubClusters)
{
    var theme = await CallOpenAI(cluster.Summary, thematicMessage, JasonResponse : false);

    Console.WriteLine($"{cluster.ClusterId} with summary {cluster.Summary} \n with: {theme}");
}

In [None]:
// Location of the JSON file with the full cluster set, relative to dataRoot.
string fullClusterFilePath = $"{dataRoot}/fabric-clusters-full.json";
Console.WriteLine($"Loading full clusters from file: {fullClusterFilePath}");

// Deserialize the file into the cluster list that the other cells iterate over.
List<ServiceCluster> clusterList = await LoadClustersFromFile(fullClusterFilePath);

In [None]:


// Log information about each main cluster that has multiple sub-clusters
foreach (var mainCluster in clustersWithMultipleSubClusters)
{
    // Fetch the thematic statement for this cluster here. As far as this notebook shows,
    // openAIResponse is only declared inside the loop body of another cell, so it is not in
    // scope in this cell, and the heading needs a per-cluster value anyway.
    var openAIResponse = await CallOpenAI(mainCluster.Summary, thematicMessage, JasonResponse : false);

    Console.WriteLine($"Main Cluster {openAIResponse} with {mainCluster.FeedbackRecords.Count} items and {mainCluster.SubClusters.Count} sub-clusters.");

    foreach (var subCluster in mainCluster.SubClusters)
    {
        Console.WriteLine("  ------------------");
        Console.WriteLine($"  - Sub-Cluster with {subCluster.Count} feedback items.");

        // Average pairwise cosine similarity inside the sub-cluster (0 for a single record)
        var avgSimilarity = CalculateAverageSimilarity(subCluster);
        Console.WriteLine($"    Average Similarity within Sub-Cluster: {avgSimilarity:F2}");

        // Print the feedback items only if the sub-cluster holds more than one
        if (subCluster.Count > 1)
        {
            foreach (var feedback in subCluster)
            {
                Console.WriteLine($"    Sample Feedback:{feedback.Id}|| {feedback.UserStory} ||");
            }
        }
    }
}

In [None]:
/// <summary>
/// Reads a JSON file and deserializes it into a list of <see cref="ServiceCluster"/>.
/// Loading is best-effort: a missing file or any exception is reported on the console and
/// an empty list is returned instead.
/// </summary>
/// <param name="filePath">Path of the JSON file to read.</param>
/// <returns>The deserialized clusters, or an empty list when nothing could be loaded.</returns>
public async Task<List<ServiceCluster>> LoadClustersFromFile(string filePath)
{
    // Writes the failure line and hands back the empty fallback result.
    List<ServiceCluster> ReportFailure(string reason)
    {
        Console.WriteLine($"Error loading clusters from file: {reason}");
        return new List<ServiceCluster>();
    }

    if (!File.Exists(filePath))
    {
        return ReportFailure("File not found.");
    }

    try
    {
        var content = await File.ReadAllTextAsync(filePath);
        var parsed = JsonSerializer.Deserialize<List<ServiceCluster>>(content);

        Console.WriteLine($"Clusters loaded from {filePath}");

        // Deserialize may yield null; callers always get a list.
        return parsed ?? new List<ServiceCluster>();
    }
    catch (Exception ex)
    {
        return ReportFailure(ex.Message);
    }
}

In [None]:
// Match JSON property names to ServiceCluster members regardless of casing.
var options = new JsonSerializerOptions
{
    PropertyNameCaseInsensitive = true
};

// Deserialize can return null (e.g. for a JSON "null" document); fall back to an empty list
// so the loop below cannot fail on a null reference.
List<ServiceCluster> clusters = JsonSerializer.Deserialize<List<ServiceCluster>>(jsonString, options)
    ?? new List<ServiceCluster>();

// Process each cluster to generate initiative ideas
foreach (var cluster in clusters)
{
    // Extract necessary information
    string clusterId = cluster.ClusterId;
    string commonElement = cluster.CommonElement;
    int similarFeedbacks = cluster.SimilarFeedbacks;
    int distinctCustomers = cluster.DistinctCustomers;
    string summary = cluster.Summary;

    // Generate initiative idea
    string initiativeIdea = $"Initiative Idea for Cluster {clusterId}:\n" +
        $"- **Focus Area**: {commonElement}\n" +
        $"- **Description**: {summary}\n" +
        $"- **Potential Impact**: Addresses feedback from {similarFeedbacks} similar feedback items across {distinctCustomers} customers.\n";

    // Output the initiative idea, followed by a separator line
    Console.WriteLine(initiativeIdea);
    Console.WriteLine(new string('*', 50));
}
