Skip to content

.NET: Bug: Calls to PTU 4o-mini instance hanging / exceeding 10 seconds #12484

Closed
@dominic-codespoti

Description

@dominic-codespoti

Describe the bug
Calls to Azure OpenAI using a Provisioned Throughput Unit (PTU) for the gpt-4o-mini model intermittently hang for up to 10+ seconds before failing with a timeout. These are low-token requests (<100 total tokens) and do not consistently reproduce, but the latency spikes are severe enough to trigger client-side timeouts and significantly degrade reliability.

The issue persists despite:

  • Using SocketsHttpHandler with PooledConnectionLifetime = 1 minute
  • Applying Polly retry and timeout strategies
  • Rebuilding SemanticKernel and HttpClient per request
  • Using dedicated PTU resources (no quota errors)

To Reproduce
Steps to reproduce the behavior:

  1. Deploy Azure OpenAI with a provisioned GPT-4o Mini deployment (gpt-4o-mini)
  2. Use Semantic Kernel to call GetChatMessageContentAsync(...) on low-token prompts
  3. Wrap the call in a timeout of 3–5 seconds
  4. Observe intermittent timeouts or long-running requests (>10s)
  5. Logs show no content returned and cancelled sockets (OperationCanceledException, sometimes SocketException)

Expected behavior
Consistently low-latency completions from a provisioned GPT-4o Mini deployment.


Platform

  • Language: C#
  • Source: NuGet package Microsoft.SemanticKernel latest version
  • AI model: Azure OpenAI PTU — gpt-4o-mini
  • IDE: Rider
  • OS: Windows 11 / Linux (reproduced on both)

Snippets

    static IAsyncPolicy<HttpResponseMessage> GetRetryPolicy()
    {
        // Retry transient HTTP failures (5xx, 408, HttpRequestException) and
        // Polly timeout rejections up to three times, with no delay between attempts.
        var transientErrors = HttpPolicyExtensions.HandleTransientHttpError();
        return transientErrors
            .Or<TimeoutRejectedException>()
            .WaitAndRetryAsync(retryCount: 3, sleepDurationProvider: _ => TimeSpan.Zero);
    }

    static IAsyncPolicy<HttpResponseMessage> GetTimeoutPolicy()
    {
        // Cap each individual HTTP attempt at three seconds; on expiry Polly
        // throws TimeoutRejectedException, which the retry policy above handles.
        return Policy.TimeoutAsync<HttpResponseMessage>(TimeSpan.FromSeconds(3));
    }

    // Registers the named HttpClient used by OpenAiClient.
    // NOTE(review): this snippet is truncated in the issue paste — the method's
    // closing braces and `return services;` are missing.
    public static IServiceCollection AddApplicationServices(this IServiceCollection services)
    {
        services.AddHttpClient();

        services.AddSingleton<OpenAiRateLimitHandler>();
        services.AddHttpClient(nameof(OpenAiClient))
            // Primary handler: short connect timeout and a 1-minute pooled
            // connection lifetime so DNS/endpoint changes are picked up.
            .ConfigurePrimaryHttpMessageHandler(() => new SocketsHttpHandler
            {
                PooledConnectionLifetime = TimeSpan.FromMinutes(1),
                ConnectTimeout = TimeSpan.FromSeconds(2),
                AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
                MaxConnectionsPerServer = 50
            })
            // Outermost handler added first: retry wraps the per-attempt timeout.
            .AddPolicyHandler(GetRetryPolicy())
            .AddPolicyHandler(GetTimeoutPolicy())
            .AddHttpMessageHandler<OpenAiRateLimitHandler>();
/// <summary>
/// Chat-completion client over Semantic Kernel / Azure OpenAI. Supports plain
/// text generation and a two-pass "route then answer" flow in which the model
/// first selects plugin functions to call (manual invocation), their results
/// are appended to the history, and a final structured-JSON answer is produced.
/// </summary>
public sealed class OpenAiClient : IOpenAiClient
{
    private readonly Kernel _kernel;
    private readonly ILogger<OpenAiClient> _logger;
    private readonly List<IPlugin> _plugins;

    // Shared serializer options: case-insensitive property matching for
    // model-produced JSON, relaxed escaping so content round-trips cleanly.
    private static readonly JsonSerializerOptions JsonOpts = new()
    {
        PropertyNameCaseInsensitive = true,
        Encoder = System.Text.Encodings.Web.JavaScriptEncoder.UnsafeRelaxedJsonEscaping
    };

    // Settings for the structured-output main pass: deterministic sampling,
    // JSON-schema-constrained response, no automatic function calling.
    private static OpenAIPromptExecutionSettings Exec(string name, BinaryData schema) => new()
    {
        Temperature = 0f,
        ResponseFormat = ChatResponseFormat.CreateJsonSchemaFormat(name, schema, jsonSchemaIsStrict: false),
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    // Settings for free-text generation (no response schema).
    private static readonly OpenAIPromptExecutionSettings NonJsonExec = new()
    {
        Temperature = 0f,
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    // Settings for the routing pass: the model emits an UnhydratedToolCall
    // JSON payload describing which functions to invoke; we invoke them
    // manually rather than letting SK auto-invoke.
    private static readonly OpenAIPromptExecutionSettings ManualExec = new()
    {
        Temperature = 0f,
        ResponseFormat = ChatResponseFormat.CreateJsonSchemaFormat("Tool", BinaryData.FromString(UnhydratedToolCall.JsonSchema), jsonSchemaIsStrict: false),
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    /// <summary>
    /// Builds the base kernel once, using the named HttpClient registered in DI
    /// so connection pooling / resilience policies apply to all SK traffic.
    /// </summary>
    public OpenAiClient(IEnumerable<IPlugin> plugins,
        IOptions<OpenAISettings> cfg,
        ILogger<OpenAiClient> logger,
        IHttpClientFactory httpFactory)
    {
        _logger = logger;
        _kernel = BuildKernel(cfg.Value, httpFactory.CreateClient(nameof(OpenAiClient)));
        _plugins = plugins.ToList();
    }

    /// <summary>
    /// Single free-text completion for <paramref name="hist"/>; returns the raw
    /// content plus total token usage. No plugins, no response schema.
    /// </summary>
    public async Task<OpenAiResponse<string>> Generate(ChatHistory hist, CancellationToken ct)
    {
        var service = _kernel.GetRequiredService<IChatCompletionService>();
        var response = await service.GetChatMessageContentAsync(hist, NonJsonExec, _kernel, ct);
        var tokensUsed = GetTokensUsed(response);
        var content = response.Content;
        _logger.LogInformation("Chat usage: {Tokens}", tokensUsed);

        return new OpenAiResponse<string>(content, tokensUsed, []);
    }

    /// <summary>
    /// Two-pass structured completion: a routing pass asks the model which of
    /// the requested plugins to call, the calls are invoked manually and their
    /// results appended to <paramref name="hist"/>, then a main pass produces a
    /// <typeparamref name="TOut"/> constrained by <paramref name="schema"/>.
    /// Returns default content when the model emits no usable JSON.
    /// </summary>
    public async Task<OpenAiResponse<TOut>> Generate<TOut>(string convoId, ChatHistory hist, BinaryData schema, PluginInfo pluginInfo, CancellationToken ct)
    {
        // -----------------------------------------------------------------
        // ①  Prepare kernel and register only requested plugins
        // -----------------------------------------------------------------
        var kernel = _kernel.Clone();
        kernel.Data["conversationId"] = convoId;

        // Resolve requested plugin names against the registered set; names with
        // no matching plugin are silently dropped.
        var plugins = pluginInfo.Plugins.Select(x => _plugins.FirstOrDefault(p => p.Name == x))
            .Where(x => x is not null)
            .Cast<IPlugin>()
            .ToList();

        var pluginKernel = kernel.Clone();
        foreach (var p in plugins)
        {
            pluginKernel.Plugins.AddFromObject(p, p.GetType().Name);
        }

        // -----------------------------------------------------------------
        // ②  ROUTING PASS  (autoInvoke:false)
        //     Ask the LLM which of those plugins it actually wants to call
        // -----------------------------------------------------------------
        var systemPrompt = PluginRouter.CreateSystemPrompt(plugins);

        var routerHist = new ChatHistory();
        routerHist.AddSystemMessage(systemPrompt);
        routerHist.AddUserMessage(pluginInfo.Prompt);

        var svc = kernel.GetRequiredService<IChatCompletionService>();
        var routingMsg = await svc.GetChatMessageContentAsync(routerHist, ManualExec, kernel, ct);

        // FIX: Content can be null (empty/filtered completion); the previous
        // unconditional dereference risked a NullReferenceException here.
        var routingJson = routingMsg.Content?.ExtractJson();
        var routedCall = string.IsNullOrWhiteSpace(routingJson)
            ? null
            : JsonSerializer.Deserialize<UnhydratedToolCall>(routingJson, JsonOpts);
        var callsToMake = routedCall?.Hydrated.ToList() ?? [];

        var callContentList = callsToMake.Select(x => new FunctionCallContent(x.FunctionName, x.PluginName, x.Id)).ToList();
        var callResultList = new List<ChatMessageContent>();

        foreach (var call in callsToMake)
        {
            var matchingFn = pluginKernel.Plugins.TryGetFunction(call.PluginName, call.FunctionName, out var fn) ? fn : null;
            if (matchingFn is null)
            {
                // Model hallucinated a function we never offered; drop the call
                // so the assistant tool-call message stays consistent.
                _logger.LogWarning("No matching function found for call: {Call}", call);
                callContentList.RemoveAll(x => x.Id == call.Id);
                continue;
            }

            FunctionResult result;
            try
            {
                // FIX: invoke against pluginKernel (where the function and its
                // sibling plugins are registered), not the plugin-less clone.
                result = await matchingFn.InvokeAsync(pluginKernel, new KernelArguments(call.Arguments), ct);
            }
            catch (Exception ex)
            {
                // Best-effort: surface the failure to the model as the tool
                // result instead of aborting the whole generation.
                result = new FunctionResult(matchingFn, value: $"Function call failed: {ex.Message}");
            }

            var resultContent = new FunctionResultContent(call.FunctionName, call.PluginName, call.Id, result.GetValue<object?>());
            callResultList.Add(new ChatMessageContent(AuthorRole.Tool, [resultContent])
            {
                Content = JsonSerializer.Serialize(result.GetValue<object>(), JsonOpts),
            });
        }

        if (callsToMake.Count != 0)
        {
            // Reconstruct the assistant tool-call turn that would normally be
            // produced by auto-invocation, then append the tool results.
            var callContent = new ChatMessageContent
            {
                Items = [..callContentList],
                Metadata = new Dictionary<string, object?>
                {
                    { "tool_calls", callContentList },
                },
                Role = AuthorRole.Assistant,
                Content = string.Empty
            };

            hist.Add(callContent);
            hist.AddRange(callResultList);
        }

        var routerTokensUsed = GetTokensUsed(routingMsg);
        _logger.LogInformation("Plugin usage: {Usage}", routerTokensUsed);

        // -----------------------------------------------------------------
        // ③  MAIN PASS  (standard Exec settings)
        // -----------------------------------------------------------------
        var response = await svc.GetChatMessageContentAsync(hist, Exec(typeof(TOut).Name, schema), kernel, ct);
        // FIX: null-safe — Content may be null; IsNullOrWhiteSpace below covers it.
        var json = response.Content?.ExtractJson();
        var tokensUsed = GetTokensUsed(response);
        var toolCalls = hist.Where(x => x.Role == AuthorRole.Tool).Select(x => x.Content ?? string.Empty).Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
        _logger.LogInformation("Chat usage: {Usage}", tokensUsed);

        return string.IsNullOrWhiteSpace(json)
            ? new OpenAiResponse<TOut>(default, tokensUsed, [])
            : new OpenAiResponse<TOut>(JsonSerializer.Deserialize<TOut>(json, JsonOpts), tokensUsed, toolCalls);
    }

    /// <summary>
    /// Creates a kernel bound to the configured Azure OpenAI deployment. When
    /// no API key is configured, falls back to DefaultAzureCredential
    /// (managed identity / developer credentials).
    /// </summary>
    private static Kernel BuildKernel(OpenAISettings cfg, HttpClient http)
    {
        var b = Kernel.CreateBuilder();
        b.Services.AddLogging(l => l.SetMinimumLevel(LogLevel.Information));

        if (string.IsNullOrWhiteSpace(cfg.Key))
        {
            b.AddAzureOpenAIChatCompletion(cfg.ChatDeploymentId, cfg.Endpoint, new DefaultAzureCredential(), httpClient: http);
        }
        else
        {
            b.AddAzureOpenAIChatCompletion(cfg.ChatDeploymentId, cfg.Endpoint, cfg.Key, httpClient: http);
        }

        return b.Build();
    }

    /// <summary>
    /// Extracts total token usage from a completion's "Usage" metadata entry;
    /// returns 0 when metadata or usage information is absent.
    /// </summary>
    private static int GetTokensUsed(ChatMessageContent content)
    {
        if (content.Metadata is not { } meta)
        {
            return 0;
        }

        if (meta.TryGetValue("Usage", out var usage) && usage is ChatTokenUsage tokenUsage)
        {
            return tokenUsage.TotalTokenCount;
        }

        return 0;
    }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    .NET (Issue or Pull requests regarding .NET code) · bug (Something isn't working) · java (Issue or PR regarding Java code) · triage

    Type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions