**Describe the bug**
Calls to Azure OpenAI using a Provisioned Throughput Unit (PTU) deployment of the `gpt-4o-mini` model intermittently hang for 10 seconds or more before failing with a timeout. These are low-token requests (<100 total tokens) and the hangs do not reproduce consistently, but the latency spikes are severe enough to trigger client-side timeouts and significantly degrade reliability.
The issue persists despite:
- Using `SocketsHttpHandler` with `PooledConnectionLifetime = 1 minute`
- Applying Polly retry and timeout strategies
- Rebuilding the `Kernel` and `HttpClient` per request (see the sketch after this list)
- Using dedicated PTU resources (no quota errors)
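For clarity, a minimal sketch of the "rebuild per request" mitigation that was attempted; the method shape here is illustrative, reusing the `BuildKernel` helper and the `IHttpClientFactory` shown in the snippets below:

```csharp
// Hypothetical illustration of the per-request rebuild mitigation: a fresh
// HttpClient and Kernel are constructed for every call, so no pooled
// connection or kernel state is shared across requests.
// (_httpFactory and _settings are assumed fields mirroring the class below.)
public async Task<string?> GenerateWithFreshKernel(ChatHistory hist, CancellationToken ct)
{
    var http = _httpFactory.CreateClient(nameof(OpenAiClient)); // new client; named config still applies
    var kernel = BuildKernel(_settings, http);                  // same BuildKernel as in the snippet below
    var svc = kernel.GetRequiredService<IChatCompletionService>();
    var response = await svc.GetChatMessageContentAsync(hist, cancellationToken: ct);
    return response.Content;
}
```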
**To Reproduce**
Steps to reproduce the behavior (a minimal repro sketch follows the list):
- Deploy Azure OpenAI with a provisioned GPT-4o Mini deployment (`gpt-4o-mini`)
- Use Semantic Kernel to call `GetChatMessageContentAsync(...)` on low-token prompts
- Wrap the call in a timeout of 3–5 seconds
- Observe intermittent timeouts or long-running requests (>10s)
- Logs show no content returned and cancelled sockets (`OperationCanceledException`, sometimes `SocketException`)
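A minimal, self-contained repro sketch of the steps above; the endpoint, key, and prompt are placeholders, and the 3-second budget matches the timeout policy used below:

```csharp
using System.Diagnostics;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.ChatCompletion;

// Placeholders: substitute a real PTU endpoint, key, and deployment name.
var kernel = Kernel.CreateBuilder()
    .AddAzureOpenAIChatCompletion("gpt-4o-mini", "https://<resource>.openai.azure.com/", "<api-key>")
    .Build();
var chat = kernel.GetRequiredService<IChatCompletionService>();

var hist = new ChatHistory();
hist.AddUserMessage("Reply with the single word: pong"); // well under 100 tokens

using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(3));
var sw = Stopwatch.StartNew();
try
{
    var reply = await chat.GetChatMessageContentAsync(hist, kernel: kernel, cancellationToken: cts.Token);
    Console.WriteLine($"{sw.ElapsedMilliseconds} ms: {reply.Content}");
}
catch (OperationCanceledException)
{
    // Intermittently lands here even though the request is tiny.
    Console.WriteLine($"Cancelled after {sw.ElapsedMilliseconds} ms");
}
```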
**Expected behavior**
Consistently low-latency completions from a provisioned GPT-4o Mini deployment.
**Platform**
- Language: C#
- Source: NuGet package `Microsoft.SemanticKernel`, latest version
- AI model: Azure OpenAI PTU (`gpt-4o-mini`)
- IDE: Rider
- OS: Windows 11 / Linux (reproduced on both)
**Snippets**
```csharp
// Retry up to three times, immediately (no backoff), on transient HTTP
// errors or Polly timeout rejections.
static IAsyncPolicy<HttpResponseMessage> GetRetryPolicy() =>
    HttpPolicyExtensions
        .HandleTransientHttpError()
        .Or<TimeoutRejectedException>()
        .WaitAndRetryAsync(3, _ => TimeSpan.Zero);

// Optimistic 3-second timeout applied around each HTTP attempt.
static IAsyncPolicy<HttpResponseMessage> GetTimeoutPolicy() =>
    Policy.TimeoutAsync<HttpResponseMessage>(TimeSpan.FromSeconds(3));
public static IServiceCollection AddApplicationServices(this IServiceCollection services)
{
    services.AddHttpClient();
    services.AddSingleton<OpenAiRateLimitHandler>();
    services.AddHttpClient(nameof(OpenAiClient))
        .ConfigurePrimaryHttpMessageHandler(() => new SocketsHttpHandler
        {
            PooledConnectionLifetime = TimeSpan.FromMinutes(1),
            ConnectTimeout = TimeSpan.FromSeconds(2),
            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
            MaxConnectionsPerServer = 50
        })
        // Handlers added first are outermost: the retry policy wraps the
        // timeout policy, so the 3-second timeout applies to each attempt.
        .AddPolicyHandler(GetRetryPolicy())
        .AddPolicyHandler(GetTimeoutPolicy())
        .AddHttpMessageHandler<OpenAiRateLimitHandler>();

    return services;
}
```

```csharp
public sealed class OpenAiClient : IOpenAiClient
{
    private readonly Kernel _kernel;
    private readonly ILogger<OpenAiClient> _logger;
    private readonly List<IPlugin> _plugins;

    private static readonly JsonSerializerOptions JsonOpts = new()
    {
        PropertyNameCaseInsensitive = true,
        Encoder = System.Text.Encodings.Web.JavaScriptEncoder.UnsafeRelaxedJsonEscaping
    };

    // Structured-output settings: deterministic, JSON-schema response, no auto tool calls.
    private static OpenAIPromptExecutionSettings Exec(string name, BinaryData schema) => new()
    {
        Temperature = 0f,
        ResponseFormat = ChatResponseFormat.CreateJsonSchemaFormat(name, schema, jsonSchemaIsStrict: false),
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    private static readonly OpenAIPromptExecutionSettings NonJsonExec = new()
    {
        Temperature = 0f,
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    // Settings for the routing pass: the model describes its desired tool calls as JSON.
    private static readonly OpenAIPromptExecutionSettings ManualExec = new()
    {
        Temperature = 0f,
        ResponseFormat = ChatResponseFormat.CreateJsonSchemaFormat("Tool", BinaryData.FromString(UnhydratedToolCall.JsonSchema), jsonSchemaIsStrict: false),
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    public OpenAiClient(IEnumerable<IPlugin> plugins,
        IOptions<OpenAISettings> cfg,
        ILogger<OpenAiClient> logger,
        IHttpClientFactory httpFactory)
    {
        _logger = logger;
        _kernel = BuildKernel(cfg.Value, httpFactory.CreateClient(nameof(OpenAiClient)));
        _plugins = plugins.ToList();
    }
    public async Task<OpenAiResponse<string>> Generate(ChatHistory hist, CancellationToken ct)
    {
        var service = _kernel.GetRequiredService<IChatCompletionService>();
        var response = await service.GetChatMessageContentAsync(hist, NonJsonExec, _kernel, ct);
        var tokensUsed = GetTokensUsed(response);
        var content = response.Content;
        _logger.LogInformation("Chat usage: {Tokens}", tokensUsed);
        return new OpenAiResponse<string>(content, tokensUsed, []);
    }
    public async Task<OpenAiResponse<TOut>> Generate<TOut>(string convoId, ChatHistory hist, BinaryData schema, PluginInfo pluginInfo, CancellationToken ct)
    {
        // -----------------------------------------------------------------
        // ① Prepare kernel and register only requested plugins
        // -----------------------------------------------------------------
        var kernel = _kernel.Clone();
        kernel.Data["conversationId"] = convoId;
        var plugins = pluginInfo.Plugins.Select(x => _plugins.FirstOrDefault(p => p.Name == x))
            .Where(x => x is not null)
            .Cast<IPlugin>()
            .ToList();
        var pluginKernel = kernel.Clone();
        foreach (var p in plugins)
        {
            pluginKernel.Plugins.AddFromObject(p, p.GetType().Name);
        }

        // -----------------------------------------------------------------
        // ② ROUTING PASS (autoInvoke:false)
        //    Ask the LLM which of those plugins it actually wants to call
        // -----------------------------------------------------------------
        var systemPrompt = PluginRouter.CreateSystemPrompt(plugins);
        var routerHist = new ChatHistory();
        routerHist.AddSystemMessage(systemPrompt);
        routerHist.AddUserMessage(pluginInfo.Prompt);
        var svc = kernel.GetRequiredService<IChatCompletionService>();
        var routingMsg = await svc.GetChatMessageContentAsync(routerHist, ManualExec, kernel, ct);
        var callsToMake = JsonSerializer.Deserialize<UnhydratedToolCall>(routingMsg.Content.ExtractJson(), JsonOpts)?.Hydrated.ToList() ?? [];
        var callContentList = callsToMake.Select(x => new FunctionCallContent(x.FunctionName, x.PluginName, x.Id)).ToList();
        var callResultList = new List<ChatMessageContent>();
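        // -----------------------------------------------------------------
        // ③ Execute the requested tool calls locally
        // -----------------------------------------------------------------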
        foreach (var call in callsToMake)
        {
            var matchingFn = pluginKernel.Plugins.TryGetFunction(call.PluginName, call.FunctionName, out var fn) ? fn : null;
            if (matchingFn is null)
            {
                _logger.LogWarning("No matching function found for call: {Call}", call);
                callContentList.RemoveAll(x => x.Id == call.Id);
                continue;
            }

            FunctionResult result;
            try
            {
                result = await matchingFn.InvokeAsync(kernel, new KernelArguments(call.Arguments), ct);
            }
            catch (Exception ex)
            {
                // Surface the failure to the model instead of aborting the whole turn.
                result = new FunctionResult(matchingFn, value: $"Function call failed: {ex.Message}");
            }

            var resultContent = new FunctionResultContent(call.FunctionName, call.PluginName, call.Id, result.GetValue<object?>());
            callResultList.Add(new ChatMessageContent(AuthorRole.Tool, [resultContent])
            {
                Content = JsonSerializer.Serialize(result.GetValue<object>(), JsonOpts),
            });
        }
        if (callsToMake.Count != 0)
        {
            var callContent = new ChatMessageContent
            {
                Items = [..callContentList],
                Metadata = new Dictionary<string, object?>
                {
                    { "tool_calls", callContentList },
                },
                Role = AuthorRole.Assistant,
                Content = string.Empty
            };
            hist.Add(callContent);
            hist.AddRange(callResultList);
        }
        var routerTokensUsed = GetTokensUsed(routingMsg);
        _logger.LogInformation("Plugin usage: {Usage}", routerTokensUsed);

        // -----------------------------------------------------------------
        // ④ MAIN PASS (standard Exec settings)
        // -----------------------------------------------------------------
        var response = await svc.GetChatMessageContentAsync(hist, Exec(typeof(TOut).Name, schema), kernel, ct);
        var json = response.Content.ExtractJson();
        var tokensUsed = GetTokensUsed(response);
        var toolCalls = hist.Where(x => x.Role == AuthorRole.Tool).Select(x => x.Content ?? string.Empty).Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
        _logger.LogInformation("Chat usage: {Usage}", tokensUsed);
        return string.IsNullOrWhiteSpace(json)
            ? new OpenAiResponse<TOut>(default, tokensUsed, [])
            : new OpenAiResponse<TOut>(JsonSerializer.Deserialize<TOut>(json, JsonOpts), tokensUsed, toolCalls);
    }
    private static Kernel BuildKernel(OpenAISettings cfg, HttpClient http)
    {
        var b = Kernel.CreateBuilder();
        b.Services.AddLogging(l => l.SetMinimumLevel(LogLevel.Information));
        // Fall back to managed identity when no API key is configured.
        if (string.IsNullOrWhiteSpace(cfg.Key))
        {
            b.AddAzureOpenAIChatCompletion(cfg.ChatDeploymentId, cfg.Endpoint, new DefaultAzureCredential(), httpClient: http);
        }
        else
        {
            b.AddAzureOpenAIChatCompletion(cfg.ChatDeploymentId, cfg.Endpoint, cfg.Key, httpClient: http);
        }
        return b.Build();
    }
    private static int GetTokensUsed(ChatMessageContent content)
    {
        if (content.Metadata is not { } meta)
        {
            return 0;
        }
        if (meta.TryGetValue("Usage", out var usage) && usage is ChatTokenUsage tokenUsage)
        {
            return tokenUsage.TotalTokenCount;
        }
        return 0;
    }
}
```