**Describe the bug**
Calls to Azure OpenAI using a Provisioned Throughput Unit (PTU) deployment of the `gpt-4o-mini` model intermittently hang for 10 seconds or more before failing with a timeout. These are low-token requests (<100 total tokens) and the hangs do not reproduce consistently, but the latency spikes are severe enough to trigger client-side timeouts and significantly degrade reliability.
The issue persists despite:
- Using `SocketsHttpHandler` with `PooledConnectionLifetime = 1 minute`
- Applying Polly retry and timeout strategies
- Rebuilding the `Kernel` and `HttpClient` per request (see the sketch after this list)
- Using dedicated PTU resources (no quota errors)
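For clarity, a minimal sketch of the "rebuild per request" mitigation that was attempted; the method shape here is illustrative, reusing the `BuildKernel` helper and the `IHttpClientFactory` shown in the snippets below:

```csharp
// Hypothetical illustration of the per-request rebuild mitigation: a fresh
// HttpClient and Kernel are constructed for every call, so no pooled
// connection or kernel state is shared across requests.
// (_httpFactory and _settings are assumed fields mirroring the class below.)
public async Task<string?> GenerateWithFreshKernel(ChatHistory hist, CancellationToken ct)
{
    var http = _httpFactory.CreateClient(nameof(OpenAiClient)); // new client; named config still applies
    var kernel = BuildKernel(_settings, http);                  // same BuildKernel as in the snippet below
    var svc = kernel.GetRequiredService<IChatCompletionService>();
    var response = await svc.GetChatMessageContentAsync(hist, cancellationToken: ct);
    return response.Content;
}
```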
**To Reproduce**
Steps to reproduce the behavior (a minimal repro sketch follows the list):
- Deploy Azure OpenAI with a provisioned GPT-4o Mini deployment (`gpt-4o-mini`)
- Use Semantic Kernel to call `GetChatMessageContentAsync(...)` on low-token prompts
- Wrap the call in a timeout of 3–5 seconds
- Observe intermittent timeouts or long-running requests (>10s)
- Logs show no content returned and cancelled sockets (`OperationCanceledException`, sometimes `SocketException`)
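A minimal, self-contained repro sketch of the steps above; the endpoint, key, and prompt are placeholders, and the 3-second budget matches the timeout policy used below:

```csharp
using System.Diagnostics;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.ChatCompletion;

// Placeholders: substitute a real PTU endpoint, key, and deployment name.
var kernel = Kernel.CreateBuilder()
    .AddAzureOpenAIChatCompletion("gpt-4o-mini", "https://<resource>.openai.azure.com/", "<api-key>")
    .Build();
var chat = kernel.GetRequiredService<IChatCompletionService>();

var hist = new ChatHistory();
hist.AddUserMessage("Reply with the single word: pong"); // well under 100 tokens

using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(3));
var sw = Stopwatch.StartNew();
try
{
    var reply = await chat.GetChatMessageContentAsync(hist, kernel: kernel, cancellationToken: cts.Token);
    Console.WriteLine($"{sw.ElapsedMilliseconds} ms: {reply.Content}");
}
catch (OperationCanceledException)
{
    // Intermittently lands here even though the request is tiny.
    Console.WriteLine($"Cancelled after {sw.ElapsedMilliseconds} ms");
}
```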
**Expected behavior**
Consistently low-latency completions from a provisioned GPT-4o Mini deployment.
**Platform**
- Language: C#
- Source: NuGet package `Microsoft.SemanticKernel`, latest version
- AI model: Azure OpenAI PTU (`gpt-4o-mini`)
- IDE: Rider
- OS: Windows 11 / Linux (reproduced on both)
**Snippets**
```csharp
// Retry up to three times, immediately (no backoff), on transient HTTP
// errors or Polly timeout rejections.
static IAsyncPolicy<HttpResponseMessage> GetRetryPolicy() =>
    HttpPolicyExtensions
        .HandleTransientHttpError()
        .Or<TimeoutRejectedException>()
        .WaitAndRetryAsync(3, _ => TimeSpan.Zero);

// Optimistic 3-second timeout applied around each HTTP attempt.
static IAsyncPolicy<HttpResponseMessage> GetTimeoutPolicy() =>
    Policy.TimeoutAsync<HttpResponseMessage>(TimeSpan.FromSeconds(3));
public static IServiceCollection AddApplicationServices(this IServiceCollection services)
{
    services.AddHttpClient();
    services.AddSingleton<OpenAiRateLimitHandler>();
    services.AddHttpClient(nameof(OpenAiClient))
        .ConfigurePrimaryHttpMessageHandler(() => new SocketsHttpHandler
        {
            PooledConnectionLifetime = TimeSpan.FromMinutes(1),
            ConnectTimeout = TimeSpan.FromSeconds(2),
            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
            MaxConnectionsPerServer = 50
        })
        // Handlers added first are outermost: the retry policy wraps the
        // timeout policy, so the 3-second timeout applies to each attempt.
        .AddPolicyHandler(GetRetryPolicy())
        .AddPolicyHandler(GetTimeoutPolicy())
        .AddHttpMessageHandler<OpenAiRateLimitHandler>();

    return services;
}
```

```csharp
public sealed class OpenAiClient : IOpenAiClient
{
    private readonly Kernel _kernel;
    private readonly ILogger<OpenAiClient> _logger;
    private readonly List<IPlugin> _plugins;

    private static readonly JsonSerializerOptions JsonOpts = new()
    {
        PropertyNameCaseInsensitive = true,
        Encoder = System.Text.Encodings.Web.JavaScriptEncoder.UnsafeRelaxedJsonEscaping
    };

    // Structured-output settings: deterministic, JSON-schema response, no auto tool calls.
    private static OpenAIPromptExecutionSettings Exec(string name, BinaryData schema) => new()
    {
        Temperature = 0f,
        ResponseFormat = ChatResponseFormat.CreateJsonSchemaFormat(name, schema, jsonSchemaIsStrict: false),
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    private static readonly OpenAIPromptExecutionSettings NonJsonExec = new()
    {
        Temperature = 0f,
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    // Settings for the routing pass: the model describes its desired tool calls as JSON.
    private static readonly OpenAIPromptExecutionSettings ManualExec = new()
    {
        Temperature = 0f,
        ResponseFormat = ChatResponseFormat.CreateJsonSchemaFormat("Tool", BinaryData.FromString(UnhydratedToolCall.JsonSchema), jsonSchemaIsStrict: false),
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    public OpenAiClient(IEnumerable<IPlugin> plugins,
        IOptions<OpenAISettings> cfg,
        ILogger<OpenAiClient> logger,
        IHttpClientFactory httpFactory)
    {
        _logger = logger;
        _kernel = BuildKernel(cfg.Value, httpFactory.CreateClient(nameof(OpenAiClient)));
        _plugins = plugins.ToList();
    }
    public async Task<OpenAiResponse<string>> Generate(ChatHistory hist, CancellationToken ct)
    {
        var service = _kernel.GetRequiredService<IChatCompletionService>();
        var response = await service.GetChatMessageContentAsync(hist, NonJsonExec, _kernel, ct);
        var tokensUsed = GetTokensUsed(response);
        var content = response.Content;
        _logger.LogInformation("Chat usage: {Tokens}", tokensUsed);
        return new OpenAiResponse<string>(content, tokensUsed, []);
    }
    public async Task<OpenAiResponse<TOut>> Generate<TOut>(string convoId, ChatHistory hist, BinaryData schema, PluginInfo pluginInfo, CancellationToken ct)
    {
        // -----------------------------------------------------------------
        // ① Prepare kernel and register only requested plugins
        // -----------------------------------------------------------------
        var kernel = _kernel.Clone();
        kernel.Data["conversationId"] = convoId;
        var plugins = pluginInfo.Plugins.Select(x => _plugins.FirstOrDefault(p => p.Name == x))
            .Where(x => x is not null)
            .Cast<IPlugin>()
            .ToList();
        var pluginKernel = kernel.Clone();
        foreach (var p in plugins)
        {
            pluginKernel.Plugins.AddFromObject(p, p.GetType().Name);
        }

        // -----------------------------------------------------------------
        // ② ROUTING PASS (autoInvoke:false)
        //    Ask the LLM which of those plugins it actually wants to call
        // -----------------------------------------------------------------
        var systemPrompt = PluginRouter.CreateSystemPrompt(plugins);
        var routerHist = new ChatHistory();
        routerHist.AddSystemMessage(systemPrompt);
        routerHist.AddUserMessage(pluginInfo.Prompt);
        var svc = kernel.GetRequiredService<IChatCompletionService>();
        var routingMsg = await svc.GetChatMessageContentAsync(routerHist, ManualExec, kernel, ct);
        var callsToMake = JsonSerializer.Deserialize<UnhydratedToolCall>(routingMsg.Content.ExtractJson(), JsonOpts)?.Hydrated.ToList() ?? [];
        var callContentList = callsToMake.Select(x => new FunctionCallContent(x.FunctionName, x.PluginName, x.Id)).ToList();
        var callResultList = new List<ChatMessageContent>();
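        // -----------------------------------------------------------------
        // ③ Execute the requested tool calls locally
        // -----------------------------------------------------------------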
        foreach (var call in callsToMake)
        {
            var matchingFn = pluginKernel.Plugins.TryGetFunction(call.PluginName, call.FunctionName, out var fn) ? fn : null;
            if (matchingFn is null)
            {
                _logger.LogWarning("No matching function found for call: {Call}", call);
                callContentList.RemoveAll(x => x.Id == call.Id);
                continue;
            }

            FunctionResult result;
            try
            {
                result = await matchingFn.InvokeAsync(kernel, new KernelArguments(call.Arguments), ct);
            }
            catch (Exception ex)
            {
                // Surface the failure to the model instead of aborting the whole turn.
                result = new FunctionResult(matchingFn, value: $"Function call failed: {ex.Message}");
            }

            var resultContent = new FunctionResultContent(call.FunctionName, call.PluginName, call.Id, result.GetValue<object?>());
            callResultList.Add(new ChatMessageContent(AuthorRole.Tool, [resultContent])
            {
                Content = JsonSerializer.Serialize(result.GetValue<object>(), JsonOpts),
            });
        }
        if (callsToMake.Count != 0)
        {
            var callContent = new ChatMessageContent
            {
                Items = [..callContentList],
                Metadata = new Dictionary<string, object?>
                {
                    { "tool_calls", callContentList },
                },
                Role = AuthorRole.Assistant,
                Content = string.Empty
            };
            hist.Add(callContent);
            hist.AddRange(callResultList);
        }
        var routerTokensUsed = GetTokensUsed(routingMsg);
        _logger.LogInformation("Plugin usage: {Usage}", routerTokensUsed);

        // -----------------------------------------------------------------
        // ④ MAIN PASS (standard Exec settings)
        // -----------------------------------------------------------------
        var response = await svc.GetChatMessageContentAsync(hist, Exec(typeof(TOut).Name, schema), kernel, ct);
        var json = response.Content.ExtractJson();
        var tokensUsed = GetTokensUsed(response);
        var toolCalls = hist.Where(x => x.Role == AuthorRole.Tool).Select(x => x.Content ?? string.Empty).Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
        _logger.LogInformation("Chat usage: {Usage}", tokensUsed);
        return string.IsNullOrWhiteSpace(json)
            ? new OpenAiResponse<TOut>(default, tokensUsed, [])
            : new OpenAiResponse<TOut>(JsonSerializer.Deserialize<TOut>(json, JsonOpts), tokensUsed, toolCalls);
    }
    private static Kernel BuildKernel(OpenAISettings cfg, HttpClient http)
    {
        var b = Kernel.CreateBuilder();
        b.Services.AddLogging(l => l.SetMinimumLevel(LogLevel.Information));
        // Fall back to managed identity when no API key is configured.
        if (string.IsNullOrWhiteSpace(cfg.Key))
        {
            b.AddAzureOpenAIChatCompletion(cfg.ChatDeploymentId, cfg.Endpoint, new DefaultAzureCredential(), httpClient: http);
        }
        else
        {
            b.AddAzureOpenAIChatCompletion(cfg.ChatDeploymentId, cfg.Endpoint, cfg.Key, httpClient: http);
        }
        return b.Build();
    }
    private static int GetTokensUsed(ChatMessageContent content)
    {
        if (content.Metadata is not { } meta)
        {
            return 0;
        }
        if (meta.TryGetValue("Usage", out var usage) && usage is ChatTokenUsage tokenUsage)
        {
            return tokenUsage.TotalTokenCount;
        }
        return 0;
    }
}
```