Describe the bug
Calls to Azure OpenAI using a Provisioned Throughput Unit (PTU) deployment of the `gpt-4o-mini` model intermittently hang for 10+ seconds before failing with a timeout. These are low-token requests (<100 total tokens) and the hangs do not reproduce consistently, but the latency spikes are severe enough to trigger client-side timeouts and significantly degrade reliability.
The issue persists despite:
- Using `SocketsHttpHandler` with `PooledConnectionLifetime` = 1 minute
- Applying Polly retry and timeout strategies
- Rebuilding `SemanticKernel` and `HttpClient` per request
- Using dedicated PTU resources (no quota errors)
To Reproduce
Steps to reproduce the behavior:
- Deploy Azure OpenAI with a provisioned GPT-4o Mini deployment (`gpt-4o-mini`)
- Use Semantic Kernel to call `GetChatMessageContentAsync(...)` on low-token prompts
- Wrap the call in a timeout of 3–5 seconds (see the sketch after this list)
- Observe intermittent timeouts or long-running requests (>10s)
- Logs show no content returned and cancelled sockets (`OperationCanceledException`, sometimes `SocketException`)
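For reference, the timeout wrapper from the third step looks roughly like this (a minimal sketch with illustrative names; `CompleteWithTimeoutAsync` is not part of the actual codebase):

```csharp
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.ChatCompletion;

// Sketch: give a single Semantic Kernel chat call a 3-second budget.
static async Task<ChatMessageContent?> CompleteWithTimeoutAsync(
    Kernel kernel, ChatHistory history, CancellationToken ct)
{
    // Link the caller's token to a per-call 3-second deadline.
    using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    cts.CancelAfter(TimeSpan.FromSeconds(3));

    var service = kernel.GetRequiredService<IChatCompletionService>();
    try
    {
        return await service.GetChatMessageContentAsync(history, kernel: kernel, cancellationToken: cts.Token);
    }
    catch (OperationCanceledException) when (!ct.IsCancellationRequested)
    {
        // Only the per-call deadline fired (not the caller): this is the intermittent hang.
        return null;
    }
}
```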
Expected behavior
Consistently low-latency completions from a provisioned GPT-4o Mini deployment.
Platform
- Language: C#
- Source: NuGet package `Microsoft.SemanticKernel` (latest version)
- AI model: Azure OpenAI PTU, `gpt-4o-mini`
- IDE: Rider
- OS: Windows 11 / Linux (reproduced on both)
Snippets
```csharp
// Retry up to three times with no delay on transient HTTP errors
// or on timeouts rejected by the inner timeout policy.
static IAsyncPolicy<HttpResponseMessage> GetRetryPolicy() =>
    HttpPolicyExtensions
        .HandleTransientHttpError()
        .Or<TimeoutRejectedException>()
        .WaitAndRetryAsync(3, _ => TimeSpan.Zero);

// Cancel any single attempt that runs longer than 3 seconds.
static IAsyncPolicy<HttpResponseMessage> GetTimeoutPolicy() =>
    Policy.TimeoutAsync<HttpResponseMessage>(TimeSpan.FromSeconds(3));
```
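One note on how these compose, as I understand `Microsoft.Extensions.Http.Polly`: the first policy registered with `AddPolicyHandler` is the outermost, so the retry wraps the timeout and each of the three attempts gets its own 3-second window. A standalone equivalent, for illustration only:

```csharp
// Illustrative equivalent of the registration order below:
// retry outermost, so the 3-second timeout applies per attempt.
IAsyncPolicy<HttpResponseMessage> combined =
    Policy.WrapAsync(GetRetryPolicy(), GetTimeoutPolicy());
```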
```csharp
public static IServiceCollection AddApplicationServices(this IServiceCollection services)
{
    services.AddHttpClient();
    services.AddSingleton<OpenAiRateLimitHandler>();
    services.AddHttpClient(nameof(OpenAiClient))
        .ConfigurePrimaryHttpMessageHandler(() => new SocketsHttpHandler
        {
            PooledConnectionLifetime = TimeSpan.FromMinutes(1),
            ConnectTimeout = TimeSpan.FromSeconds(2),
            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
            MaxConnectionsPerServer = 50
        })
        .AddPolicyHandler(GetRetryPolicy())
        .AddPolicyHandler(GetTimeoutPolicy())
        .AddHttpMessageHandler<OpenAiRateLimitHandler>();

    return services;
}
```
```csharp
public sealed class OpenAiClient : IOpenAiClient
{
    private readonly Kernel _kernel;
    private readonly ILogger<OpenAiClient> _logger;
    private readonly List<IPlugin> _plugins;

    private static readonly JsonSerializerOptions JsonOpts = new()
    {
        PropertyNameCaseInsensitive = true,
        Encoder = System.Text.Encodings.Web.JavaScriptEncoder.UnsafeRelaxedJsonEscaping
    };

    private static OpenAIPromptExecutionSettings Exec(string name, BinaryData schema) => new()
    {
        Temperature = 0f,
        ResponseFormat = ChatResponseFormat.CreateJsonSchemaFormat(name, schema, jsonSchemaIsStrict: false),
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    private static readonly OpenAIPromptExecutionSettings NonJsonExec = new()
    {
        Temperature = 0f,
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    private static readonly OpenAIPromptExecutionSettings ManualExec = new()
    {
        Temperature = 0f,
        ResponseFormat = ChatResponseFormat.CreateJsonSchemaFormat("Tool", BinaryData.FromString(UnhydratedToolCall.JsonSchema), jsonSchemaIsStrict: false),
        FunctionChoiceBehavior = FunctionChoiceBehavior.None([]),
    };

    public OpenAiClient(IEnumerable<IPlugin> plugins,
        IOptions<OpenAISettings> cfg,
        ILogger<OpenAiClient> logger,
        IHttpClientFactory httpFactory)
    {
        _logger = logger;
        _kernel = BuildKernel(cfg.Value, httpFactory.CreateClient(nameof(OpenAiClient)));
        _plugins = plugins.ToList();
    }
    public async Task<OpenAiResponse<string>> Generate(ChatHistory hist, CancellationToken ct)
    {
        var service = _kernel.GetRequiredService<IChatCompletionService>();
        var response = await service.GetChatMessageContentAsync(hist, NonJsonExec, _kernel, ct);
        var tokensUsed = GetTokensUsed(response);
        var content = response.Content;
        _logger.LogInformation("Chat usage: {Tokens}", tokensUsed);
        return new OpenAiResponse<string>(content, tokensUsed, []);
    }
    public async Task<OpenAiResponse<TOut>> Generate<TOut>(string convoId, ChatHistory hist, BinaryData schema, PluginInfo pluginInfo, CancellationToken ct)
    {
        // -----------------------------------------------------------------
        // ① Prepare kernel and register only requested plugins
        // -----------------------------------------------------------------
        var kernel = _kernel.Clone();
        kernel.Data["conversationId"] = convoId;
        var plugins = pluginInfo.Plugins.Select(x => _plugins.FirstOrDefault(p => p.Name == x))
            .Where(x => x is not null)
            .Cast<IPlugin>()
            .ToList();
        var pluginKernel = kernel.Clone();
        foreach (var p in plugins)
        {
            pluginKernel.Plugins.AddFromObject(p, p.GetType().Name);
        }

        // -----------------------------------------------------------------
        // ② ROUTING PASS (autoInvoke:false)
        //    Ask the LLM which of those plugins it actually wants to call
        // -----------------------------------------------------------------
        var systemPrompt = PluginRouter.CreateSystemPrompt(plugins);
        var routerHist = new ChatHistory();
        routerHist.AddSystemMessage(systemPrompt);
        routerHist.AddUserMessage(pluginInfo.Prompt);
        var svc = kernel.GetRequiredService<IChatCompletionService>();
        var routingMsg = await svc.GetChatMessageContentAsync(routerHist, ManualExec, kernel, ct);
        var callsToMake = JsonSerializer.Deserialize<UnhydratedToolCall>(routingMsg.Content.ExtractJson(), JsonOpts)?.Hydrated.ToList() ?? [];
        var callContentList = callsToMake.Select(x => new FunctionCallContent(x.FunctionName, x.PluginName, x.Id)).ToList();
        var callResultList = new List<ChatMessageContent>();

        // -----------------------------------------------------------------
        // ③ Invoke the requested functions locally and collect the results
        // -----------------------------------------------------------------
        foreach (var call in callsToMake)
        {
            var matchingFn = pluginKernel.Plugins.TryGetFunction(call.PluginName, call.FunctionName, out var fn) ? fn : null;
            if (matchingFn is null)
            {
                _logger.LogWarning("No matching function found for call: {Call}", call);
                callContentList.RemoveAll(x => x.Id == call.Id);
                continue;
            }

            FunctionResult result;
            try
            {
                result = await matchingFn.InvokeAsync(kernel, new KernelArguments(call.Arguments), ct);
            }
            catch (Exception ex)
            {
                result = new FunctionResult(matchingFn, value: $"Function call failed: {ex.Message}");
            }

            var resultContent = new FunctionResultContent(call.FunctionName, call.PluginName, call.Id, result.GetValue<object?>());
            callResultList.Add(new ChatMessageContent(AuthorRole.Tool, [resultContent])
            {
                Content = JsonSerializer.Serialize(result.GetValue<object>(), JsonOpts),
            });
        }

        if (callsToMake.Count != 0)
        {
            var callContent = new ChatMessageContent
            {
                Items = [..callContentList],
                Metadata = new Dictionary<string, object?>
                {
                    { "tool_calls", callContentList },
                },
                Role = AuthorRole.Assistant,
                Content = string.Empty
            };
            hist.Add(callContent);
            hist.AddRange(callResultList);
        }

        var routerTokensUsed = GetTokensUsed(routingMsg);
        _logger.LogInformation("Plugin usage: {Usage}", routerTokensUsed);

        // -----------------------------------------------------------------
        // ④ MAIN PASS (standard Exec settings)
        // -----------------------------------------------------------------
        var response = await svc.GetChatMessageContentAsync(hist, Exec(typeof(TOut).Name, schema), kernel, ct);
        var json = response.Content.ExtractJson();
        var tokensUsed = GetTokensUsed(response);
        var toolCalls = hist.Where(x => x.Role == AuthorRole.Tool).Select(x => x.Content ?? string.Empty).Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
        _logger.LogInformation("Chat usage: {Usage}", tokensUsed);
        return string.IsNullOrWhiteSpace(json)
            ? new OpenAiResponse<TOut>(default, tokensUsed, [])
            : new OpenAiResponse<TOut>(JsonSerializer.Deserialize<TOut>(json, JsonOpts), tokensUsed, toolCalls);
    }
    private static Kernel BuildKernel(OpenAISettings cfg, HttpClient http)
    {
        var b = Kernel.CreateBuilder();
        b.Services.AddLogging(l => l.SetMinimumLevel(LogLevel.Information));
        if (string.IsNullOrWhiteSpace(cfg.Key))
        {
            b.AddAzureOpenAIChatCompletion(cfg.ChatDeploymentId, cfg.Endpoint, new DefaultAzureCredential(), httpClient: http);
        }
        else
        {
            b.AddAzureOpenAIChatCompletion(cfg.ChatDeploymentId, cfg.Endpoint, cfg.Key, httpClient: http);
        }
        return b.Build();
    }

    private static int GetTokensUsed(ChatMessageContent content)
    {
        if (content.Metadata is not { } meta)
        {
            return 0;
        }
        if (meta.TryGetValue("Usage", out var usage) && usage is ChatTokenUsage tokenUsage)
        {
            return tokenUsage.TotalTokenCount;
        }
        return 0;
    }
}
```
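If it helps with triage: a minimal way to tell apart the two failure modes seen in the logs (`OperationCanceledException` vs `SocketException`) is instrumentation roughly like this. It is a sketch only, reusing the fields of `OpenAiClient` above, not verbatim code from the repro:

```csharp
// Sketch: time each call and separate a client-side cancellation
// from a socket-level failure when attributing the >10 s hangs.
var sw = System.Diagnostics.Stopwatch.StartNew();
try
{
    var response = await service.GetChatMessageContentAsync(hist, NonJsonExec, _kernel, ct);
    _logger.LogInformation("Completed in {Elapsed} ms", sw.ElapsedMilliseconds);
}
catch (OperationCanceledException)
{
    _logger.LogWarning("Cancelled after {Elapsed} ms (client-side timeout)", sw.ElapsedMilliseconds);
    throw;
}
catch (HttpRequestException ex) when (ex.InnerException is System.Net.Sockets.SocketException sock)
{
    _logger.LogWarning("Socket failure after {Elapsed} ms: {SocketError}", sw.ElapsedMilliseconds, sock.SocketErrorCode);
    throw;
}
```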