Skip to content

Commit 5dd5a9a

Browse files
authored
.Net: Introduce support for response modalities and audio options in AzureClientCore (#12523)
### Motivation and Context This change extends `AzureClientCore` to handle **response modalities** and **audio options** dynamically based on user-provided `executionSettings`. **Why this is needed:** Currently, the OpenAI connector supports audio modalities through the `Modalities` and `Audio` properties in `OpenAIPromptExecutionSettings`, but the Azure OpenAI connector doesn't fully implement this functionality. The code for handling audio exists in `AzureClientCore.cs` but isn't included in `AzureClientCore.ChatCompletion.cs`. ### Description * Introduced `GetResponseModalities` and `GetAudioOptions` helper methods. These follow the same logic as the equivalent methods in the **OpenAI** connector to ensure consistent behaviour and reduce duplication across both clients. * Updated `CreateChatCompletionOptions` to: * Parse and apply `ResponseModalities` if specified in `executionSettings`. * Parse and apply `AudioOptions` if specified in `executionSettings`. * Ensured backward compatibility: defaults remain unchanged if these options are not provided. ### Contribution Checklist * [x] The code builds cleanly without any errors or warnings. * [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the pre-submission formatting script raises no violations. * [x] All unit tests pass; I didn't create new tests, following the same approach as the OpenAI connector. * [x] Verified that no existing functionality is broken. 😊
1 parent 16d708e commit 5dd5a9a

File tree

2 files changed

+480
-4
lines changed

2 files changed

+480
-4
lines changed

dotnet/src/Connectors/Connectors.AzureOpenAI.UnitTests/Services/AzureOpenAIChatCompletionServiceTests.cs

Lines changed: 337 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1809,6 +1809,343 @@ public async Task GetStreamingChatMessageContentsWithFunctionCallAndEmptyArgumen
18091809
Assert.Equal(1, functionCallCount);
18101810
}
18111811

1812+
// Four-byte stand-in for audio data, shared by the audio request/response tests.
private static readonly byte[] s_sampleAudioBytes = [0x01, 0x02, 0x03, 0x04];
1814+
1815+
[Fact]
public async Task ItSendsAudioContentCorrectlyAsync()
{
    // Arrange: a canned completion is queued so that the outgoing request can be captured.
    var sut = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);

    using var cannedResponse = new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StringContent(AzureOpenAITestHelper.GetTestResponse("chat_completion_test_response.json"))
    };
    this._messageHandlerStub.ResponsesToReturn.Add(cannedResponse);

    var history = new ChatHistory();
    history.AddUserMessage([
        new TextContent("What's in this audio?"),
        new AudioContent(s_sampleAudioBytes, "audio/mp3")
    ]);

    // Act
    await sut.GetChatMessageContentsAsync(history);

    // Assert: the request carries exactly one message made of a text part and an audio part.
    var requestBody = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
    Assert.NotNull(requestBody);
    JsonElement requestJson = JsonSerializer.Deserialize<JsonElement>(requestBody);

    JsonElement messages = requestJson.GetProperty("messages");
    Assert.Equal(1, messages.GetArrayLength());

    JsonElement parts = messages[0].GetProperty("content");
    Assert.Equal(2, parts.GetArrayLength());

    // Text part
    JsonElement textPart = parts[0];
    Assert.Equal("text", textPart.GetProperty("type").GetString());
    Assert.Equal("What's in this audio?", textPart.GetProperty("text").GetString());

    // Audio part: an "input_audio" object holding the base64 payload and its format
    JsonElement audioPart = parts[1];
    Assert.Equal("input_audio", audioPart.GetProperty("type").GetString());

    bool hasInputAudio = audioPart.TryGetProperty("input_audio", out JsonElement inputAudio);
    Assert.True(hasInputAudio);
    Assert.Equal(JsonValueKind.Object, inputAudio.ValueKind);

    bool hasData = inputAudio.TryGetProperty("data", out JsonElement dataElement);
    Assert.True(hasData);
    string? encodedAudio = dataElement.GetString();

    bool hasFormat = inputAudio.TryGetProperty("format", out JsonElement formatElement);
    Assert.True(hasFormat);
    Assert.Equal("mp3", formatElement.GetString());

    Assert.NotNull(encodedAudio);
    Assert.Equal(Convert.ToBase64String(s_sampleAudioBytes), encodedAudio);
}
1863+
1864+
[Fact]
public async Task ItHandlesAudioContentInResponseAsync()
{
    // Arrange: the stubbed completion carries both a text message and a base64 audio payload.
    var sut = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);

    var completionWithAudio = """
        {
            "model": "gpt-4o",
            "choices": [
                {
                    "message": {
                        "role": "assistant",
                        "content": "This is the text response.",
                        "audio": {
                            "data": "AQIDBA=="
                        }
                    },
                    "finish_reason": "stop"
                }
            ],
            "usage": {
                "prompt_tokens": 10,
                "completion_tokens": 20,
                "total_tokens": 30
            }
        }
        """;

    using var cannedResponse = new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StringContent(completionWithAudio)
    };
    this._messageHandlerStub.ResponsesToReturn.Add(cannedResponse);

    var executionSettings = new AzureOpenAIPromptExecutionSettings
    {
        Modalities = ChatResponseModalities.Text | ChatResponseModalities.Audio,
        Audio = new ChatAudioOptions(ChatOutputAudioVoice.Alloy, ChatOutputAudioFormat.Mp3)
    };

    // Act
    var reply = await sut.GetChatMessageContentAsync(new ChatHistory("test"), executionSettings);

    // Assert: one text item followed by one audio item.
    Assert.NotNull(reply);
    Assert.Equal("This is the text response.", reply.Content);
    Assert.Equal(2, reply.Items.Count);

    var textItem = Assert.IsAssignableFrom<TextContent>(reply.Items[0]);
    Assert.Equal("This is the text response.", textItem.Text);

    var audioItem = Assert.IsAssignableFrom<AudioContent>(reply.Items[1]);
    Assert.NotNull(audioItem.Data);

    // "AQIDBA==" is the base64 form of the four sample bytes, so the decoded payload must match them exactly.
    Assert.Equal(s_sampleAudioBytes, audioItem.Data.Value.ToArray());
}
1927+
1928+
[Fact]
public async Task ItHandlesAudioContentWithMetadataInResponseAsync()
{
    // Arrange: the stubbed completion's audio object also carries id, transcript and expiry metadata.
    var sut = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);

    var completionWithAudioMetadata = """
        {
            "model": "gpt-4o",
            "choices": [
                {
                    "message": {
                        "role": "assistant",
                        "content": "This is the text response.",
                        "audio": {
                            "id": "audio-123456",
                            "data": "AQIDBA==",
                            "transcript": "This is the audio transcript.",
                            "expires_at": 1698765432
                        }
                    },
                    "finish_reason": "stop"
                }
            ],
            "usage": {
                "prompt_tokens": 10,
                "completion_tokens": 20,
                "total_tokens": 30
            }
        }
        """;

    using var cannedResponse = new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StringContent(completionWithAudioMetadata)
    };
    this._messageHandlerStub.ResponsesToReturn.Add(cannedResponse);

    var executionSettings = new AzureOpenAIPromptExecutionSettings
    {
        Modalities = ChatResponseModalities.Text | ChatResponseModalities.Audio,
        Audio = new ChatAudioOptions(ChatOutputAudioVoice.Alloy, ChatOutputAudioFormat.Mp3)
    };

    // Act
    var reply = await sut.GetChatMessageContentAsync(new ChatHistory("test"), executionSettings);

    // Assert: one text item followed by one audio item.
    Assert.NotNull(reply);
    Assert.Equal("This is the text response.", reply.Content);
    Assert.Equal(2, reply.Items.Count);

    var textItem = Assert.IsAssignableFrom<TextContent>(reply.Items[0]);
    Assert.Equal("This is the text response.", textItem.Text);

    var audioItem = Assert.IsAssignableFrom<AudioContent>(reply.Items[1]);
    Assert.NotNull(audioItem.Data);

    // "AQIDBA==" is the base64 form of the four sample bytes, so the decoded payload must match them exactly.
    Assert.Equal(s_sampleAudioBytes, audioItem.Data.Value.ToArray());

    // Audio metadata taken from the response's "audio" object.
    Assert.NotNull(audioItem.Metadata);
    Assert.Equal("audio-123456", audioItem.Metadata["Id"]);
    Assert.Equal("This is the audio transcript.", audioItem.Metadata["Transcript"]);

    // ExpiresAt is surfaced as a converted date/time value rather than the raw Unix timestamp,
    // so only its presence is checked here.
    Assert.NotNull(audioItem.Metadata["ExpiresAt"]);
}
2001+
2002+
[Theory]
[MemberData(nameof(ResponseModalitiesData))]
public async Task ItCreatesCorrectResponseModalitiesAsync(object responseModalities, string expectedJson)
{
    // Arrange: only the outgoing request matters, so any valid completion payload will do.
    var sut = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);

    using var cannedResponse = new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StringContent(AzureOpenAITestHelper.GetTestResponse("chat_completion_test_response.json"))
    };
    this._messageHandlerStub.ResponsesToReturn.Add(cannedResponse);

    var executionSettings = new AzureOpenAIPromptExecutionSettings { Modalities = responseModalities };

    // Act
    await sut.GetChatMessageContentsAsync(new ChatHistory("test"), executionSettings);

    // Assert: the captured request body exposes the modalities exactly as expected.
    var requestBody = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
    Assert.NotNull(requestBody);
    JsonElement requestJson = JsonSerializer.Deserialize<JsonElement>(requestBody);

    bool hasModalities = requestJson.TryGetProperty("modalities", out JsonElement modalities);
    Assert.True(hasModalities);
    Assert.Equal(expectedJson, modalities.GetRawText());
}
2030+
2031+
/// <summary>
/// Verifies that a streaming chat completion request serializes the <c>Modalities</c>
/// execution setting into the expected <c>modalities</c> JSON property.
/// </summary>
/// <param name="responseModalities">The value assigned to the <c>Modalities</c> execution setting.</param>
/// <param name="expectedJson">The raw JSON expected under the request's <c>modalities</c> property.</param>
[Theory]
[MemberData(nameof(ResponseModalitiesData))]
public async Task ItCreatesCorrectResponseModalitiesStreamingAsync(object responseModalities, string expectedJson)
{
    // Arrange
    var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
    using var stream = new MemoryStream(Encoding.UTF8.GetBytes(AzureOpenAITestHelper.GetTestResponse("chat_completion_streaming_test_response.txt")));

    using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StreamContent(stream)
    };
    this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);

    var settings = new AzureOpenAIPromptExecutionSettings
    {
        Modalities = responseModalities
    };

    // Act
    // The enumerator is IAsyncDisposable; `await using` guarantees it is released even when an
    // assertion below fails, and (being declared last) before the response and stream are disposed.
    await using var enumerator = service
        .GetStreamingChatMessageContentsAsync(new ChatHistory("test"), settings)
        .GetAsyncEnumerator();

    // Advancing once is enough to get the request sent and captured by the handler stub.
    await enumerator.MoveNextAsync();

    // Assert
    var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
    Assert.NotNull(actualRequestContent);
    var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
    Assert.True(optionsJson.TryGetProperty("modalities", out var property));
    Assert.Equal(expectedJson, property.GetRawText());
}
2061+
2062+
[Theory]
[MemberData(nameof(AudioOptionsData))]
public async Task ItCreatesCorrectAudioOptionsAsync(object audioOptions, string expectedJson)
{
    // Arrange: only the outgoing request matters, so any valid completion payload will do.
    var sut = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);

    using var cannedResponse = new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StringContent(AzureOpenAITestHelper.GetTestResponse("chat_completion_test_response.json"))
    };
    this._messageHandlerStub.ResponsesToReturn.Add(cannedResponse);

    var executionSettings = new AzureOpenAIPromptExecutionSettings { Audio = audioOptions };

    // Act
    await sut.GetChatMessageContentsAsync(new ChatHistory("test"), executionSettings);

    // Assert: the captured request body exposes an "audio" object matching the expectation.
    var requestBody = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
    Assert.NotNull(requestBody);
    JsonElement requestJson = JsonSerializer.Deserialize<JsonElement>(requestBody);

    bool hasAudio = requestJson.TryGetProperty("audio", out JsonElement audio);
    Assert.True(hasAudio);
    Assert.Equal(JsonValueKind.Object, audio.ValueKind);
    Assert.Equal(expectedJson, audio.GetRawText());
}
2091+
2092+
/// <summary>
/// Verifies that a streaming chat completion request serializes the <c>Audio</c>
/// execution setting into the expected <c>audio</c> JSON object.
/// </summary>
/// <param name="audioOptions">The value assigned to the <c>Audio</c> execution setting.</param>
/// <param name="expectedJson">The raw JSON expected under the request's <c>audio</c> property.</param>
[Theory]
[MemberData(nameof(AudioOptionsData))]
public async Task ItCreatesCorrectAudioOptionsStreamingAsync(object audioOptions, string expectedJson)
{
    // Arrange
    var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
    using var stream = new MemoryStream(Encoding.UTF8.GetBytes(AzureOpenAITestHelper.GetTestResponse("chat_completion_streaming_test_response.txt")));

    using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StreamContent(stream)
    };
    this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);

    var settings = new AzureOpenAIPromptExecutionSettings
    {
        Audio = audioOptions
    };

    // Act
    // The enumerator is IAsyncDisposable; `await using` guarantees it is released even when an
    // assertion below fails, and (being declared last) before the response and stream are disposed.
    await using var enumerator = service
        .GetStreamingChatMessageContentsAsync(new ChatHistory("test"), settings)
        .GetAsyncEnumerator();

    // Advancing once is enough to get the request sent and captured by the handler stub.
    await enumerator.MoveNextAsync();

    // Assert
    var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
    Assert.NotNull(actualRequestContent);
    var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
    Assert.True(optionsJson.TryGetProperty("audio", out var property));
    Assert.Equal(JsonValueKind.Object, property.ValueKind);
    Assert.Equal(expectedJson, property.GetRawText());
}
2123+
2124+
// Theory rows for the response-modalities tests: each pairs one accepted form of the
// Modalities setting with the raw JSON expected under the request's "modalities" property.
public static TheoryData<object, string> ResponseModalitiesData
{
    get
    {
        const string TextOnly = "[\"text\"]";
        const string AudioOnly = "[\"audio\"]";
        const string TextAndAudio = "[\"text\",\"audio\"]";

        var rows = new TheoryData<object, string>();

        // Flags enum values
        rows.Add(ChatResponseModalities.Text, TextOnly);
        rows.Add(ChatResponseModalities.Audio, AudioOnly);
        rows.Add(ChatResponseModalities.Text | ChatResponseModalities.Audio, TextAndAudio);

        // String arrays
        rows.Add(new[] { "text" }, TextOnly);
        rows.Add(new[] { "audio" }, AudioOnly);
        rows.Add(new[] { "text", "audio" }, TextAndAudio);

        // Plain strings
        rows.Add("Text", TextOnly);
        rows.Add("Audio", AudioOnly);

        // JSON elements (single string and array forms)
        rows.Add(JsonSerializer.Deserialize<JsonElement>("\"text\""), TextOnly);
        rows.Add(JsonSerializer.Deserialize<JsonElement>("\"audio\""), AudioOnly);
        rows.Add(JsonSerializer.Deserialize<JsonElement>("[\"text\", \"audio\"]"), TextAndAudio);

        return rows;
    }
}
2140+
2141+
// Theory rows for the audio-options tests: each pairs one accepted form of the
// Audio setting with the raw JSON expected under the request's "audio" property.
public static TheoryData<object, string> AudioOptionsData
{
    get
    {
        const string AlloyMp3 = "{\"voice\":\"alloy\",\"format\":\"mp3\"}";
        const string EchoOpus = "{\"voice\":\"echo\",\"format\":\"opus\"}";

        var rows = new TheoryData<object, string>();

        // Strongly typed options
        rows.Add(new ChatAudioOptions(ChatOutputAudioVoice.Alloy, ChatOutputAudioFormat.Mp3), AlloyMp3);
        rows.Add(new ChatAudioOptions(ChatOutputAudioVoice.Echo, ChatOutputAudioFormat.Opus), EchoOpus);

        // JSON element
        rows.Add(JsonSerializer.Deserialize<JsonElement>("{\"voice\":\"alloy\",\"format\":\"mp3\"}"), AlloyMp3);

        // JSON text supplied as a plain string
        rows.Add("{\"voice\":\"echo\",\"format\":\"opus\"}", EchoOpus);

        return rows;
    }
}
2148+
18122149
public static TheoryData<string?, string?> Versions => new()
18132150
{
18142151
{ "V2025_03_01_preview", "2025-03-01-preview" },

0 commit comments

Comments
 (0)