In [1]:
import httpx
import openai
import asyncio
import nest_asyncio

nest_asyncio.apply()

In [3]:
client = openai.OpenAI(api_key="YOUR-OPENAI-API-KEY")

In [4]:
response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[{
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this image?"
            },
            {
                "type":
                "image_url",
                "image_url":
                "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
            },
        ],
    }],
    max_tokens=300,
)

In [7]:
response.choices[0].message.content

'This is an image of a wooden boardwalk extending through a lush green meadow or wetland area. The boardwalk is designed to allow people to walk through the area without disturbing the natural habitat. Tall green grasses flank both sides of the boardwalk, and a variety of shrubs and trees can be seen in the background. Above, there is a blue sky with some scattered clouds, suggesting a pleasant weather condition. This setting is peaceful and natural, likely a destination for leisurely walks or an opportunity to observe wildlife and nature.'

In [8]:
response

ChatCompletion(id='chatcmpl-8ItVy0wPYrINvQPwMSpvZAcCHbKfQ', choices=[Choice(finish_reason=None, index=0, message=ChatCompletionMessage(content='This is an image of a wooden boardwalk extending through a lush green meadow or wetland area. The boardwalk is designed to allow people to walk through the area without disturbing the natural habitat. Tall green grasses flank both sides of the boardwalk, and a variety of shrubs and trees can be seen in the background. Above, there is a blue sky with some scattered clouds, suggesting a pleasant weather condition. This setting is peaceful and natural, likely a destination for leisurely walks or an opportunity to observe wildlife and nature.', role='assistant', function_call=None, tool_calls=None), finish_details={'type': 'stop', 'stop': '<|fim_suffix|>'})], created=1699513994, model='gpt-4-1106-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=108, prompt_tokens=1118, total_tokens=1226))

In [4]:
async def chatCompletion(messages, model, api_key, **kwargs):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {"messages": messages, "model": model, **kwargs}
    async with httpx.AsyncClient(timeout=120) as request:
        response = await request.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload)
        print(response.text)
        response.raise_for_status()
        return response.json()

In [5]:
message = [{
    "role":
    "user",
    "content": [
        {
            "type":
            "text",
            "text":
            "What's in these images? Is there any difference between them?",
        },
        {
            "type":
            "image_url",
            "image_url":
            "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
        },
        {
            "type":
            "image_url",
            "image_url":
            "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
        },
    ],
}]
krgs = {"max_tokens": 512, "temperature": 0.2}
model = "gpt-4-vision-preview"
result = asyncio.run(
    chatCompletion(messages=message,
                   model=model,
                   api_key="YOUR-OPENAI-API-KEY",
                   **krgs))
result

{"id": "chatcmpl-8IyoVvNu38dgCZF00l5iyCBWfyZvu", "object": "chat.completion", "created": 1699534363, "model": "gpt-4-1106-vision-preview", "usage": {"prompt_tokens": 2230, "completion_tokens": 143, "total_tokens": 2373}, "choices": [{"message": {"role": "assistant", "content": "You've provided two images that appear to be identical at first glance. Both images depict a wooden boardwalk leading through a lush green field with tall grasses. The sky is partly cloudy with a mix of blue and white, suggesting a pleasant day. There are trees and shrubs in the distance, and the overall scene is one of natural beauty and tranquility.\n\nUpon closer inspection, since the images seem to be the same, any differences between them would likely be very subtle and could be related to slight variations in color, lighting, or compression artifacts. However, without any specific differences pointed out or without a higher resolution to examine fine details, it's challenging to identify any distinct diffe

{'id': 'chatcmpl-8IyoVvNu38dgCZF00l5iyCBWfyZvu',
 'object': 'chat.completion',
 'created': 1699534363,
 'model': 'gpt-4-1106-vision-preview',
 'usage': {'prompt_tokens': 2230,
  'completion_tokens': 143,
  'total_tokens': 2373},
 'choices': [{'message': {'role': 'assistant',
    'content': "You've provided two images that appear to be identical at first glance. Both images depict a wooden boardwalk leading through a lush green field with tall grasses. The sky is partly cloudy with a mix of blue and white, suggesting a pleasant day. There are trees and shrubs in the distance, and the overall scene is one of natural beauty and tranquility.\n\nUpon closer inspection, since the images seem to be the same, any differences between them would likely be very subtle and could be related to slight variations in color, lighting, or compression artifacts. However, without any specific differences pointed out or without a higher resolution to examine fine details, it's challenging to identify any d

In [2]:
import uuid


async def text2Speech(text: str, voice: str, api_key: str):
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {"model": "tts-1", "input": text, "voice": voice}
    async with httpx.AsyncClient(timeout=600) as request:
        response = await request.post(
            "https://api.openai.com/v1/audio/speech", headers=headers, json=payload
        )
        print(response.text)
        response.raise_for_status()
        fn = "".join(text.split()) + f"_{str(uuid.uuid4())}.mp3"
        with open(fn, "wb") as fp:
            fp.write(response.content)
        return fn

In [6]:
message = [
    {
        "role": "system",
        "content": "Provide the answer in Gujarati language."
    },
    {
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "Provide an explanation of what the image contains",
            },
            {
                "type":
                "image_url",
                "image_url":
                "https://upload.wikimedia.org/wikipedia/commons/thumb/b/bd/China_road_sign_路_77d.svg/1771px-China_road_sign_路_77d.svg.png",
            },
        ],
    },
]
krgs = {"max_tokens": 512, "temperature": 0.2}
model = "gpt-4-vision-preview"
result = asyncio.run(
    chatCompletion(messages=message,
                   model=model,
                   api_key="YOUR-OPENAI-API-KEY",
                   **krgs))
result

{"id": "chatcmpl-8Iyp0r1H1TEFTVBrJWvjac1HuoVa7", "object": "chat.completion", "created": 1699534394, "model": "gpt-4-1106-vision-preview", "usage": {"prompt_tokens": 792, "completion_tokens": 510, "total_tokens": 1302}, "choices": [{"message": {"role": "assistant", "content": "\u0a86 \u0a9b\u0aac\u0ac0\u0aae\u0abe\u0a82 \u0a8f\u0a95 \u0a9f\u0acd\u0ab0\u0abe\u0aab\u0abf\u0a95 \u0ab8\u0a82\u0a95\u0ac7\u0aa4 \u0aac\u0acb\u0ab0\u0acd\u0aa1 \u0aa6\u0ab0\u0acd\u0ab6\u0abe\u0ab5\u0ac7\u0ab2 \u0a9b\u0ac7, \u0a9c\u0ac7 \u0a9a\u0ac0\u0aa8\u0ac0 \u0aad\u0abe\u0ab7\u0abe\u0aae\u0abe\u0a82 \u0ab2\u0a96\u0abe\u0aa3 \u0ab8\u0abe\u0aa5\u0ac7 \u0a9b\u0ac7. \u0a89\u0aaa\u0ab0\u0aa8\u0abe \u0aad\u0abe\u0a97\u0aae\u0abe\u0a82 \u0a8f\u0a95 \u0a9f\u0acd\u0ab0\u0a95\u0aa8\u0ac1\u0a82 \u0a9a\u0abf\u0aa4\u0acd\u0ab0 \u0a9b\u0ac7 \u0a85\u0aa8\u0ac7 \u0aa4\u0ac7\u0aa8\u0ac0 \u0aac\u0abe\u0a9c\u0ac1\u0aae\u0abe\u0a82 \u0a9c\u0aae\u0aa3\u0ac0 \u0aac\u0abe\u0a9c\u0ac1 \u0ab5\u0abe\u0ab3\u0ac1\u0a82 \u0aa4\u0ac0\u0a

{'id': 'chatcmpl-8Iyp0r1H1TEFTVBrJWvjac1HuoVa7',
 'object': 'chat.completion',
 'created': 1699534394,
 'model': 'gpt-4-1106-vision-preview',
 'usage': {'prompt_tokens': 792,
  'completion_tokens': 510,
  'total_tokens': 1302},
 'choices': [{'message': {'role': 'assistant',
    'content': 'આ છબીમાં એક ટ્રાફિક સંકેત બોર્ડ દર્શાવેલ છે, જે ચીની ભાષામાં લખાણ સાથે છે. ઉપરના ભાગમાં એક ટ્રકનું ચિત્ર છે અને તેની બાજુમાં જમણી બાજુ વાળું તીર છે, જે કદાચ ટ્રક પાર્કિંગ કે ટ્રક માટેની વિશેષ લેન તરફ ઇશારો કરે છે. નીચેના ભાગમાં ચીની અક્ષરો છે, જેનો મતલબ મને ખબર નથી પરંતુ તે કદાચ સ્થળ કે દિશાઓની માહિતી આપતા હોઈ શકે છે.'},
   'finish_details': {'type': 'stop', 'stop': '<|fim_suffix|>'},
   'index': 0}]}

In [6]:
result_content = result.get("choices")[0].get("message").get("content")

In [7]:
result_content

'આ છબીમાં એક ટ્રાફિક સંકેત બોર્ડ દર્શાવેલ છે. ઉપરના ભાગમાં એક ટ્રકનું ચિત્ર છે જે વજન માપનારા સ્કેલ પર ઉભું છે, અને જમણી બાજુ એક તીર છે જે દર્શાવે છે કે ટ્રકોએ જમણી બાજુ વળવાનું છે. નીચેના ભાગમાં ચીની ભાષામાં લખાણ છે, જે કદાચ સ્થાનિક નિર્દેશો અથવા સૂચનાઓ હોઈ શકે છે.'

In [3]:
audio_result = asyncio.run(
    text2Speech(
        "આ છબીમાં એક ટ્રાફિક સંકેત બોર્ડ દર્શાવેલ છે. ઉપરના ભાગમાં એક ટ્રકનું ચિત્ર છે જે વજન માપનારા સ્કેલ પર ઉભું છે, અને જમણી બાજુ એક તીર છે જે દર્શાવે છે કે ટ્રકોએ જમણી બાજુ વળવાનું છે. નીચેના ભાગમાં ચીની ભાષામાં લખાણ છે, જે કદાચ સ્થાનિક નિર્દેશો અથવા સૂચનાઓ હોઈ શકે છે.",
        "alloy",
        "YOUR-OPENAI-API-KEY",
    )
)
audio_result