In [14]:
import mimetypes
import os, json

import requests
from dotenv import load_dotenv

# load_dotenv() runs first so that values kept in a local .env file can be
# picked up by the environment lookups below.
load_dotenv()

# Each setting is None when the corresponding variable is not defined.
api_key = os.environ.get("API_KEY")
project_id = os.environ.get("PROJECT_ID")
ibm_cloud_iam_url = os.environ.get("IAM_IBM_CLOUD_URL")

# Service endpoint paired with the API key.
creds = dict(url="https://us-south.ml.cloud.ibm.com", apikey=api_key)

# Text-generation parameters.
# Currently disabled options: "temperature": 0.1, "stop_sequences": ["```"]
params = dict(
    decoding_method="greedy",
    max_new_tokens=3000,
    min_new_tokens=1,
    top_k=50,
    top_p=1,
)

In [39]:
# Exchange the API key for an IAM bearer token (stored in `access_token`).

# Fail early with a clear message instead of posting to "https://None/...".
if not api_key or not ibm_cloud_iam_url:
    raise RuntimeError(
        "API_KEY and IAM_IBM_CLOUD_URL must be set in the environment (or .env file)"
    )

# Prepare the payload and headers
payload = {
    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
    "apikey": api_key
}
headers = {
    'Content-Type': "application/x-www-form-urlencoded"
}

try:
    # Certificate verification stays enabled (the requests default) because
    # this call transmits the API key. If a proxy re-signs TLS traffic, point
    # the REQUESTS_CA_BUNDLE environment variable at its CA bundle rather than
    # disabling verification.
    # The timeout keeps the cell from hanging forever on a dead connection.
    response = requests.post(
        f"https://{ibm_cloud_iam_url}/identity/token",
        data=payload,
        headers=headers,
        timeout=30,
    )

    # Check if the request was successful
    response.raise_for_status()

    # Parse the JSON response
    decoded_json = response.json()
    access_token = decoded_json["access_token"]
except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
    # Re-raise so the cell visibly fails; otherwise later cells would run with
    # a missing or stale `access_token`.
    raise



In [40]:
# Model identifier sent in the "model_id" field of the chat request that
# image_to_text builds.
model_id = "meta-llama/llama-3-2-90b-vision-instruct"

In [33]:
import requests
import base64

def image_to_text(imagefilename, query, max_tokens=1000):
    """Send an image plus a text prompt to the chat endpoint and return the reply.

    Parameters
    ----------
    imagefilename : str or path-like
        Path of the image file. Its bytes are base64-encoded and embedded in
        the request as a ``data:`` URL.
    query : str
        Text instruction sent alongside the image.
    max_tokens : int, optional
        Value of the request's ``max_tokens`` field. Defaults to 1000, the
        value that was previously hard-coded.

    Returns
    -------
    str
        ``choices[0]["message"]["content"]`` from the JSON response, unmodified.

    Raises
    ------
    OSError
        If the image file cannot be opened or read.
    RuntimeError
        If the endpoint answers with a status code other than 200.

    Notes
    -----
    Reads the module-level ``project_id``, ``model_id`` and ``access_token``,
    so the cells defining them must have been executed first.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(imagefilename, "rb") as image_file:
        pic_string = base64.b64encode(image_file.read()).decode("utf-8")

    # Label the payload with the file's actual media type (e.g. image/png).
    # Fall back to the previously hard-coded image/jpeg when the extension is
    # unknown or does not map to an image type.
    mime_type, _ = mimetypes.guess_type(os.fsdecode(imagefilename))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    url = "https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29"

    body = {
        "messages": [
            {"role": "user", "content": [
                {"type": "text", "text": query},
                {"type": "image_url",
                 "image_url": {
                     "url": f"data:{mime_type};base64,{pic_string}"
                 }},
            ]},
        ],
        "project_id": project_id,
        "model_id": model_id,
        "decoding_method": "greedy",
        "repetition_penalty": 1,
        "max_tokens": max_tokens,
    }

    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}",
    }

    # No stream=True: the whole body is consumed immediately below.
    # (connect, read) timeouts keep a stalled request from blocking forever.
    response = requests.post(url, headers=headers, json=body, timeout=(10, 300))

    if response.status_code != 200:
        raise RuntimeError("Non-200 response: " + str(response.text))

    response_data = response.json()

    # The JSON decoder has already resolved escape sequences, so the content
    # is returned verbatim. Stripping backslashes here would corrupt replies
    # that legitimately contain them (e.g. generated source code).
    return response_data['choices'][0]['message']['content']


In [41]:

# OCR-style prompt against the sample document image.
answer = image_to_text(
    imagefilename="examples/ocr_document.png",
    query="you are a OCR engine, please extract full text from the page",
)
print(answer)

The page contains a title, "Granite Code Models: A Family of Open Foundation Models for Code Intelligence," followed by a list of authors and their affiliations. The main content of the page is an abstract that describes the purpose and benefits of Granite Code models, which are large language models trained on code to improve software development productivity. The abstract highlights the potential of these models to handle complex tasks autonomously and their ability to perform well across a range of coding tasks.

The page also includes a link to the GitHub repository where the models can be accessed, as well as a brief introduction that provides context for the development of Granite Code models. Overall, the page appears to be a promotional material for the Granite Code models, highlighting their capabilities and potential applications in software development.

Here is the full text from the page:

**Granite Code Models: A Family of Open Foundation Models for Code Intelligence**

*

In [35]:

# Ask for HTML that reproduces the web page screenshot.
answer = image_to_text(
    imagefilename="examples/webpage.png",
    query="generate an html file learn from the screenshot image provided",
)
print(answer)

Sure, here is an HTML file that mimics the screenshot provided:

```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>IBM Sustainability Solutions</title>
    <link rel="stylesheet" href="styles.css">
</head>
<body>
    <header>
        <nav>
            <ul>
                <li><a>Products</a></li>
                <li><a>Solutions</a></li>
                <li><a>Consulting</a></li>
                <li><a>Support</a></li>
                <li><a>Think</a></li>
                <li><a>TechXchange 2024</a></li>
            </ul>
        </nav>
    </header>
    <main>
        <section class="hero">
            <h1>Harness the power of AI to act on sustainability goals</h1>
            <p>Explore IBM's sustainability solutions using data and AI to elevate business—and the planet</p>
            <div class="call-to-action">
                <a>Read the State of Sustainability Readiness 

In [25]:
from IPython.display import display, Markdown

# Ask for a Mermaid version of the flowchart image; the reply is rendered as
# Markdown rather than printed as plain text.
answer = image_to_text(
    imagefilename="examples/flowchart.png",
    query="""Generate a flow chart base on the diagram
-generate in mermaid""",
)
display(Markdown(answer))


Certainly Here's a flowchart based on the diagram you provided, generated in Mermaid format:

```mermaid
graph TD
    Start[Power On]
    Follow route
    Generate Map and Location
    Plan route
    Scan Environment
    Vacuum Off
    Battery Low
    No
    Vacuum On
    Power Off
    Return power block
    Vacuum Off
    Stop moving
    Error indicator On
    End
```

Please note that the diagram provided is quite complex, and some of the decision points and flow paths are not explicitly detailed. This flowchart represents a simplified version of the process based on the diagram's main steps and decision points.

In [36]:
from IPython.display import display, Code

# Ask for Java code from the class diagram image and show the reply with
# Java syntax highlighting.
answer = image_to_text(
    imagefilename="examples/classdiagram.png",
    query="Generate a Java code base on the diagram",
)
display(Code(answer, language="java"))