In [33]:
! optimum-cli export openvino --model "microsoft/Phi-3-mini-4k-instruct" --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.6  --sym  --trust-remote-code ./openvinomodel/phi3/int4


^C


In [34]:
from typing import List, Optional
from openvino.runtime import Core
from optimum.intel import OVModelForCausalLM

from nncf import compress_weights, CompressWeightsMode
import torch
from pydantic import BaseModel
from transformers import AutoConfig, AutoTokenizer
from fastapi import FastAPI, HTTPException, Response, status
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline



In [35]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate

In [36]:
from keyword_generator import extract_keywords
from db import get_db_collection, add_to_collection, query_collection

Inferencing using Phi-3

In [37]:
model_dir = './openvinomodel/phi3/int4'
model_id = "microsoft/Phi-3-mini-4k-instruct"

In [38]:
ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}

In [39]:
model = OVModelForCausalLM.from_pretrained(
    model_dir,
    device='CPU',
    ov_config=ov_config,
    config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
    trust_remote_code=True,
)

The argument `trust_remote_code` is to be used along with export=True. It will be ignored.
Compiling the model to CPU ...


In [40]:
tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
model.model = compress_weights(model.model, mode=CompressWeightsMode.INT4_SYM, group_size=128, ratio=0.8) 

INFO:nncf:Statistics of the bitwidth distribution:
+----------------+-----------------------------+----------------------------------------+
| Num bits (N)   | % all parameters (layers)   | % ratio-defining parameters (layers)   |
+----------------+-----------------------------+----------------------------------------+


In [42]:
tokenizer_kwargs =  {"add_special_tokens": False}

In [43]:
prompt = "<|system|>You are a helpful AI assistant.<|end|><|user|>How old is the universe?<|end|><|assistant|>"
input_tokens = tok(prompt, return_tensors="pt", **tokenizer_kwargs)
prompt

'<|system|>You are a helpful AI assistant.<|end|><|user|>How old is the universe?<|end|><|assistant|>'

In [44]:
answer = model.generate(**input_tokens, max_new_tokens=1024)

In [45]:
tok.batch_decode(answer, skip_special_tokens=True)[0]

'You are a helpful AI assistant. How old is the universe? The universe is approximately 13.8 billion years old. This age is determined by measuring the rate of expansion of the universe, known as the Hubble constant, and extrapolating back to the Big Bang. The most precise measurements come from observations of the cosmic microwave background radiation, which is the afterglow of the Big Bang, and from the study of distant galaxies and their redshifts.'

PDF Reader using Phi-3

In [46]:
query_text = input("Ask anything about our final year project")
query_text

'What is the objective'

In [47]:
pip install chromadb




In [48]:
file_path = (
    "project-report.pdf"
)
loader = PyPDFLoader(file_path)
document = loader.load()
print("No. of pages in the document:", len(document))

No. of pages in the document: 23


In [49]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunked_documents = text_splitter.split_documents(document)

In [50]:
contents = []
ids = []
keywords = []

page_no = 0
c_index = -1
for index, doc in enumerate(chunked_documents):
    metadata = doc.metadata
    source = metadata['source'].replace('/','-').replace('.','-')

    if metadata['page'] > page_no:
        c_index = 0
    else:
        c_index += 1

    page_no = metadata['page']
    
    chunk_id = f"{source}-p{page_no}-c{c_index}"

    contents.append(doc.page_content)
    ids.append(chunk_id)
    keywords.append(extract_keywords(doc.page_content))
    print("Processed chunk:", chunk_id)

Processed chunk: project-report-pdf-p0-c0
Processed chunk: project-report-pdf-p1-c0
Processed chunk: project-report-pdf-p1-c1
Processed chunk: project-report-pdf-p2-c0
Processed chunk: project-report-pdf-p2-c1
Processed chunk: project-report-pdf-p3-c0
Processed chunk: project-report-pdf-p4-c0
Processed chunk: project-report-pdf-p5-c0
Processed chunk: project-report-pdf-p6-c0
Processed chunk: project-report-pdf-p6-c1
Processed chunk: project-report-pdf-p6-c2
Processed chunk: project-report-pdf-p7-c0
Processed chunk: project-report-pdf-p7-c1
Processed chunk: project-report-pdf-p7-c2
Processed chunk: project-report-pdf-p8-c0
Processed chunk: project-report-pdf-p8-c1
Processed chunk: project-report-pdf-p9-c0
Processed chunk: project-report-pdf-p10-c0
Processed chunk: project-report-pdf-p10-c1
Processed chunk: project-report-pdf-p10-c2
Processed chunk: project-report-pdf-p11-c0
Processed chunk: project-report-pdf-p11-c1
Processed chunk: project-report-pdf-p12-c0
Processed chunk: project-rep

In [51]:
COLLECTION_NAME = "my_project"
collection = get_db_collection(COLLECTION_NAME)


Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:01 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:01 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:01 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:01 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:01 • -:--:--
Mixed-Precision assignment ------------

2024-07-15 12:48:27.604918: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-15 12:48:29.561904: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.

Framework not specified. Using pt to export the model.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:  50%|█████     | 1/2 [01:16<01:16, 76.12s


Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:00 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:01 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:01 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:01 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:01 • -:--:--
Mixed-Precision assignment ---------------------   0% 0/128 • 0:00:01 • -:--:--
Mixed-Precision assignment ------------

2024-07-15 12:48:04.066943: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-15 12:48:05.940095: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.

Framework not specified. Using pt to export the model.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:  50%|█████     | 1/2 [00:49<00:49, 49.52s

In [None]:
metadata = [{"tags": ", ".join(i) } for i in keywords]
add_to_collection(collection, contents, ids, metadata)

Add of existing embedding ID: project-report-pdf-p0-c0
Add of existing embedding ID: project-report-pdf-p1-c0
Add of existing embedding ID: project-report-pdf-p1-c1
Add of existing embedding ID: project-report-pdf-p2-c0
Add of existing embedding ID: project-report-pdf-p2-c1
Add of existing embedding ID: project-report-pdf-p3-c0
Add of existing embedding ID: project-report-pdf-p4-c0
Add of existing embedding ID: project-report-pdf-p5-c0
Add of existing embedding ID: project-report-pdf-p6-c0
Add of existing embedding ID: project-report-pdf-p6-c1
Add of existing embedding ID: project-report-pdf-p6-c2
Add of existing embedding ID: project-report-pdf-p7-c0
Add of existing embedding ID: project-report-pdf-p7-c1
Add of existing embedding ID: project-report-pdf-p7-c2
Add of existing embedding ID: project-report-pdf-p8-c0
Add of existing embedding ID: project-report-pdf-p8-c1
Add of existing embedding ID: project-report-pdf-p9-c0
Add of existing embedding ID: project-report-pdf-p10-c0
Add of ex

Documents loaded to DB


In [None]:
query_result = query_collection(collection, query_text)
query_result

{'ids': [['project-report-pdf-p14-c0',
   'project-report-pdf-p13-c0',
   'project-report-pdf-p19-c0']],
 'distances': [[0.4650601348353639, 0.46821508990863936, 0.47756825589977725]],
 'metadatas': [[{'tags': 'arduino, controller, display, characters, interfaced'},
   {'tags': 'medicine, display, automatic, cavity, motor'},
   {'tags': 'medicine, automatic, components, design, machine'}]],
 'embeddings': None,
 'documents': [['AUTOMATIC MEDICINE VENDING MACHINE  \n  \n   \n       \n \n   DEPARTMENT OF MECHATRINICS ENGINEERING, MITE, MOODBIDRI  9  segment LEDs. The reasons being, LCDs are economical, easily programmable, have no \nlimitation of displaying special and even custom characters, animations and so on. A 16x2 \nLCD means it can display 16 characters per line and there are 2 such lines.  \n  \nFig 3.2.LCD Display  \n  \n  \n3.3 ARDUINO MICRO CONTROLLER  \n  \nArduino is an open source prototype platform which is easy to use as number of required \nmodules can be directly inter

In [None]:
text = ""
for doc in query_result['documents']:
    for i in doc:
        text += i

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
).format(context=text)

prompt = ChatPromptTemplate.from_messages(
    [
        
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
final_prompt = prompt.format(input=query_text)
final_prompt

"System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\nAUTOMATIC MEDICINE VENDING MACHINE  \n  \n   \n       \n \n   DEPARTMENT OF MECHATRINICS ENGINEERING, MITE, MOODBIDRI  9  segment LEDs. The reasons being, LCDs are economical, easily programmable, have no \nlimitation of displaying special and even custom characters, animations and so on. A 16x2 \nLCD means it can display 16 characters per line and there are 2 such lines.  \n  \nFig 3.2.LCD Display  \n  \n  \n3.3 ARDUINO MICRO CONTROLLER  \n  \nArduino is an open source prototype platform which is easy to use as number of required \nmodules can be directly interfaced on to the board and has an understandable software \nlanguage. The Arduino forms the main heart of the system, the purpose of using this \ncontroller is due to the fact that it redu

In [None]:
tokenizer_kwargs =  {"add_special_tokens": False}

In [None]:
input_tokens = tok(final_prompt, return_tensors="pt")

In [None]:
answer = model.generate(**input_tokens, max_new_tokens=1024)

In [None]:
finale=tok.batch_decode(answer, skip_special_tokens=True)
finale

["System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\nAUTOMATIC MEDICINE VENDING MACHINE  \n  \n   \n       \n \n   DEPARTMENT OF MECHATRINICS ENGINEERING, MITE, MOODBIDRI  9  segment LEDs. The reasons being, LCDs are economical, easily programmable, have no \nlimitation of displaying special and even custom characters, animations and so on. A 16x2 \nLCD means it can display 16 characters per line and there are 2 such lines.  \n  \nFig 3.2.LCD Display  \n  \n  \n3.3 ARDUINO MICRO CONTROLLER  \n  \nArduino is an open source prototype platform which is easy to use as number of required \nmodules can be directly interfaced on to the board and has an understandable software \nlanguage. The Arduino forms the main heart of the system, the purpose of using this \ncontroller is due to the fact that it red