In [6]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
from openai import OpenAI
import os 
import sys 
from pprint import pprint
from IPython.display import display, Markdown, Latex

sys.path.append('../')

from utils.pdf import read_pdf
from pydantic import BaseModel, Field
from typing import List
import json

from pipeline.entity_extraction import ENTITY_EXTRACTION_SYSTEM_PROMPT, EntityExtractionModel
from pipeline.helpers import get_json_response, get_messages_response
from datasets import Dataset
from huggingface_hub import HfApi

from dotenv import find_dotenv, load_dotenv

load_dotenv()

True

In [22]:
# Endpoints of the local OpenAI-compatible servers and the source document for this run.
# NOTE(review): `model` and `ollama_base_url` are not referenced by any cell
# visible in this notebook — confirm whether they are still needed.
model = "ollama/llama3.2"
ollama_base_url = "http://localhost:11434/v1"
lm_studio_base_url = "http://localhost:1234/v1"
# Presumably the extracted text of the paper; later cells slice its first 20000 items.
pdf_text = read_pdf("../data/Taming Transformers for High Resolution Image Synthesis.pdf") 

# The api_key value is a fixed placeholder string; presumably the local server
# does not validate it — TODO confirm.
lm_studio_client = OpenAI(base_url=lm_studio_base_url, api_key="lm_studio")

In [23]:
# Ask the local model for the entities found in the first 20000 characters of
# the paper, requesting a result shaped like EntityExtractionModel.
entities = get_json_response(
    client=lm_studio_client,
    model="meta-llama-3.1-8b-instruct-q6_k",  # alternative tried: "llama-3.2-3b-instruct-q8_0"
    messages=[
        dict(
            role="system",
            content=ENTITY_EXTRACTION_SYSTEM_PROMPT.format(text=pdf_text[:20000]),
        ),
    ],
    response_format=EntityExtractionModel,
)

INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"


In [25]:
# Number of entities returned for the excerpt.
len(entities.entities)

33

In [40]:
from pipeline.question_answer import QUESTION_EXTRACTION_SYSTEM_PROMPT, QuestionAnswerModel

In [41]:

# Fill the question-extraction prompt with the same 20000-character excerpt
# and the first ten extracted entities, joined by commas.
qa_prompt = QUESTION_EXTRACTION_SYSTEM_PROMPT.format(
    text=pdf_text[:20000],
    entities=",".join(entities.entities[:10]),
    source="Taming Transformers for High Resolution Image Synthesis",
    source_type="paper",  # this can get automated
)


In [42]:
# Generate questions from the filled-in prompt, requesting a result shaped
# like QuestionAnswerModel.
questions = get_json_response(
    client=lm_studio_client,
    model="meta-llama-3.1-8b-instruct-q6_k",  # alternative tried: "llama-3.2-3b-instruct-q8_0"
    messages=[
        dict(role="system", content=qa_prompt),
    ],
    response_format=QuestionAnswerModel,
)

INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"


In [43]:
# Inspect the generated question strings.
questions.questions

["What is the purpose of using a convolutional approach in conjunction with transformers to model high-resolution images, as described in the paper 'Taming Transformers for High-Resolution Image Synthesis'?",
 "Why does the use of transformers in image synthesis pose fundamental problems for scaling them to high-resolution images, according to the paper 'Taming Transformers for High-Resolution Image Synthesis'?",
 "Summarize the method of learning an effective codebook of image constituents using a convolutional VQGAN and an autoregressive transformer architecture as presented in the paper 'Taming Transformers for High-Resolution Image Synthesis.'",
 "Where are the learnable tokens, or prompts, added in the process of adapting vision transformers to a new domain, as explained in the paper 'Visual prompt tuning'?",
 "What is the key insight that enables combining convolutional and transformer architectures to model the compositional nature of visual data, according to the paper 'Taming 

In [48]:
def generate_qa_pairs(
    questions: List[str],
    *,
    client=None,
    model: str = "meta-llama-3.1-8b-instruct-q6_k",
    system_prompt=None,
) -> List[dict]:
    """Answer each question with the chat model and pair it with its answer.

    Args:
        questions: Question strings; each one is sent as the user message of
            its own request.
        client: Client passed to ``get_messages_response``. When omitted, the
            notebook-level ``lm_studio_client`` is used.
        model: Model identifier passed to ``get_messages_response``.
        system_prompt: System message sent with every question. When omitted,
            the notebook-level ``qa_prompt`` is used.

    Returns:
        One ``{"question": ..., "answer": ...}`` dict per input question, in
        input order; an empty list when ``questions`` is empty.
    """
    # Fall back to the notebook globals at call time (not definition time) so
    # that re-running the cells that define them is picked up.
    if client is None:
        client = lm_studio_client
    if system_prompt is None:
        # NOTE(review): the default is the question-extraction prompt; confirm
        # it is also the intended context for answering questions.
        system_prompt = qa_prompt

    qa_pairs = []
    for question in questions:
        answer = get_messages_response(
            client=client,
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
            ],
        )
        qa_pairs.append({"question": question, "answer": answer})
    return qa_pairs

In [49]:
# Answer every generated question; generate_qa_pairs issues one chat request per question.
qa_pairs = generate_qa_pairs(questions.questions)

INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"


In [53]:
# Spot-check the answer generated for the sixth question.
print(qa_pairs[5].get('answer'))

Here's why learning an effective codebook of image constituents requires pushing the limits of compression and using a perceptual loss with a patch-based discriminator:

1.  **High-resolution images require complex representations**: High-resolution images contain intricate details, textures, and patterns that are difficult to capture with simple or low-dimensional representations. To effectively represent these images, the codebook needs to be rich and diverse, which requires pushing the limits of compression.
2.  **Perceptual loss is necessary for preserving image quality**: The authors use a perceptual loss function, which measures the difference between the original image and its reconstructed version in terms of human perception. This loss function helps preserve the image's quality and details, even when compressing it to a lower dimensionality.
3.  **Patch-based discriminator improves compression efficiency**: A patch-based discriminator is used to differentiate between real and

### Create HF Dataset

In [63]:
from huggingface_hub import HfApi

# Target dataset repository on the Hugging Face Hub and the API client used to reach it.
repo_id = "owenren/532_finetune_qa_datasets"
api = HfApi()

# Create the private dataset repository; exist_ok=True is passed so an
# already-existing repository is not treated as an error.
api.create_repo(
    repo_id=repo_id,
    repo_type="dataset",
    private=True,
    exist_ok=True,
    token=os.getenv('HUGGINGFACE_API_KEY'),
)

RepoUrl('https://huggingface.co/datasets/owenren/532_finetune_qa_datasets', endpoint='https://huggingface.co', repo_type='dataset', repo_id='owenren/532_finetune_qa_datasets')

In [64]:
# Tag every QA pair with the file it was generated from.
for qa_pair in qa_pairs:
    qa_pair.update(source="Taming Transformers for High Resolution Image Synthesis.pdf")

In [67]:
# Pivot the list of row dicts into one list per column, keeping row order.
qa_pairs_dict = {
    column: [row[column] for row in qa_pairs]
    for column in ("question", "answer", "source")
}

In [68]:
# Build a datasets.Dataset from the column mapping.
dataset = Dataset.from_dict(qa_pairs_dict)

In [72]:
# NOTE(review): this displays the Dataset class itself, not the `dataset`
# built earlier; it looks like a leftover inspection cell.
Dataset

datasets.arrow_dataset.Dataset

In [70]:
# Upload the dataset under its own config name. `repo_id` is reused instead of
# repeating the literal so the upload target cannot drift from the repository
# passed to create_repo.
dataset.push_to_hub(
    repo_id,
    config_name="test_dataset_2024OCT20",  # Give it a unique name
    token=os.getenv('HUGGINGFACE_API_KEY'),
)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 528.18ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.64it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/owenren/532_finetune_qa_datasets/commit/4b464d52ebc49b502d194506b50db8dc4970fe89', commit_message='Upload dataset', commit_description='', oid='4b464d52ebc49b502d194506b50db8dc4970fe89', pr_url=None, pr_revision=None, pr_num=None)

In [45]:
# Single-question check: answer the first generated question using the same
# system prompt and model as generate_qa_pairs.
answer = get_messages_response(
    client=lm_studio_client,
    model="meta-llama-3.1-8b-instruct-q6_k",  # alternative tried: "llama-3.2-3b-instruct-q8_0"
    messages=[
        dict(role="system", content=qa_prompt),
        dict(role="user", content=questions.questions[0]),
    ],
)

INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"


In [47]:
# Show the model's answer to the first question.
print(answer)

According to the paper "Taming Transformers for High-Resolution Image Synthesis", the authors propose combining the effectiveness of convolutional architectures with the expressivity of transformers to model and synthesize high-resolution images. The purpose of using a convolutional approach in conjunction with transformers is to:

1. **Efficiently learn local structure**: Convolutional neural networks (CNNs) are well-suited for learning local structures and regularities in images, which is essential for modeling high-resolution images.
2. **Capture context-rich visual parts**: The authors use a convolutional VQGAN (Vector Quantized Generative Adversarial Network) to learn a codebook of context-rich visual parts, which can be used as input to the transformer architecture.
3. **Reduce computational costs**: By using a convolutional approach to learn local structure and capture context-rich visual parts, the authors aim to reduce the computational costs associated with modeling high-reso