In [4]:
import pandas as pd

import torch
from transformers import pipeline
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from sentence_transformers import SentenceTransformer, util

DOLLY_MODEL = "databricks/dolly-v2-3b"
EMBEDDING_MODEL = "all-mpnet-base-v2"

USE_DOLLY_FOR_EMBEDDING = True


In [5]:
generate_text = pipeline(model=DOLLY_MODEL, torch_dtype=torch.bfloat16,
                         trust_remote_code=True, device_map="auto", return_full_text=True)
# template for an instruction with input
prompt_with_context = PromptTemplate(
    input_variables=["instruction", "context"],
    template="{instruction}\n\nInput:\n{context}")

hf_pipeline = HuggingFacePipeline(pipeline=generate_text)

llm_context_chain = LLMChain(llm=hf_pipeline, prompt=prompt_with_context)



In [6]:

if USE_DOLLY_FOR_EMBEDDING:
    embedding_model = SentenceTransformer(DOLLY_MODEL)
else:
    embedding_model = SentenceTransformer(EMBEDDING_MODEL)

No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/databricks_dolly-v2-3b. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/databricks_dolly-v2-3b were not used when initializing GPTNeoXModel: ['embed_out.weight']
- This IS expected if you are initializing GPTNeoXModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTNeoXModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
df_text_chunks = pd.read_feather("../data/paper_extracts_embed.feature")
df_text_chunks.head()


Unnamed: 0,text_chunk,title,embeddings,embeddings_dolly
0,Texture Synthesis Using Convolutional NeuralNe...,Texture Synthesis Using Convolutional Neural N...,"[0.01643215, 0.08131, -0.051866785, 0.07214568...","[-0.4477233, 0.7085075, 0.9448252, -0.6605596,..."
1,2 Convolutional neural network We use the VGG...,Texture Synthesis Using Convolutional Neural N...,"[0.00494202, -0.0018913345, -6.29354e-05, 0.09...","[-0.16315894, 1.1724734, 0.34991983, -0.975609..."
2,3 different features. These feature correlati...,Texture Synthesis Using Convolutional Neural N...,"[0.040369663, 0.036883876, -0.026579408, 0.090...","[-0.5808077, 0.93010116, 0.35125256, -0.081901..."
3,conv1_1pool1pool2pool3pool4originalPortilla &...,Texture Synthesis Using Convolutional Neural N...,"[-0.005054468, -0.011686467, -0.05832806, 0.06...","[-1.1286112, 0.94930226, 0.83858913, -0.496184..."
4,6 Classification performance 1.00.80.60.4 top...,Texture Synthesis Using Convolutional Neural N...,"[0.02261693, -0.04751408, -0.008524225, 0.0593...","[-0.7846361, -0.06361116, 1.3205526, 1.9582235..."


In [8]:
query = "What is the goal of visual texture synthesis?"
query_emb = embedding_model.encode(query)
if USE_DOLLY_FOR_EMBEDDING:
    doc_emb = df_text_chunks["embeddings_dolly"]
else:
    doc_emb = df_text_chunks["embeddings"]

docs = df_text_chunks["text_chunk"]
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

  b = torch.tensor(b)


In [12]:
doc_score_pairs[0:3]

[('\x0c45 30 15 0 15 30 45 45 30 15 0 15 30 45 Figure 3: 3D view synthesis on Multi-PIE. For each panel, the first row shows the ground truth from 45to 45 , the second and third rows show the re-renderings of 6-step clockwise rotation from an input image of45 (red box) and of 6-step counter-clockwise rotation from an input image of 45 (red box), respectively.45 30 15 15 30 45 45 30 15 15 30 45 InputRNN3DmodelFigure 4: Comparing face pose normalization results with 3D morphable model [29].',
  2187.7333984375),
 ('Performance Comparisons of the Different mQA Variants In order to show the effectiveness of the different components and strategies of our mQA model, weimplement three variants of the mQA in Figure 2. For the first variant (i.e. mQA-avg-question), wereplace the first LSTM component of the model (i.e. the LSTM to extract the question embedding) Image QuestionAnswer What is in the plate? food. What is the dog doing? Surfing in the sea. Where is the cat? On the bed. What is there

In [13]:
context_chunks = "\n\n".join([doc_score_pairs[i][0] for i in [1]])
print(llm_context_chain.predict(instruction=query, context=context_chunks).lstrip())

The main goal of visual texture synthesis is to generate high-quality images to answer given a text description. For example, given a sentence “a grey cat is running behind a red car”, the goal of visual texture synthesis is to generate an image with cat and car.

We can see that the first variant (i.e. mQA-avg-question) is not good at generating image. It generates random examples with the answers given by the human judges. We can see that our mQA model can generate better answers with a lower word error rate.

The second variant (i.e. mQA-same-LSTMs) is much better than the first variant. We can see that the first variant uses the LSTM to extract the question embedding from the input sentence. It learns the sentence word pattern. It is much worse than the second variant.
