In [3]:
import pandas as pd

import torch
from transformers import pipeline
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from sentence_transformers import SentenceTransformer, util

# Model identifier handed to the transformers `pipeline` that generates answers
# (and to SentenceTransformer when USE_DOLLY_FOR_EMBEDDING is True).
DOLLY_MODEL = "databricks/dolly-v2-3b"
# Model identifier handed to SentenceTransformer when Dolly is not used for
# embeddings. The tuple lists the available choices; the last one is active.
EMBEDDING_MODEL = ("all-mpnet-base-v2", "sentence-transformers/use-cmlm-multilingual")[-1]

# Switch: True -> embed with DOLLY_MODEL, False -> embed with EMBEDDING_MODEL.
USE_DOLLY_FOR_EMBEDDING = False


In [14]:
# Generation pipeline for DOLLY_MODEL. Sampling is disabled (do_sample=False)
# and the pipeline is asked to return the full text (return_full_text=True).
generate_text = pipeline(
    model=DOLLY_MODEL,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    return_full_text=True,
    do_sample=False,
)

# Prompt template for an instruction that is accompanied by input context.
prompt_with_context = PromptTemplate(
    template="{instruction}\n\nInput:\n{context}",
    input_variables=["instruction", "context"],
)

# Wrap the pipeline for LangChain and bind it to the prompt template, so that
# llm_context_chain.predict(instruction=..., context=...) fills the template
# and runs the model.
hf_pipeline = HuggingFacePipeline(pipeline=generate_text)
llm_context_chain = LLMChain(prompt=prompt_with_context, llm=hf_pipeline)



In [4]:

# Load the sentence encoder used for queries: Dolly itself when
# USE_DOLLY_FOR_EMBEDDING is set, otherwise the configured EMBEDDING_MODEL.
embedding_model = SentenceTransformer(
    DOLLY_MODEL if USE_DOLLY_FOR_EMBEDDING else EMBEDDING_MODEL
)

Some weights of the model checkpoint at /home/thusitha/.cache/torch/sentence_transformers/sentence-transformers_use-cmlm-multilingual/ were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Load the text chunks together with their stored embedding columns
# from a Feather file.
df_text_chunks = pd.read_feather(path="../data/paper_extracts_embed.feature")
# Preview the first five rows.
df_text_chunks.head(n=5)


Unnamed: 0,text_chunk,title,embeddings,embeddings_dolly,embeddings_use
0,Texture Synthesis Using Convolutional NeuralNe...,Texture Synthesis Using Convolutional Neural N...,"[0.01643215, 0.08131, -0.051866785, 0.07214568...","[-0.4477233, 0.7085075, 0.9448252, -0.6605596,...","[-0.11053032, 0.036556076, 0.029679747, 0.0063..."
1,2 Convolutional neural network We use the VGG...,Texture Synthesis Using Convolutional Neural N...,"[0.00494202, -0.0018913345, -6.29354e-05, 0.09...","[-0.16315894, 1.1724734, 0.34991983, -0.975609...","[-0.062598296, 0.03323136, -0.0032446731, -0.0..."
2,3 different features. These feature correlati...,Texture Synthesis Using Convolutional Neural N...,"[0.040369663, 0.036883876, -0.026579408, 0.090...","[-0.5808077, 0.93010116, 0.35125256, -0.081901...","[-0.060262118, 0.016366797, 0.020364584, 0.033..."
3,conv1_1pool1pool2pool3pool4originalPortilla &...,Texture Synthesis Using Convolutional Neural N...,"[-0.005054468, -0.011686467, -0.05832806, 0.06...","[-1.1286112, 0.94930226, 0.83858913, -0.496184...","[-0.07421292, -0.0026081826, 0.03418953, 0.043..."
4,6 Classification performance 1.00.80.60.4 top...,Texture Synthesis Using Convolutional Neural N...,"[0.02261693, -0.04751408, -0.008524225, 0.0593...","[-0.7846361, -0.06361116, 1.3205526, 1.9582235...","[-0.04890534, -0.0038033319, 0.05069413, 0.020..."


In [7]:


def get_similarity(embedding_model, query, doc_emb, docs):
    """Rank ``docs`` against ``query`` by dot-product score.

    The query is encoded with ``embedding_model.encode`` and scored against
    ``doc_emb`` using ``util.dot_score``; only the first row of the returned
    score matrix is used.

    Args:
        embedding_model: Object exposing ``encode(query)``.
        query: The query passed to ``encode``.
        doc_emb: Document embeddings passed to ``util.dot_score``.
        docs: Iterable of documents, paired with the scores by position
            (assumed to be in the same order as ``doc_emb`` - confirm with caller).

    Returns:
        A list of ``(doc, score)`` tuples ordered from highest to lowest score.
    """
    query_vector = embedding_model.encode(query)
    similarity = util.dot_score(query_vector, doc_emb)[0].cpu().tolist()
    # Pair every document with its score, then order best-first.
    ranked = list(zip(docs, similarity))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked






In [12]:
# Pick the stored embedding column that matches the encoder used for queries.
if USE_DOLLY_FOR_EMBEDDING:
    doc_emb = df_text_chunks["embeddings_dolly"]
elif EMBEDDING_MODEL == "sentence-transformers/use-cmlm-multilingual":
    doc_emb = df_text_chunks["embeddings_use"]
else:
    # Any other EMBEDDING_MODEL falls back to the generic "embeddings" column.
    doc_emb = df_text_chunks["embeddings"]

def get_context(query, top_n = 3, docs = df_text_chunks["text_chunk"], doc_emb=doc_emb, embedding_model=embedding_model):
    """Return the ``top_n`` best-scoring text chunks for ``query`` as one string.

    Note: the ``docs``, ``doc_emb`` and ``embedding_model`` defaults are
    evaluated once, when this ``def`` runs. Re-run this cell after changing
    those globals, or pass the arguments explicitly.

    Args:
        query: The query forwarded to ``get_similarity``.
        top_n: Maximum number of chunks to include. If fewer chunks are
            available, all of them are returned; a non-positive value yields
            an empty string.
        docs: Text chunks to rank.
        doc_emb: Embeddings forwarded to ``get_similarity`` together with ``docs``.
        embedding_model: Encoder forwarded to ``get_similarity``.

    Returns:
        The selected chunks, best first, joined by a blank line.
    """
    doc_score_pairs = get_similarity(embedding_model, query, doc_emb, docs)
    # Slice instead of indexing range(top_n): indexing raised IndexError when
    # top_n exceeded the number of chunks. max() keeps a negative top_n
    # producing an empty result, as range() did.
    top_chunks = [doc for doc, _score in doc_score_pairs[:max(top_n, 0)]]
    return "\n\n".join(top_chunks)


# Candidate questions; the index below selects which one is asked.
candidate_queries = [
    "What is the goal of visual texture synthesis?",
    "what is VGG-19",
    "What are Gram matrices used for",
]
query = candidate_queries[0]

# Retrieve the two best-matching chunks to use as context for the prompt.
context = get_context(query=query, top_n=2)

# Ask the same question twice - once with the retrieved context and once with
# an empty context - so the two answers can be compared.
print("\n----------------With Context--------------------\n")
answer_with_context = llm_context_chain.predict(instruction=query, context=context)
print(answer_with_context.lstrip())

print("\n----------------Without Context--------------------\n")
answer_without_context = llm_context_chain.predict(instruction=query, context="")
print(answer_without_context.lstrip())

In [13]:
# Display the retrieved context string that was passed to the chain above.
context

'Texture Synthesis Using Convolutional NeuralNetworksLeon A. GatysCentre for Integrative Neuroscience, University of Tubingen, GermanyBernstein Center for Computational Neuroscience, Tubingen, GermanyGraduate School of Neural Information Processing, University of Tubingen, Germanyleon.gatys@bethgelab.orgAlexander S. EckerCentre for Integrative Neuroscience, University of Tubingen, GermanyBernstein Center for Computational Neuroscience, Tubingen, GermanyMax Planck Institute for Biological Cybernetics, Tubingen, GermanyBaylor College of Medicine, Houston, TX, USAMatthias BethgeCentre for Integrative Neuroscience, University of Tubingen, GermanyBernstein Center for Computational Neuroscience, Tubingen, GermanyMax Planck Institute for Biological Cybernetics, Tubingen, Germany AbstractHere we introduce a new model of natural textures based on the feature spacesof convolutional neural networks optimised for object recognition. Samples fromthe model are of high perceptual quality demonstratin



A Gram matrix is a type of matrix that is used in linear algebra to represent a inner product between vectors.  The inner product is computed from a set of vectors, called a basis, and a matrix that represents the basis.  The Gram matrix is used to represent the linear relationship between the basis vectors.  For example, the inner product of two vectors can be represented by the dot product of the corresponding vectors.  This dot product is computed by first applying the corresponding basis matrix to the two vectors.  The result of this computation is a number.  This number is called the inner product of the two vectors.  The inner product of two vectors can also be represented by a dot product of the two Gram matrices of the basis vectors.  The Gram matrix of a basis is a square matrix with the same dimensions as the basis.  The inner product of two vectors is the dot product of the corresponding vectors and the basis.  The inner product of two vectors can also be represented by a do