In [None]:
!pip install -q transformers==4.41.2

In [None]:
!pip install -q sentence-transformers==2.2.2
#!pip install -q xformers==0.0.23
!pip install -q chromadb==0.4.20

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Notebook-wide settings: the row limit and the names of the two CSV columns
# referenced through these constants.
MAX_NEWS = 1000
DOCUMENT = "title"
TOPIC = "topic"

# The dataset file is parsed with ';' as the field separator.
news = pd.read_csv("labelled_newscatcher_dataset.csv", sep=";")


ChromaDB requires every record to have a unique identifier. The statement below creates one by copying the DataFrame index into a new column called **id**.


In [None]:
# Store the row index in an explicit "id" column, then preview the first rows.
row_ids = news.index
news["id"] = row_ids
news.head(n=5)

In [None]:
# Keep only the first MAX_NEWS rows of the dataset.
subset_news = news.iloc[:MAX_NEWS]

In [None]:
import chromadb
from chromadb.config import Settings

In [None]:
# Directory handed to the persistent client as its storage path.
persist_directory = "/path/to/persist/directory"
chroma_client = chromadb.PersistentClient(path=persist_directory)

In [None]:
from datetime import datetime

In [None]:
# Build a per-run collection name from the current Unix time in whole seconds.
# datetime.timestamp() is used instead of strftime("%s") because the "%s"
# directive is a platform-specific extension that is not available everywhere.
collection_name = "news_collection" + str(int(datetime.now().timestamp()))

# Drop any existing collection with the same name before recreating it.
# Every collection is checked, not only the first one the client returns.
existing_names = [existing.name for existing in chroma_client.list_collections()]
if collection_name in existing_names:
    chroma_client.delete_collection(name=collection_name)

collection = chroma_client.create_collection(name=collection_name)


In [None]:
# Build one ID per row actually present in subset_news, using the "id" column
# created from the DataFrame index. Generating IDs from range(MAX_NEWS) yields
# too many IDs when the dataset has fewer than MAX_NEWS rows, so the documents,
# metadatas and ids lists would no longer have the same length.
collection.add(
    documents=subset_news[DOCUMENT].tolist(),
    metadatas=[{TOPIC: topic} for topic in subset_news[TOPIC].tolist()],
    ids=[f"id{row_id}" for row_id in subset_news["id"].tolist()],
)

In [None]:
# Ask the collection for the 10 results matching a single query text.
query_terms = ["laptop"]
results = collection.query(query_texts=query_terms, n_results=10)

print(results)

## Vector MAP

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
# Fetch one stored record by ID, requesting its document text and embedding.
fields_to_return = ["documents", "embeddings"]
getado = collection.get(ids="id141", include=fields_to_return)


In [None]:
# Pull the embeddings and the matching documents out of the fetched record,
# then display the embeddings.
word_vectors, word_list = getado["embeddings"], getado["documents"]
word_vectors

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Checkpoint identifier shared by the model and its tokenizer.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the causal language model and the tokenizer from the same checkpoint.
lm_model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Wrap the loaded model and tokenizer in a text-generation pipeline.
pipeline_options = {
    "max_new_tokens": 256,
    "device_map": "auto",
}
pipe = pipeline(
    "text-generation",
    model=lm_model,
    tokenizer=tokenizer,
    **pipeline_options,
)

In [None]:
question = "Can I buy a new Toshiba laptop?"

# Prefix each document returned for the first query with "#" and join them
# with single spaces into one context string.
retrieved_documents = results["documents"][0]
context = " ".join("#" + str(document) for document in retrieved_documents)

# Assemble the prompt: context first, then the instruction, question and answer cue.
prompt_template = f"""
Relevant context: {context}
Considering the relevant context, answer the question.
Question: {question}
Answer: """
prompt_template

In [None]:
# Run the prompt through the pipeline and print the text of the first result.
lm_response = pipe(prompt_template)
first_generation = lm_response[0]
print(first_generation["generated_text"])