<a href="https://colab.research.google.com/github/vokativ/rag_demo_qb/blob/main/rag_demo/handson_exercises.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Day 1 - RAG
# 1. Vector Embedding and Storage

In [4]:
!pip install faiss-cpu

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [5]:
# Sentence-embedding model shared by the document and query encoding steps.
# The model files are downloaded on first use; encoding itself runs locally.
model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Toy corpus for the retrieval demo. It is kept as a plain list so that a row
# id returned by the vector index can be used directly as a list index.
documents = [
    # Paris / France
    "The Eiffel Tower is in Paris.",
    "The capital of France is Paris.",
    # Programming and AI
    "Python is a popular programming language.",
    "Machine learning enables AI applications.",
    # Business analytics
    "Sentiment analysis helps understand customer feedback.",
    "Stock market predictions are complex and data-driven.",
]

In [7]:
# Generate embeddings: encode every sample document with the sentence-embedding
# model. The result is indexed as 2-D below (embeddings.shape[1] is used as the
# vector width), i.e. one row per document.
embeddings = model.encode(documents)

In [8]:
# Create a flat L2-distance FAISS index whose dimensionality matches the width
# of the embedding matrix, then add all document vectors to it. Vectors are
# added in document order, so index row i corresponds to documents[i].
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

doc_vectors = np.array(embeddings)
index.add(doc_vectors)

print("Embeddings stored in FAISS index.")

Embeddings stored in FAISS index.


# 2. Information Retrieval


In [15]:
# Question to look up in the indexed documents. To try a different retrieval,
# swap in the alternative query below.
# query = "Where is the Eiffel Tower?"
query = "What are some popular programming languages?"

# The query is wrapped in a list so it is encoded as a batch of one sentence,
# using the same model that embedded the documents.
query_embedding = model.encode([query])

In [16]:
# Ask the index for the single nearest stored vector (k=1) to the query.
distances, indices = index.search(np.array(query_embedding), k=1)

# Row 0 is the only query; column 0 is its closest match. That id is the
# position of the matching text in `documents`.
best_match_id = indices[0][0]
retrieved_doc = documents[best_match_id]

print("Retrieved document:", retrieved_doc)

Retrieved document: Python is a popular programming language.


# Day 2 - Scaling and Serving
# 1. Large-scale Data Labelling

In [None]:
from transformers import pipeline

# Load a sentiment classifier used to auto-label the documents below.
# Note: this is a small fine-tuned DistilBERT classifier, not a generative LLM.
# The checkpoint and revision are pinned to the ones the "sentiment-analysis"
# task resolved to by default when this notebook was last run, so the labels
# do not silently change if the library's default model changes.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [None]:
# Example texts to label: enthusiastic, negative, mixed, and sarcastic.
# In the recorded run the sarcastic last example is labelled POSITIVE, which
# shows the limits of automatic labelling.
senti_documents = [
    "I am blown away! This product is so great!",
    "If given a choice, I would never come back again",
    "It is fine. Some slow parts and some good moments in the show",
    "Oh great, another softwware update that breaks everything!",
]

# Classify one document per call. Only the 'label' field of the first result
# is kept; the rest of the result is discarded.
sentiments = [sentiment_pipeline(text)[0]['label'] for text in senti_documents]

# Show each document next to its predicted label.
for text, label in zip(senti_documents, sentiments):
    print(f"Document: {text}\nSentiment: {label}\n")

Document: I am blown away! This product is so great!
Sentiment: POSITIVE

Document: If given a choice, I would never come back again
Sentiment: NEGATIVE

Document: It is fine. Some slow parts and some good moments in the show
Sentiment: POSITIVE

Document: Oh great, another softwware update that breaks everything!
Sentiment: POSITIVE



In [None]:
# Zero-shot topic classification: each document is scored against a list of
# candidate labels supplied at call time, so no topic-specific fine-tuning is
# required. The checkpoint and revision are pinned to the ones the
# "zero-shot-classification" task resolved to by default when this notebook
# was last run, which keeps the predictions reproducible.
topic_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    revision="d7645e1",
)

# News snippets to categorize
news_documents = [
    "The stock market surged today as investors responded positively to new economic policies.",
    "The local football team has won the championship after a nail-biting final match.",
    "New advancements in artificial intelligence are revolutionizing industries worldwide.",
    "The government has introduced new policies aimed at addressing climate change.",
]

# Candidate topics the classifier chooses between
candidate_labels = ["Politics", "Technology", "Sports", "Business", "Health", "Entertainment"]

# Predicted topic per document. The first entry of 'labels' is taken as the
# prediction; the pipeline is expected to return labels ordered from most to
# least likely.
topics = [
    topic_classifier(doc, candidate_labels=candidate_labels)['labels'][0]
    for doc in news_documents
]

# Output the topic for each document
for doc, topic in zip(news_documents, topics):
    print(f"Document: {doc}\nPredicted Topic: {topic}\n")

No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


Document: The stock market surged today as investors responded positively to new economic policies.
Predicted Topic: Business

Document: The local football team has won the championship after a nail-biting final match.
Predicted Topic: Sports

Document: New advancements in artificial intelligence are revolutionizing industries worldwide.
Predicted Topic: Technology

Document: The government has introduced new policies aimed at addressing climate change.
Predicted Topic: Politics

