## 1. Download the data:

In [1]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

## Creating the dataframe:

In [2]:
import pandas as pd

df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [3]:
df[df.course == 'data-engineering-zoomcamp'].head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


Vector Spaces:

- Turn the docs into vectors
- term-document matrix:
    - rows: documents
    - columns: words/tokens
- Bag of words:
    - word order is lost
    - sparse matrix

### Vectorization
For Count Vectorizer and TF-IDF we will first use a simple example

In [4]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date in Python",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

Let's use a count vectorizer first:

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
X.todense()

matrix([[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]])

In [6]:
names = cv.get_feature_names_out()
names

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [7]:
pd.DataFrame(X.todense(), columns=cv.get_feature_names_out())

Unnamed: 0,15th,2024,cloud,course,date,github,google,homeworks,jan,listed,participation,prerequisites,python,registration,required,setup,start,starts,submit
0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
4,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0


In [8]:
df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


This representation is called "bag of words" - here we ignore the order of words, just focus on the words themselves. In many cases this is sufficient and gives pretty good results already.

Now let's replace it with `TfidfVectorizer`:

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4
15th,0.46,0.0,0.0,0.0,0.0
2024,0.46,0.0,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.48
course,0.37,0.0,0.0,0.0,0.39
date,0.0,0.0,0.46,0.0,0.0
github,0.0,0.58,0.0,0.0,0.0
google,0.0,0.0,0.0,0.0,0.48
homeworks,0.0,0.0,0.46,0.0,0.0
jan,0.46,0.0,0.0,0.0,0.0
listed,0.0,0.58,0.0,0.0,0.0


Implementing in our datasets

In [10]:
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00000000e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
斜杠,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
查找和替换,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
要了解键盘快捷键,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
要启用屏幕阅读器支持,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
#query = "Do I need to know python to sign up for the January course?"
query = "I just discovered the course, is it too late to join?"

q = cv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [28]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

{'00': np.float64(0.0),
 '00000000e': np.float64(0.0),
 '0002': np.float64(0.0),
 '00021': np.float64(0.0),
 '001': np.float64(0.0),
 '009s': np.float64(0.0),
 '01': np.float64(0.0),
 '02': np.float64(0.0),
 '020': np.float64(0.0),
 '028879': np.float64(0.0),
 '02d': np.float64(0.0),
 '03': np.float64(0.0),
 '0315': np.float64(0.0),
 '04': np.float64(0.0),
 '04d': np.float64(0.0),
 '05': np.float64(0.0),
 '051': np.float64(0.0),
 '054': np.float64(0.0),
 '06': np.float64(0.0),
 '06_spark_sql': np.float64(0.0),
 '07': np.float64(0.0),
 '07cd': np.float64(0.0),
 '08': np.float64(0.0),
 '09': np.float64(0.0),
 '0ms': np.float64(0.0),
 '0x3c947bc5': np.float64(0.0),
 '0x7efe331cf790': np.float64(0.0),
 '0x7f797010a590': np.float64(0.0),
 '0x7fbaf2666280': np.float64(0.0),
 '0x800701bc': np.float64(0.0),
 '0xa0': np.float64(0.0),
 '0xff': np.float64(0.0),
 '0zw04wdetqo': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '1000': np.float64(0.0),
 '100000': np.float64(0.0),
 

In [29]:
doc_dict = dict(zip(names, X.toarray()[2]))
doc_dict

{'00': np.float64(0.0),
 '00000000e': np.float64(0.0),
 '0002': np.float64(0.0),
 '00021': np.float64(0.0),
 '001': np.float64(0.0),
 '009s': np.float64(0.0),
 '01': np.float64(0.0),
 '02': np.float64(0.0),
 '020': np.float64(0.0),
 '028879': np.float64(0.0),
 '02d': np.float64(0.0),
 '03': np.float64(0.0),
 '0315': np.float64(0.0),
 '04': np.float64(0.0),
 '04d': np.float64(0.0),
 '05': np.float64(0.0),
 '051': np.float64(0.0),
 '054': np.float64(0.0),
 '06': np.float64(0.0),
 '06_spark_sql': np.float64(0.0),
 '07': np.float64(0.0),
 '07cd': np.float64(0.0),
 '08': np.float64(0.0),
 '09': np.float64(0.0),
 '0ms': np.float64(0.0),
 '0x3c947bc5': np.float64(0.0),
 '0x7efe331cf790': np.float64(0.0),
 '0x7f797010a590': np.float64(0.0),
 '0x7fbaf2666280': np.float64(0.0),
 '0x800701bc': np.float64(0.0),
 '0xa0': np.float64(0.0),
 '0xff': np.float64(0.0),
 '0zw04wdetqo': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '1000': np.float64(0.0),
 '100000': np.float64(0.0),
 

In [30]:
# producto punto entre el query y todo los documentos
X.dot(q.T).todense()

matrix([[0.31472128],
        [0.        ],
        [0.        ],
        [0.09783826],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.0965754 ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.09825566],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.22644529],
        [0.        ],
        [0.        ],
        [0.04166116],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.22668   ],
        [0.06983386],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.17306446],
        [0.07714984],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.03378114],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.16484429],
        [0.01978044],
        [0.03055563],
        [0.04847941],
        [0.        ],
        [0.02638028],
        [0.        ],
        [0

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
score = cosine_similarity(X, q).flatten() # Similar a producto punto

In [33]:
import numpy as np

In [35]:
np.argsort(score)[-5:]

array([440,  15,  22, 449,   0])

In [40]:
df.iloc[449].text

'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'

In [41]:
df.columns

Index(['course', 'section', 'question', 'text'], dtype='object')

In [42]:
fields = ['section', 'question', 'text']

In [52]:
matrices = {}
vectorizers = {}

for f in fields:
    cv = TfidfVectorizer(stop_words="english", min_df=5)
    X = cv.fit_transform(df[f])
    matrices[f] = X
    vectorizers[f] = cv   

In [53]:
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3090 stored elements and shape (948, 66)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3431 stored elements and shape (948, 291)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 23808 stored elements and shape (948, 1333)>}

In [54]:
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [55]:
n = len(df)

In [70]:
score = np.zeros(n)
query = "I just discovered the course, is it too late to join?"


boosts = {
    'question': 3,
    #'text': 0.5
    
}

for f in fields:
    q = vectorizers[f].transform([query])
    X = matrices[f]
    
    f_score = cosine_similarity(X,q).flatten()
    
    boost = boosts.get(f, 1.0)
    
    score = score + f_score  
    

In [71]:
filters = {
    "course": "data-engineering-zoomcamp",
    
}

In [72]:
for field, value in filters.items():
    
    mask = (df[field] == value).astype(int).values
    score = score * mask

In [73]:
#idx = np.argsort(score)[-5:]
idx = np.argsort(-score)[:5]

In [74]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...


In [78]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [79]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin