In [1]:
import pandas as pd
import requests


### Downloading the data

In [2]:
# Download the FAQ corpus: a JSON list of courses, each carrying a list of documents.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per document, tagging each record with its course name.
# New dicts are built so that documents_raw is left unmodified.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [3]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# Tabulate the flattened records, fixing the column order explicitly.
column_order = ['course', 'section', 'question', 'text']
df = pd.DataFrame(documents, columns=column_order)

In [5]:
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


Vector spaces - we turn each document into a vector of encodings.
For each word in our dictionary we have a column.
- term-document matrix
    - rows: documents
    - columns: words/tokens

In scikit-learn there's a class that does this called CountVectorizer





In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# min_df=5 keeps only terms that occur in at least 5 documents; the intent is to
# drop tokens coming from rarely asked or non-English questions.
cv = CountVectorizer(min_df=5).fit(df['text'])
cv.get_feature_names_out().shape

(1524,)

In [7]:
cv.get_feature_names_out()


array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'], dtype=object)

In [8]:
# Small hand-written corpus used to illustrate the vectorizer.
documents_examples = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course",
]

# stop_words='english' removes common English filler words from the vocabulary.
cv = CountVectorizer(stop_words='english').fit(documents_examples)
cv.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [9]:
X = cv.transform(documents_examples)
X.todense()

matrix([[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]])

In [10]:
# This representation is called Bag of Words: word order is ignored, only the
# occurrence of each word in a document matters.
# The matrix is sparse, meaning most of its values are 0.
# Transposed for display: terms become rows, documents become columns.
pd.DataFrame(X.todense(), columns=cv.get_feature_names_out()).transpose()

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw term counts for the full corpus (no TF-IDF weighting is applied in this cell).

cv = CountVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df['text'])
names = cv.get_feature_names_out()
# Transposed for display: one row per term, one column per document.
df_docs = pd.DataFrame(X.toarray(), columns=names).transpose()
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0,0,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
yml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
youtube,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF down-weights terms that appear in many documents, so less frequent terms count for more.

tf = TfidfVectorizer(stop_words='english', min_df=5)
X = tf.fit_transform(df['text'])
names = tf.get_feature_names_out()
# Transposed for display: one row per term, one column per document.
df_docs = pd.DataFrame(X.toarray(), columns=names).transpose()
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
02,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
03,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
04,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
05,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.000000,0.279891,0.0,0.0,0.000000,0.205982,0.201973,0.153896,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
yml,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.107298,0.0,0.0,0.0,0.000000
youtube,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.153475,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
zip,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000


Instead of the integer counts produced by CountVectorizer (mostly 1s and 0s for short documents), TfidfVectorizer assigns each term a float weight between 0 and 1.

In [13]:
query = "Do I need to know python to sign up for the January course?"

q = tf.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [14]:
query_dict = dict(zip(names, q.toarray()[0]))
len(query_dict)

1333

In [15]:
doc_dict = dict(zip(names, X.toarray()[1])) # only document 1
len(doc_dict)

1333

In [16]:
# if we look at the word course in the query:
query_dict['course']

0.38148200594064524

In [17]:
doc_dict['course']

0.0

We multiply query_dict[key] * doc_dict[key] for every key and sum the products, which gives a number that represents the similarity between that document and the query. The higher the similarity, the more relevant the document is to our query. Essentially, we are taking the dot product. Because TfidfVectorizer L2-normalizes each vector by default, this dot product is also the cosine similarity.

In [18]:
query = "Do I need to know python to sign up for the January course?"

q = tf.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [19]:
X.dot(q.T) #cosine similarity


<948x1 sparse matrix of type '<class 'numpy.float64'>'
	with 294 stored elements in Compressed Sparse Row format>

In [20]:
# X.dot(q.T).todense()

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

score = cosine_similarity(X, q).flatten()

In [22]:
score

array([0.19464486, 0.        , 0.        , 0.06011641, 0.04932915,
       0.        , 0.        , 0.13477565, 0.        , 0.        ,
       0.        , 0.15899187, 0.        , 0.        , 0.        ,
       0.07431408, 0.        , 0.        , 0.05779673, 0.07243428,
       0.        , 0.05174293, 0.16373635, 0.08076031, 0.        ,
       0.09755254, 0.        , 0.21069625, 0.12067781, 0.        ,
       0.        , 0.        , 0.        , 0.06381749, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00910541,
       0.02835681, 0.05480112, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02469964, 0.05129386, 0.06013439,
       0.05252658, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.04169018, 0.        , 0.        , 0.        , 0.0075293 ,
       0.        , 0.        , 0.01971463, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [23]:
import numpy as np
np.argsort(score)[-5:] # sorts from lowest to highest, so we need the last ones

array([764,  27, 806, 577, 445])

In [24]:
query = "I just discovered the course, is it too late to join?"

q = tf.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [25]:
score = cosine_similarity(X, q).flatten()

In [26]:
np.argsort(score)[-5:]

array([ 22, 448, 449, 440,   0])

In [27]:
df.iloc[22].text


"It's up to you which platform and environment you use for the course.\nGithub codespaces or GCP VM are just possible options, but you can do the entire course from your laptop."

In [28]:
df.iloc[448].text

"Here’s how you join a in Slack: https://slack.com/help/articles/205239967-Join-a-channel\nClick “All channels” at the top of your left sidebar. If you don't see this option, click “More” to find it.\nBrowse the list of public channels in your workspace, or use the search bar to search by channel name or description.\nSelect a channel from the list to view it.\nClick Join Channel.\nDo we need to provide the GitHub link to only our code corresponding to the homework questions?\nYes. You are required to provide the URL to your repo in order to receive a grade"

In [29]:
df.iloc[449].text

'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'

In [30]:
# Fit an independent TF-IDF model for each text field and keep both the fitted
# vectorizer and the resulting document-term matrix, keyed by field name.
fields = ['section', 'question', 'text']
vectorizers = {}
matrices = {}

for f in fields:
    tf = vectorizers[f] = TfidfVectorizer(stop_words='english', min_df=5)
    X = matrices[f] = tf.fit_transform(df[f])

In [31]:
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [39]:
# Per-field weights: a match in 'question' counts 3x and a match in 'text' 0.5x;
# any field not listed here (i.e. 'section') falls back to the default weight 1.0.
boosts = {
    'question': 3,
    'text': 0.5
}

n = len(df)
score = np.zeros(n)

# Sum the boosted cosine similarity between the query and every document, field by field.
for f in fields:
    boost = boosts.get(f, 1.0)
    X = matrices[f]
    q = vectorizers[f].transform([query])
    f_score = cosine_similarity(X, q).flatten()
    score += boost * f_score

In [40]:
score

array([3.28960182, 3.49512426, 2.70735166, 2.86194785, 3.49512426,
       3.49512426, 1.93689291, 3.58291062, 2.67242848, 3.49512426,
       3.10198469, 2.38161407, 0.49512426, 0.49512426, 0.49512426,
       0.54352887, 0.49512426, 2.63772182, 0.53277026, 0.49512426,
       0.49512426, 0.49512426, 0.64505807, 0.54772763, 0.49512426,
       0.49512426, 0.49512426, 0.63236164, 0.55926629, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.76390631, 3.49512426,
       1.72080809, 0.49512426, 0.49512426, 0.49512426, 0.5109058 ,
       0.51969835, 1.9654566 , 0.49512426, 0.51677312, 0.        ,
       0.        , 0.        , 0.        , 0.01402187, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.03369519, 0.        , 0.00490423,
       0.        , 0.        , 0.        , 0.        , 0.02910051,
       0.        , 0.        , 0.        , 0.        , 0.     

In [42]:
idx = np.argsort(-score)[:5]
df.iloc[idx]

Unnamed: 0,course,section,question,text
448,machine-learning-zoomcamp,General course-related questions,I’m new to Slack and can’t find the course cha...,Here’s how you join a in Slack: https://slack....
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
453,machine-learning-zoomcamp,General course-related questions,What are the deadlines in this course?,"For the 2023 cohort, you can see the deadlines..."


In [43]:
# we need to filter to a particular course
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).astype(int).values
    score = score * mask
score

array([3.28960182, 3.49512426, 2.70735166, 2.86194785, 3.49512426,
       3.49512426, 1.93689291, 3.58291062, 2.67242848, 3.49512426,
       3.10198469, 2.38161407, 0.49512426, 0.49512426, 0.49512426,
       0.54352887, 0.49512426, 2.63772182, 0.53277026, 0.49512426,
       0.49512426, 0.49512426, 0.64505807, 0.54772763, 0.49512426,
       0.49512426, 0.49512426, 0.63236164, 0.55926629, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.76390631, 3.49512426,
       1.72080809, 0.49512426, 0.49512426, 0.49512426, 0.5109058 ,
       0.51969835, 1.9654566 , 0.49512426, 0.51677312, 0.        ,
       0.        , 0.        , 0.        , 0.01402187, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.03369519, 0.        , 0.00490423,
       0.        , 0.        , 0.        , 0.        , 0.02910051,
       0.        , 0.        , 0.        , 0.        , 0.     

In [46]:
idx = np.argsort(-score)[:5]

In [47]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...


## Putting it all together

In [48]:
class TextSearch:
    """Minimal in-memory text search built on TF-IDF and cosine similarity.

    One TfidfVectorizer is fitted per text field. A query is scored against
    every field and the (optionally boosted) per-field scores are summed.
    """

    def __init__(self, text_fields):
        """Create an empty index.

        Args:
            text_fields: names of the record fields to index and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Index ``records``, fitting one TF-IDF model per text field.

        Args:
            records: data accepted by ``pd.DataFrame`` (e.g. a list of dicts);
                it must contain a column for every name in ``text_fields``.
            vectorizer_params: keyword arguments forwarded to each
                ``TfidfVectorizer``; ``None`` means library defaults.
        """
        # None instead of a shared mutable default dict.
        params = vectorizer_params or {}
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            tf = TfidfVectorizer(**params)
            self.matrices[f] = tf.fit_transform(self.df[f])
            self.vectorizers[f] = tf

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the best-matching records for ``query``.

        Args:
            query: free-text query string.
            n_results: maximum number of records to return.
            boost: optional mapping field name -> weight applied to that
                field's similarity; fields not listed use 1.0.
            filters: optional mapping column name -> required value; only
                records equal to the value in every listed column are eligible.

        Returns:
            A list of record dicts ordered by descending score (ties keep
            their original order). Records rejected by ``filters`` are never
            returned, so the list may be shorter than ``n_results``.
        """
        boost = boost or {}
        filters = filters or {}

        score = np.zeros(len(self.df))
        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        # Build a boolean eligibility mask rather than zeroing scores: a zeroed
        # score ties with legitimate zero-score matches (and beats negative
        # ones), which let filtered-out records leak into the results.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep = keep & (self.df[field] == value).to_numpy()

        # Rank only the eligible rows; a stable sort makes tie order deterministic.
        candidates = np.flatnonzero(keep)
        order = np.argsort(-score[candidates], kind='stable')[:n_results]
        idx = candidates[order]

        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [49]:
# Index the FAQ records on their three text fields.
index = TextSearch(text_fields=['section', 'question', 'text'])
index.fit(documents)

# Top 5 matches within one course, with hits in the question field weighted 3x.
index.search(
    'I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'},
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin