In [23]:
import pandas
import requests

In [24]:
import requests

# Download the course FAQ dump: a JSON list of courses, each carrying a
# 'course' name and a list of 'documents'.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP failure directly instead of letting it
# surface later as a confusing JSON decoding error.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per FAQ entry, tagging each with its course name.
# New dicts are built so the raw payload is left unmodified.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [25]:
# Inspect the first flattened record, which now carries the added 'course' key.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [26]:
import pandas as pd

# Load the flattened FAQ records into a table, fixing the column order so
# that `course` comes first.
faq_columns = ['course', 'section', 'question', 'text']
df = pd.DataFrame(documents, columns=faq_columns)
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


# Keyword based search

In [27]:
# Exact-match filter on the course column; show the first two hits.
df.loc[df["course"] == "data-engineering-zoomcamp"].head(2)

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...


# Vectorization

In [28]:
# Six short toy sentences used to illustrate vectorization.
# NOTE(review): this rebinds `documents`, which previously held the list of FAQ
# dicts that `df` was built from; re-running the DataFrame cell after this one
# would build it from these strings instead. A distinct name would be safer.
documents = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course",
    " All the best for the course"
]

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words baseline: raw term counts with English stop words removed.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(documents)
names = cv.get_feature_names_out()

# Lay the counts out with one row per term and one column per document.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs

Unnamed: 0,0,1,2,3,4,5
15th,1,0,0,0,0,0
2024,1,0,0,0,0,0
best,0,0,0,0,0,1
cloud,0,0,0,0,1,0
course,1,0,0,0,1,1
date,0,0,1,0,0,0
github,0,1,0,0,0,0
google,0,0,0,0,1,0
homeworks,0,0,1,0,0,0
jan,1,0,0,0,0,0


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same toy corpus, now weighted by TF-IDF instead of raw counts.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(documents)
names = cv.get_feature_names_out()

# One row per term, one column per document; rounded only for display.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5
15th,0.47,0.0,0.0,0.0,0.0,0.0
2024,0.47,0.0,0.0,0.0,0.0,0.0
best,0.0,0.0,0.0,0.0,0.0,0.82
cloud,0.0,0.0,0.0,0.0,0.47,0.0
course,0.33,0.0,0.0,0.0,0.33,0.57
date,0.0,0.0,0.5,0.0,0.0,0.0
github,0.0,0.58,0.0,0.0,0.0,0.0
google,0.0,0.0,0.0,0.0,0.47,0.0
homeworks,0.0,0.0,0.5,0.0,0.0,0.0
jan,0.47,0.0,0.0,0.0,0.0,0.0


### Then queries will also be represented using the same vectorizer

In [31]:
# A free-text question to match against the toy documents.
query = "Do I need to know python to sign up for the January course?"

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# query is expressed over the same vocabulary as the documents.
q = cv.transform([query])
q.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.56921261,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.82219037, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

In [32]:
# Vocabulary terms from the fitted vectorizer; position i labels column i of X and q.
names

array(['15th', '2024', 'best', 'cloud', 'course', 'date', 'github',
       'google', 'homeworks', 'jan', 'listed', 'participation',
       'prerequisites', 'python', 'registration', 'required', 'setup',
       'start', 'starts', 'submit'], dtype=object)

In [33]:
# Label each weight of the query vector with its vocabulary term.
query_dict = {term: weight for term, weight in zip(names, q.toarray()[0])}
query_dict


{'15th': np.float64(0.0),
 '2024': np.float64(0.0),
 'best': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.5692126078464125),
 'date': np.float64(0.0),
 'github': np.float64(0.0),
 'google': np.float64(0.0),
 'homeworks': np.float64(0.0),
 'jan': np.float64(0.0),
 'listed': np.float64(0.0),
 'participation': np.float64(0.0),
 'prerequisites': np.float64(0.0),
 'python': np.float64(0.8221903715494888),
 'registration': np.float64(0.0),
 'required': np.float64(0.0),
 'setup': np.float64(0.0),
 'start': np.float64(0.0),
 'starts': np.float64(0.0),
 'submit': np.float64(0.0)}

In [34]:
# Length of the dense query vector (one entry per vocabulary term).
q.toarray()[0].shape

(20,)

In [35]:
# Same term -> weight view for the second toy document (row 1 of X).
doc_dict = {term: weight for term, weight in zip(names, X.toarray()[1])}
doc_dict

{'15th': np.float64(0.0),
 '2024': np.float64(0.0),
 'best': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.0),
 'date': np.float64(0.0),
 'github': np.float64(0.5773502691896257),
 'google': np.float64(0.0),
 'homeworks': np.float64(0.0),
 'jan': np.float64(0.0),
 'listed': np.float64(0.5773502691896257),
 'participation': np.float64(0.0),
 'prerequisites': np.float64(0.5773502691896257),
 'python': np.float64(0.0),
 'registration': np.float64(0.0),
 'required': np.float64(0.0),
 'setup': np.float64(0.0),
 'start': np.float64(0.0),
 'starts': np.float64(0.0),
 'submit': np.float64(0.0)}

In [36]:
# NOTE(review): repeats the display at the end of the previous cell;
# `doc_dict` is unchanged in between, so this cell looks redundant.
doc_dict

{'15th': np.float64(0.0),
 '2024': np.float64(0.0),
 'best': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.0),
 'date': np.float64(0.0),
 'github': np.float64(0.5773502691896257),
 'google': np.float64(0.0),
 'homeworks': np.float64(0.0),
 'jan': np.float64(0.0),
 'listed': np.float64(0.5773502691896257),
 'participation': np.float64(0.0),
 'prerequisites': np.float64(0.5773502691896257),
 'python': np.float64(0.0),
 'registration': np.float64(0.0),
 'required': np.float64(0.0),
 'setup': np.float64(0.0),
 'start': np.float64(0.0),
 'starts': np.float64(0.0),
 'submit': np.float64(0.0)}

In [37]:
# Put the query and document weights side by side, one row per term.
df_qd = pd.DataFrame({
    'query': pd.Series(query_dict),
    'doc': pd.Series(doc_dict),
})

In [38]:
# Term weights of the query and the document, side by side.
df_qd

Unnamed: 0,query,doc
15th,0.0,0.0
2024,0.0,0.0
best,0.0,0.0
cloud,0.0,0.0
course,0.569213,0.0
date,0.0,0.0
github,0.0,0.57735
google,0.0,0.0
homeworks,0.0,0.0
jan,0.0,0.0


In [39]:
# Dot product by hand: multiply the weights term by term, then add them up.
df_qd['query'].mul(df_qd['doc']).sum()

np.float64(0.0)

In [40]:
# The same dot product for every document at once, as one sparse matrix product.
(X @ q.T).toarray()

array([[0.18619659],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.57467553],
       [0.32400299]])

In [41]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every document row against the query vector.
res_cos = cosine_similarity(X, q)

In [42]:
# Collapse the column of similarities into a flat 1-D array.
res_cos.flatten()

array([0.18619659, 0.        , 0.        , 0.        , 0.57467553,
       0.32400299])

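Each entry of this array (`[0.18619659, 0., 0., 0., 0.57467553, 0.32400299]`) is the cosine similarity between the query and one of the six documents, in document order. The values match the dot products from `X.dot(q.T)` above, which indicates that the TF-IDF vectors are already length-normalised.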

In [43]:
# Pair each toy document with its similarity score; the scalar query string
# is repeated on every row.
res_df = pd.DataFrame({
    "query": query,
    "docs": documents,
    "similarity": res_cos.flatten(),
})

In [44]:
# One row per toy document with its similarity to the query.
res_df

Unnamed: 0,query,docs,similarity
0,Do I need to know python to sign up for the Ja...,Course starts on 15th Jan 2024,0.186197
1,Do I need to know python to sign up for the Ja...,Prerequisites listed on GitHub,0.0
2,Do I need to know python to sign up for the Ja...,Submit homeworks after start date,0.0
3,Do I need to know python to sign up for the Ja...,Registration not required for participation,0.0
4,Do I need to know python to sign up for the Ja...,Setup Google Cloud and Python before course,0.574676
5,Do I need to know python to sign up for the Ja...,All the best for the course,0.324003


# Vectorizing all the documents

`min_df=3`: a term (word) must appear in at least 3 different documents (texts) to be included in the vocabulary.

In [54]:
# Fit one TF-IDF vectorizer per text field of the FAQ table, keeping both the
# fitted vectorizer and the resulting document-term matrix under the field name.
fields = ['section', 'question', 'text']
transformers, matrices = {}, {}

for field in fields:
    # min_df=3 drops terms that occur in fewer than three documents.
    cv = transformers[field] = TfidfVectorizer(stop_words='english', min_df=3)
    X = matrices[field] = cv.fit_transform(df[field])

print(transformers['text'].get_feature_names_out())
matrices['text']

['001' '01' '02' ... 'zones' 'zoom' 'zoomcamp']


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 26463 stored elements and shape (948, 2118)>

In [59]:
# Terms retained in the vocabulary of the 'text' vectorizer.
transformers["text"].get_feature_names_out()

array(['001', '01', '02', ..., 'zones', 'zoom', 'zoomcamp'], dtype=object)

In [60]:
# The fitted vectorizers, keyed by field name.
transformers

{'section': TfidfVectorizer(min_df=3, stop_words='english'),
 'question': TfidfVectorizer(min_df=3, stop_words='english'),
 'text': TfidfVectorizer(min_df=3, stop_words='english')}

In [74]:
query = "are late submission allowed?"

# Vectorize the query with the 'text' vectorizer and compare it against the
# 'text' matrix, giving one similarity per FAQ row.
q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).ravel()

In [75]:
# Dense view of the query vector over the 'text' vocabulary.
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [76]:
# Per-row similarity scores for the query.
# NOTE(review): this displays the whole array; a slice or summary would keep
# the saved output short.
score

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.45875925, 0.        , 0.        , 0.12564071, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.14841476, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [77]:
# Boolean array marking the rows that belong to the selected course.
mask = df['course'].eq('data-engineering-zoomcamp').to_numpy()

In [78]:
# Zero the score of every row outside the selected course (False multiplies as 0).
score = score * mask

`np.argsort`: returns the indices that would sort an array in ascending order.
`-score`: negating the scores before sorting gives descending order, because `argsort` sorts in ascending order by default.
`[:10]`: this slice keeps the first 10 of the sorted indices, i.e. the 10 best matches (use `[:1]` to keep only the single best match).

In [81]:
import numpy as np

# Negating the scores makes the ascending argsort rank best-first; keep the top hit.
idx = (-score).argsort()[:1]

In [84]:
# Answer text of the best-scoring row (positional lookup).
df['text'].iloc[idx].values

array(['No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y\nOlder news:[source1] [source2]'],
      dtype=object)

In [85]:
# Per-field weights; any field not listed here counts with weight 1.0.
boost = {'question': 3.0}

# Accumulate a weighted sum of the per-field cosine similarities.
score = np.zeros(len(df))

for f in fields:
    weight = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    field_score = cosine_similarity(matrices[f], q).flatten()
    score = score + weight * field_score

In [87]:
# Wrap the combined scores in a single-column DataFrame for display.
pd.DataFrame(score)

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
943,0.0
944,0.0
945,0.0
946,0.0


In [88]:
# Exact-match constraints: column name -> required value.
filters = {'course': 'data-engineering-zoomcamp'}

# Each filter zeroes the score of the rows that do not match it.
for field, value in filters.items():
    mask = df[field].eq(value).to_numpy()
    score = score * mask

In [89]:
# Rank by combined score, best first, and return the ten best rows as dicts.
idx = (-score).argsort()[:10]
results = df.iloc[idx]
results.to_dict(orient='records')

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Homework - Are late submissions of homework allowed?',
  'text': 'No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y\nOlder news:[source1] [source2]'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Project - What is Project Attemp #1 and Project Attempt #2 exactly?',
  'text': 'You will have two attempts for a project. If the first project deadline is over and you’re late or you submit the project and fail the first attempt, you have another chance to submit the project with the second attempt.'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Leaderboard - I am not on the leaderboard / how do I know which one I am o

# SVD

In [90]:
from sklearn.decomposition import TruncatedSVD

# Work on the TF-IDF matrix and vectorizer fitted on the 'text' field.
X = matrices['text']
cv = transformers['text']

# Reduce each document to a dense 16-dimensional embedding. TruncatedSVD's
# default solver is randomized, so the seed is fixed to make the embeddings
# (and any ranking derived from them) repeatable across re-runs.
svd = TruncatedSVD(n_components=16, random_state=42)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.08799882, -0.07488189, -0.10079973,  0.0510524 ,  0.05616879,
       -0.06279214,  0.02022308,  0.04794348, -0.19886301,  0.33966346,
        0.06476396,  0.10088612,  0.08558236,  0.10063436,  0.02675178,
       -0.04585441])

In [96]:
# NOTE(review): 'singned' looks like a typo for 'signed'; left unchanged
# because editing the query string could change the vector computed below.
query = 'I just singned up. Is it too late to join the course?'

# Vectorize with the 'text' TF-IDF vectorizer, then project into the SVD space.
Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.04353619, -0.03068428, -0.04405571,  0.01170707,  0.02696884,
       -0.05206852,  0.01279189,  0.02730423, -0.11288842,  0.18460511,
        0.05231042,  0.07324509,  0.04739405,  0.05120643,  0.02015225,
       -0.01442465])

In [97]:
# Unnormalised dot product between the first document's embedding and the query's.
np.dot(X_emb[0], Q_emb[0])

np.float64(0.12385930728290773)

In [98]:
# Rank every document embedding against the query embedding and keep the best one.
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:1]
# argsort yields row positions, so look them up positionally with iloc;
# .loc would treat them as index labels and only works while df keeps its
# default 0..n-1 index.
list(df.iloc[idx].text)

['Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.']

# Non-Negative Matrix Factorization

In [100]:
from sklearn.decomposition import NMF

# Factorize the TF-IDF matrix into 16 non-negative components. The seed is
# fixed so that any randomness in the initialisation or solver does not change
# the embeddings between re-runs.
nmf = NMF(n_components=16, random_state=42)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.31335156,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [101]:
# Re-vectorize the query and project it with the fitted NMF model.
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.        , 0.0012548 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.17560933,
       0.        , 0.        , 0.        , 0.        , 0.00077611,
       0.        ])

In [103]:
# Rank every NMF document embedding against the query embedding and keep the best one.
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:1]
# argsort yields row positions, so look them up positionally with iloc;
# .loc would treat them as index labels and only works while df keeps its
# default 0..n-1 index.
list(df.iloc[idx].text)

["The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."]

# BERT

In [104]:
!pip install transformers tqdm

Collecting transformers
  Downloading transformers-4.42.4-py3-none-any.whl.metadata (43 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m451.8 kB/s[0m eta [36m0:00:00[0m1m703.6 kB/s[0m eta [36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.23.4-py3-none-any.whl.metadata (12 kB)
Collecting numpy<2.0,>=1.17 (from transformers)
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m[31m3.3 MB/s[0m eta [36m0:00:01[0m
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m814.9 kB/s[0m eta [36m0:00:00