## Goal: Build a search engine that searches through the FAQ document
The search engine should also be able to sort the search results by relevance (most to least)

In [2]:
import requests
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import BertModel, BertTokenizer
from tqdm import tqdm

### Import the FAQ data

In [3]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [4]:
# sample text-section-question unnested from "course" JSON tag
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

### Convert JSON to a pandas dataframe

In [5]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [6]:
df.tail()

Unnamed: 0,course,section,question,text
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...
947,mlops-zoomcamp,Module 6: Best practices,How to destroy infrastructure created via GitH...,Problem description\nInfrastructure created in...


### Limit search results to only within the course "data-engineering-zoomcamp"

In [7]:
# example to do filtering in pandas df
df[df.course == 'data-engineering-zoomcamp'].head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [8]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

## Section 1: Text Search
Search based on text, which does not take into account the semantic similarities of different words 

Vector spaces

- turn the docs (text) into vectors
- term-document matrix:
    - rows: documents
    - columns: words/tokens
- bag of words:
    - disregard word order. Only the appearance of word matters.
    - sparse matrix.

CountVectorizer "convert a collection of text documents to a matrix of token counts", i.e. vectorizes a collection document by the terms within into a sparse matrix. Each row represents a document, each column represents a word/token found in the collection. If a row-column has a value of 1, it means this word/token is found within a particular document.

In [9]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


Similar to CountVectorizer, TfidfVectorizer vectorizes a collection document by the terms within into a sparse matrix. However, the scoring matrix used for each row of document looks at the frequency of each word/token within the document.<br>
A word that appears frequently is treated similarly to a stop word. Hence if a word appears less in a document, the score is higher as it is deemed more important.

In [10]:
cv = TfidfVectorizer(stop_words='english', min_df=5)
# X = cv.fit_transform(docs_example)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


### Vectorize query

In [11]:
query = "Do I need to know python to sign up for the January course?"

q = cv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# example of vectorized query
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

{'01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '19': np.float64(0.0),
 '1st': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(0.0),
 '35': np.float64(0.0),
 '403': np.float64(0.0),
 '42': np.float64(0.0),
 '50': np.float64(0.0),
 '5000': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '60': np.float64(0.0),
 '600': np.floa

In [13]:
# example of vectorized doc
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '19': np.float64(0.0),
 '1st': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(0.0),
 '35': np.float64(0.0),
 '403': np.float64(0.0),
 '42': np.float64(0.0),
 '50': np.float64(0.0),
 '5000': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '60': np.float64(0.0),
 '600': np.floa

### Compare similarity between query and doc by taking the dot product between their weights

In [14]:
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T

(df_qd['query'] * df_qd['doc']).sum()

np.float64(0.0)

In [15]:
X.dot(q.T).toarray()

array([[0.19464486],
       [0.        ],
       [0.        ],
       [0.06011641],
       [0.04932915],
       [0.        ],
       [0.        ],
       [0.13477565],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.15899187],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.07431408],
       [0.        ],
       [0.        ],
       [0.05779673],
       [0.07243428],
       [0.        ],
       [0.05174293],
       [0.16373635],
       [0.08076031],
       [0.        ],
       [0.09755254],
       [0.        ],
       [0.21069625],
       [0.12067781],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.06381749],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.00910541],
       [0.02835681],
       [0.05480112],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.024

### Cosine Similarity
In practice, we usually use cosine similarity. It produces a 2D array which can be flattened into a 1D matrix of similarity scores.

In [16]:
score = cosine_similarity(X, q).flatten()
print("Indices with the top 5 similarity score:")
np.argsort(score)[-5:]

Indices with the top 5 similarity score:


array([764,  27, 806, 577, 445])

### Comparing query with the top 5 results
Notice that the search only based on words similarity for relevancy hence some top results do not really answer the question

In [17]:
print(query)
df.iloc[806].text

Do I need to know python to sign up for the January course?


'Technically, yes. Advisable? Not really. Reasons:\nSome homework(s) asks for specific python library versions.\nAnswers may not match in MCQ options if using different languages other than Python 3.10 (the recommended version for 2023 cohort)\nAnd as for midterms/capstones, your peer-reviewers may not know these other languages. Do you want to be penalized for others not knowing these other languages?\nYou can create a separate repo using course’s lessons but written in other languages for your own learnings, but not advisable for submissions.\ntx[source]'

### Vectorise using multiple fields for better similarity score generation and relevancy

In [18]:
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X

transformers['text'].get_feature_names_out()
matrices

{'section': <948x66 sparse matrix of type '<class 'numpy.float64'>'
 	with 3090 stored elements in Compressed Sparse Row format>,
 'question': <948x291 sparse matrix of type '<class 'numpy.float64'>'
 	with 3431 stored elements in Compressed Sparse Row format>,
 'text': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
 	with 23808 stored elements in Compressed Sparse Row format>}

In [19]:
query = "I just signed up. Is it too late to join the course?"

q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()
print("Indices with the top 5 similarity score:")
np.argsort(score)[-5:]

Indices with the top 5 similarity score:


array([ 22, 448, 449, 440,   0])

In [20]:
# compare the new results with the query
print(query)
df.iloc[448].text

I just signed up. Is it too late to join the course?


"Here’s how you join a in Slack: https://slack.com/help/articles/205239967-Join-a-channel\nClick “All channels” at the top of your left sidebar. If you don't see this option, click “More” to find it.\nBrowse the list of public channels in your workspace, or use the search bar to search by channel name or description.\nSelect a channel from the list to view it.\nClick Join Channel.\nDo we need to provide the GitHub link to only our code corresponding to the homework questions?\nYes. You are required to provide the URL to your repo in order to receive a grade"

### Filter dataframe to limit results to only from the course "data-engineering-zoomcamp"

In [21]:
# generate a mask of 0 and 1 using a filtering criteria
mask = (df.course == 'data-engineering-zoomcamp').values
score = score * mask

In [22]:
idx = np.argsort(-score)[:10]

In [23]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
22,data-engineering-zoomcamp,General course-related questions,Environment - Do we really have to use GitHub ...,It's up to you which platform and environment ...
27,data-engineering-zoomcamp,General course-related questions,Environment - The GCP and other cloud provider...,You can do most of the course without a cloud....
287,data-engineering-zoomcamp,Module 4: analytics engineering with dbt,CREATE TABLE has columns with duplicate name l...,This error could result if you are using some ...
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
113,data-engineering-zoomcamp,Module 1: Docker and Terraform,"Postgres - ""Column does not exist"" but it actu...","In the join queries, if we mention the column ..."
11,data-engineering-zoomcamp,General course-related questions,Certificate - Can I follow the course in a sel...,"No, you can only get a certificate if you fini..."
28,data-engineering-zoomcamp,General course-related questions,Environment - I want to use AWS. May I do that?,"Yes, you can. Just remember to adapt all the i..."
148,data-engineering-zoomcamp,Module 1: Docker and Terraform,Terraform - Do I need to make another service ...,One service account is enough for all the serv...


### Tweaking importance of field "question" to give it more relevance to the query
Boost "question" with 3x more importance than other fields

In [24]:
boost = {'question': 3.0}

score = np.zeros(len(df))

for f in fields:
    b = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
    score = score + b * s

In [25]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).values
    score = score * mask

In [26]:
idx = np.argsort(-score)[:5]
results = df.iloc[idx]
results.to_dict(orient='records')

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcem

### Putting it all together

Combining all the methods used previously into a single class, where it:
- creates an instance of TextSearch class of fields we want to vectorise.
- fits specific fields from the FAQ documents into a vectorizer.
- performs a search based on query with options to boost importance for specific fields.

In [27]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [28]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

## Section 2: Vector Search
Search based on semantic similarities of different words

### What are Embeddings?
- Conversion to Numbers: Embeddings transform different words, sentences and documents into dense vectors (arrays with numbers).
- Capturing Similarity: They ensure similar items have similar numerical vectors, illustrating their closeness in terms of characteristics.
- Dimensionality Reduction: Embeddings reduce complex characteristics into vectors.
- Use in Machine Learning: These numerical vectors are used in machine learning models for tasks such as recommendations, text analysis, and pattern recognition.

### SVD
Singular Value Decomposition is the simplest way to turn Bag-of-Words representation into embeddings

This way we still don't preserve the word order (because it wasn't in the Bag-of-Words representation) but we reduce dimensionality and capture synonyms.

We won't go into mathematics, it's sufficient to know that SVD "compresses" our input vectors in such a way that as much as possible of the original information is retained.

This compression is lossy compression - meaning that we won't be able to restore the 100% of the original vector, but the result is close enough.

In [29]:
X = matrices['text']
cv = transformers['text']

svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.09653074, -0.08233057, -0.10110293, -0.07838385,  0.06698649,
       -0.06213829,  0.01884405, -0.16418254, -0.23440696,  0.27849614,
        0.05038335,  0.03421134, -0.11498112,  0.11085739,  0.01052434,
       -0.04832532])

In [30]:
query = 'I just signed up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.05790251, -0.03861363, -0.05534069, -0.02790531,  0.03848932,
       -0.06424028,  0.00917504, -0.10714869, -0.16278107,  0.17817755,
        0.04562127,  0.03989174, -0.07375791,  0.07767612,  0.03011026,
        0.00141538])

In [31]:
np.dot(X_emb[0], Q_emb[0])

np.float64(0.14966807974364665)

In [32]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

['Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu',
 "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts 

### Non-Negative Matrix Factorization
SVD creates values with negative numbers. It's difficult to interpet them.

NMF (Non-Negative Matrix Factorization) is a similar concept, except for non-negative input matrices it produces non-negative results.

We can interpret each of the columns (features) of the embeddings as different topic/concents and to what extent this document is about this concept.

Let's use it for the documents:

In [33]:
nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.12916784, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [34]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.08535461, 0.00224518, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00176301,
       0.        ])

In [35]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

["The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'Please choose the closest one to your answer. Also do not post your answer in the course slack channel.',
 "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigore

### BERT
The problem with the previous two approaches is that they don't take into account the word order. They just treat all the words separately (that's why it's called "Bag-of-Words")

BERT and other transformer models don't have this problem.

Let's create embeddings with BERT. We will use the Hugging Face library for that.

In [36]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

We need:

- tokenizer: for turning text into vectors
- model: for compressing the text into embeddings

First, we tokenize the text

In [37]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

Then we compute the embeddings:

In [38]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [40]:
hidden_states[0].shape

torch.Size([15, 768])

Now we need to compress the embeddings:

In [41]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [42]:
# 2 rows of embeddings for the 2 rows in texts
sentence_embeddings

tensor([[ 0.3600, -0.1607,  0.3545,  ...,  0.0429,  0.0348, -0.0382],
        [ 0.1785, -0.5000,  0.2528,  ..., -0.1141, -0.3361,  0.4110]])

And convert them to a numpy array

In [43]:
X_emb = sentence_embeddings.numpy()

In [44]:
sentence_embeddings_cpu = sentence_embeddings.cpu()

In [45]:
def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i:i+n]
        result.append(batch)
    return result

In [48]:
texts = df['text'].tolist()
text_batches = make_batches(texts, 8)

all_embeddings = []

for batch in tqdm(text_batches):
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**encoded_input)
        hidden_states = outputs.last_hidden_state
        
        batch_embeddings = hidden_states.mean(dim=1)
        batch_embeddings_np = batch_embeddings.cpu().numpy()
        all_embeddings.append(batch_embeddings_np)

final_embeddings = np.vstack(all_embeddings)

100%|█████████████████████████████████████████| 119/119 [01:29<00:00,  1.33it/s]


In [49]:
def compute_embeddings(texts, batch_size=8):
    text_batches = make_batches(texts, 8)
    
    all_embeddings = []
    
    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    
        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state
            
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)
    
    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [50]:
X_text = compute_embeddings(df['text'].tolist())

100%|█████████████████████████████████████████| 119/119 [01:26<00:00,  1.37it/s]
