In [1]:
import requests 

# How was this converted to json
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [124]:
import pandas as pd
# Set the maximum number of rows and columns to display
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [87]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()


Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [88]:
# df[df.course == 'data-engineering-zoomcamp'].head()
df = df[df.course == 'data-engineering-zoomcamp']
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [89]:
# docs_example = [
#     "January course details, register now",
#     "Course prerequisites listed in January catalog",
#     "Submit January course homework by end of month",
#     "Register for January course, no prerequisites",
#     "January course setup: Python and Google Cloud"
# ]

# Vector spaces
- turn the docs into vectors
- term-document matrix:
  - rows: documents
  - columns: words/tokens
- BAG of words
  - order of words doesn't matter
  - only the frequency of words
  - sparse matrices. Most of the value are Zeros

In [90]:
from sklearn.feature_extraction.text import CountVectorizer # This library is used to create a vector space

In [91]:
cv = CountVectorizer()

In [92]:
# cv.fit(docs_example) # Load just the example documents

In [93]:
cv.fit(df.text) # Fit with an the Text from the documents that rapresent the answers

In [109]:
names = cv.get_feature_names_out()
names

array(['01', '04', '05', '10', '100', '11', '12', '13', '16', '17', '2019', '2024', '22', '24', '403', '5431', '5432', '7077', '80', '8080', 'able', 'about', 'above', 'access', 'account', 'across',
       'activate', 'add', 'added', 'adding', 'additional', 'address', 'admin', 'after', 'again', 'airflow', 'alexey', 'all', 'allows', 'already', 'also', 'alternative', 'alternatively', 'am', 'an',
       'anaconda', 'anaconda3', 'analytics', 'anand', 'and', 'another', 'ans', 'answer', 'any', 'apache', 'api', 'app', 'appear', 'appears', 'append', 'application', 'apply', 'appname', 'apt',
       'archives', 'are', 'argument', 'as', 'ask', 'assigned', 'at', 'attempting', 'auth', 'authentication', 'automatically', 'available', 'avoid', 'azure', 'back', 'bad', 'base', 'based', 'bash',
       'bashrc', 'be', 'because', 'been', 'before', 'being', 'below', 'best', 'better', 'between', 'bigquery', 'bin', 'binary', 'bit', 'blob', 'block', 'both', 'bq', 'branch', 'browser', 'bucket',
       'build', '

In [95]:
names.shape

(4074,)

We can see that we have too many words in the term-document matrix. We can reduce the number of words by:
- including only the words that appears at least in 5 documents

In [110]:
cv = CountVectorizer(min_df=5)
cv.fit(df.text) # Fit with an the Text from the documents that rapresent the answers
names = cv.get_feature_names_out()
names.shape
# Now we have fewer words

(883,)

In [111]:
names

array(['01', '04', '05', '10', '100', '11', '12', '13', '16', '17', '2019', '2024', '22', '24', '403', '5431', '5432', '7077', '80', '8080', 'able', 'about', 'above', 'access', 'account', 'across',
       'activate', 'add', 'added', 'adding', 'additional', 'address', 'admin', 'after', 'again', 'airflow', 'alexey', 'all', 'allows', 'already', 'also', 'alternative', 'alternatively', 'am', 'an',
       'anaconda', 'anaconda3', 'analytics', 'anand', 'and', 'another', 'ans', 'answer', 'any', 'apache', 'api', 'app', 'appear', 'appears', 'append', 'application', 'apply', 'appname', 'apt',
       'archives', 'are', 'argument', 'as', 'ask', 'assigned', 'at', 'attempting', 'auth', 'authentication', 'automatically', 'available', 'avoid', 'azure', 'back', 'bad', 'base', 'based', 'bash',
       'bashrc', 'be', 'because', 'been', 'before', 'being', 'below', 'best', 'better', 'between', 'bigquery', 'bin', 'binary', 'bit', 'blob', 'block', 'both', 'bq', 'branch', 'browser', 'bucket',
       'build', '

In [98]:
# X = cv.transform(docs_example)

In [115]:
X = cv.transform(df.text)
# df[df['course'] == 'data-engineering-zoomcamp']
X

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 17845 stored elements and shape (435, 883)>

In [120]:
X.toarray() # THis is showing also all the zero values but we are not interested in them. We are interested in the non-zero values therefore we will transform this into a dense matrix

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
         0,  0,  0, 

In [121]:
X.todense()
# In the list names we have all the words that are in the documents.
# In the matrix X we have the frequency of each word in every row of the matrix. A row is a answer in our original dataset

matrix([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
          0,  0

In [125]:
df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434
01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2
100,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
you,0,0,2,3,2,2,0,3,3,2,1,4,1,2,1,1,2,8,5,0,8,2,3,1,3,5,1,2,4,0,0,0,2,7,3,0,0,0,4,16,8,6,0,4,5,0,0,1,9,4,...,0,1,0,4,0,1,3,1,2,0,4,3,0,1,0,0,1,1,2,3,0,1,1,3,0,0,4,2,2,1,2,1,3,2,4,1,0,0,1,1,3,2,0,1,2,2,0,1,6,0
your,0,0,0,0,0,1,0,2,1,0,1,0,1,0,0,1,3,1,3,0,2,0,1,1,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,11,0,6,0,0,0,0,0,0,4,1,...,0,3,0,0,0,0,0,1,2,0,0,1,0,1,0,0,0,0,4,0,0,0,0,4,0,5,5,0,0,0,2,0,1,0,4,0,0,0,0,3,0,1,0,0,2,2,0,0,3,0
youtube,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [126]:
pd.DataFrame(X.todense(), columns=cv.get_feature_names_out())

Unnamed: 0,01,04,05,10,100,11,12,13,16,17,2019,2024,22,24,403,5431,5432,7077,80,8080,able,about,above,access,account,across,activate,add,added,adding,additional,address,admin,after,again,airflow,alexey,all,allows,already,also,alternative,alternatively,am,an,anaconda,anaconda3,analytics,anand,and,...,vscode,want,was,way,we,web,website,week,wget,what,whatever,when,where,which,while,who,whole,why,will,window,windows,winpty,with,within,without,won,work,worked,worker,working,works,workshop,would,write,writing,wrong,wsl,wsl2,www,yaml,year,yellow,yellow_tripdata_2021,yes,yml,you,your,youtube,zip,zoomcamp
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,2,0,0,0
431,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,2,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
432,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
433,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,1,0,0,0,3,6,3,0,0,1


In [127]:
pd.DataFrame(X.todense(), columns=cv.get_feature_names_out()).T # Transpose the matrix to read it better. I.E: Document 12 containts 1 "you", 1 "your" and 2 "youtube"

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434
01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2
100,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
you,0,0,2,3,2,2,0,3,3,2,1,4,1,2,1,1,2,8,5,0,8,2,3,1,3,5,1,2,4,0,0,0,2,7,3,0,0,0,4,16,8,6,0,4,5,0,0,1,9,4,...,0,1,0,4,0,1,3,1,2,0,4,3,0,1,0,0,1,1,2,3,0,1,1,3,0,0,4,2,2,1,2,1,3,2,4,1,0,0,1,1,3,2,0,1,2,2,0,1,6,0
your,0,0,0,0,0,1,0,2,1,0,1,0,1,0,0,1,3,1,3,0,2,0,1,1,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,11,0,6,0,0,0,0,0,0,4,1,...,0,3,0,0,0,0,0,1,2,0,0,1,0,1,0,0,0,0,4,0,0,0,0,4,0,5,5,0,0,0,2,0,1,0,4,0,0,0,0,3,0,1,0,0,2,2,0,0,3,0
youtube,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [133]:
df['text'].iloc[12] # Just as verification

'The zoom link is only published to instructors/presenters/TAs.\nStudents participate via Youtube Live and submit questions to Slido (link would be pinned in the chat when Alexey goes Live). The video URL should be posted in the announcements channel on Telegram & Slack before it begins. Also, you will see it live on the DataTalksClub YouTube Channel.\nDon’t post your questions in chat as it would be off-screen before the instructors/moderators have a chance to answer it if the room is very active.'

In [18]:
# cv = CountVectorizer(stop_words='english') # This will remove the stop words from the text like: in , no, of  etc that are not useful
# X = cv.fit_transform(docs_example)

# names = cv.get_feature_names_out()

# df_docs = pd.DataFrame(X.toarray(), columns=names).T
# df_docs

# THIS IS A BAG OF WORDS


In [140]:
cv = CountVectorizer(stop_words='english', min_df=5) # This will remove the stop words from the text like: in , no, of  etc that are not useful
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

# THIS IS A BAG OF WORDS

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434
01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2
100,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
yml,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0
youtube,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [135]:
from sklearn.feature_extraction.text import TfidfVectorizer # More imporrant words are the less frequent words.

In [22]:
# cv = TfidfVectorizer(stop_words='english')
# X = cv.fit_transform(docs_example)

# names = cv.get_feature_names_out()

# df_docs = pd.DataFrame(X.toarray(), columns=names).T
# # This is the term frequency inverse document frequency meaning that the words that are less frequent in the document will have a higher weight
# df_docs.round(2)

In [143]:
cv = TfidfVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out() # List of available words in all the documents

df_docs = pd.DataFrame(X.toarray(), columns=names).T
# This is the term frequency inverse document frequency meaning that the words that are less frequent in the document will have a higher weight
# Higher weight means that the word is more important because is less frequent in the document. Is this calculated for the whole dataset or for each document?
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.08,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.29,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.22,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
10,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.03
100,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.22,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.17,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.34,0.0,0.0,0.00,0.25,0.31,0.18,0.00,0.0,0.0,0.00,0.34,0.0,0.0,0.27,0.0,0.00,0.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.0,0.0,0.0,0.5,0.14,0.0,0.39,0.00,0.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.14,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.39,0.0,0.13,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.15,0.06,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.18,0.0,0.0,0.37,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00


In [144]:
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.08,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.29,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.22,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
10,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.03
100,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.22,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.17,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.34,0.0,0.0,0.00,0.25,0.31,0.18,0.00,0.0,0.0,0.00,0.34,0.0,0.0,0.27,0.0,0.00,0.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.0,0.0,0.0,0.5,0.14,0.0,0.39,0.00,0.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.14,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.39,0.0,0.13,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.15,0.06,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.18,0.0,0.0,0.37,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.0,0.0,0.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00


In [146]:
query = "Do I need to know python to sign up for the January course?"

In [147]:
q = cv.transform([query]) # Transforming the query to a vector space based on our training data.
q.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.  

In [166]:
X.dot(q.T) # This is the dot product of the matrix X and the query q. This will give us the similarity between the query and the documents in the dataset

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 128 stored elements and shape (435, 1)>

In [167]:
X.dot(q.T).todense() # This is the similarity between the query and the documents in the dataset in a dense matrix
# It gives a number between 0 and 1. The closer to 1 the more similar the document is to the query

matrix([[0.4389195 ],
        [0.        ],
        [0.        ],
        [0.10272051],
        [0.09061387],
        [0.        ],
        [0.        ],
        [0.37634321],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.34569631],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.16813692],
        [0.        ],
        [0.        ],
        [0.11305004],
        [0.14072055],
        [0.        ],
        [0.09828835],
        [0.35236451],
        [0.17154441],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.39401731],
        [0.24051751],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.13137124],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.01703551],
        [0.05064299],
        [0.11656627],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [22]:
print(q)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1, 15)>
  Coords	Values
  (0, 2)	0.39515588491314224
  (0, 7)	0.39515588491314224
  (0, 11)	0.8292789960182417


In [158]:
query_dict = dict(zip(names, q.toarray()[0])) # Zip is important why it returns an iterator of tuples. This generation is on the fly and it is not stored in memory being useful for large datasets.
query_dict

{'01': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '13': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '2019': np.float64(0.0),
 '2024': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '403': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '7077': np.float64(0.0),
 '80': np.float64(0.0),
 '8080': np.float64(0.0),
 'able': np.float64(0.0),
 'access': np.float64(0.0),
 'account': np.float64(0.0),
 'activate': np.float64(0.0),
 'add': np.float64(0.0),
 'added': np.float64(0.0),
 'adding': np.float64(0.0),
 'additional': np.float64(0.0),
 'address': np.float64(0.0),
 'admin': np.float64(0.0),
 'airflow': np.float64(0.0),
 'alexey': np.float64(0.0),
 'allows': np.float64(0.0),
 'alternative': np.float64(0.0),
 'alternatively': np.float64(0.0),
 'anaconda': np.float64(0.0),
 'anaconda3': np.float64(0.0),
 'analytics

In [159]:
X.toarray()[1]

array([0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
     

In [161]:
names

array(['01', '04', '05', '10', '100', '11', '12', '13', '16', '17', '2019', '2024', '22', '24', '403', '5431', '5432', '7077', '80', '8080', 'able', 'access', 'account', 'activate', 'add', 'added',
       'adding', 'additional', 'address', 'admin', 'airflow', 'alexey', 'allows', 'alternative', 'alternatively', 'anaconda', 'anaconda3', 'analytics', 'anand', 'ans', 'answer', 'apache', 'api',
       'app', 'appear', 'appears', 'append', 'application', 'apply', 'appname', 'apt', 'archives', 'argument', 'ask', 'assigned', 'attempting', 'auth', 'authentication', 'automatically', 'available',
       'avoid', 'azure', 'bad', 'base', 'based', 'bash', 'bashrc', 'best', 'better', 'bigquery', 'bin', 'binary', 'bit', 'blob', 'block', 'bq', 'branch', 'browser', 'bucket', 'build', 'builder',
       'built', 'called', 'capstone', 'case', 'cast', 'cause', 'cd', 'certificate', 'change', 'changed', 'changes', 'changing', 'channel', 'check', 'checking', 'choose', 'chown', 'ci', 'class',
       'clear', 'c

In [162]:
# Match against the documents [1]
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'01': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '13': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '2019': np.float64(0.0),
 '2024': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '403': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '7077': np.float64(0.0),
 '80': np.float64(0.0),
 '8080': np.float64(0.0),
 'able': np.float64(0.0),
 'access': np.float64(0.0),
 'account': np.float64(0.0),
 'activate': np.float64(0.0),
 'add': np.float64(0.0),
 'added': np.float64(0.0),
 'adding': np.float64(0.0),
 'additional': np.float64(0.0),
 'address': np.float64(0.0),
 'admin': np.float64(0.0),
 'airflow': np.float64(0.0),
 'alexey': np.float64(0.0),
 'allows': np.float64(0.0),
 'alternative': np.float64(0.0),
 'alternatively': np.float64(0.0),
 'anaconda': np.float64(0.0),
 'anaconda3': np.float64(0.0),
 'analytics

# Create a rank with the most important words
Our query is now a vector space that show the importance of each word based on the training data. We can now create a rank of the most important words in the training data find the more relevant document for the given query.

In [169]:
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T

In [170]:
df_qd['query'] * df_qd['doc']

01          0.0
04          0.0
05          0.0
10          0.0
100         0.0
           ... 
yes         0.0
yml         0.0
youtube     0.0
zip         0.0
zoomcamp    0.0
Length: 726, dtype: float64

In [171]:
(df_qd['query'] * df_qd['doc']).sum() # WHat does this mean?

np.float64(0.0)

In [172]:
X.dot(q.T).toarray() # This is the dot product of the query and the document. CALLED DOT PRODUCT in linear algebra

array([[0.4389195 ],
       [0.        ],
       [0.        ],
       [0.10272051],
       [0.09061387],
       [0.        ],
       [0.        ],
       [0.37634321],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.34569631],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.16813692],
       [0.        ],
       [0.        ],
       [0.11305004],
       [0.14072055],
       [0.        ],
       [0.09828835],
       [0.35236451],
       [0.17154441],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.39401731],
       [0.24051751],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.13137124],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.01703551],
       [0.05064299],
       [0.11656627],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.048

In [173]:
X.dot(q.T).todense() # This is the equivalent of the cosine similarity. See Below.

matrix([[0.4389195 ],
        [0.        ],
        [0.        ],
        [0.10272051],
        [0.09061387],
        [0.        ],
        [0.        ],
        [0.37634321],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.34569631],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.16813692],
        [0.        ],
        [0.        ],
        [0.11305004],
        [0.14072055],
        [0.        ],
        [0.09828835],
        [0.35236451],
        [0.17154441],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.39401731],
        [0.24051751],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.13137124],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.01703551],
        [0.05064299],
        [0.11656627],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [174]:
X.dot(q.T).toarray().sum()

np.float64(13.40802668730218)

In [175]:
from sklearn.metrics.pairwise import cosine_similarity

In [176]:
cosine_similarity(X, q)

array([[0.4389195 ],
       [0.        ],
       [0.        ],
       [0.10272051],
       [0.09061387],
       [0.        ],
       [0.        ],
       [0.37634321],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.34569631],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.16813692],
       [0.        ],
       [0.        ],
       [0.11305004],
       [0.14072055],
       [0.        ],
       [0.09828835],
       [0.35236451],
       [0.17154441],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.39401731],
       [0.24051751],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.13137124],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.01703551],
       [0.05064299],
       [0.11656627],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.048

In [195]:
score = cosine_similarity(X, q).flatten()
score

array([0.48473178, 0.        , 0.        , 0.31854037, 0.        , 0.        , 0.        , 0.41562408, 0.        , 0.        , 0.        , 0.29620951, 0.        , 0.        , 0.        , 0.18568623,
       0.        , 0.        , 0.12484966, 0.        , 0.        , 0.        , 0.54697146, 0.18944938, 0.        , 0.        , 0.        , 0.43514292, 0.22318698, 0.        , 0.        , 0.        ,
       0.        , 0.09195517, 0.        , 0.        , 0.        , 0.        , 0.        , 0.05282779, 0.07852295, 0.12873289, 0.        , 0.07198112, 0.        , 0.        , 0.        , 0.        ,
       0.04563625, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.11311737, 0.        ,
       0.01755816, 0.        , 0.        , 0.        , 0.        , 0.1134715 , 0.        , 0.        , 0.        , 0.        , 0.        , 0.08889334, 0.        , 0.        , 0.1052712 , 0.        ,
     

In [198]:
np.argsort(score) # This is list of the sorted based on their importance. To the last 5 elements are the most probable answers to the query

array([276, 290, 289, 286, 285, 284, 283, 282, 281, 280, 279, 278, 277, 291, 275, 274, 273, 272, 271, 270, 269, 268, 266, 264, 263, 262, 305, 317, 316, 315, 314, 313, 312, 311, 310, 309, 308, 307,
       306, 261, 304, 303, 302, 301, 299, 298, 297, 296, 295, 294, 293, 292, 220, 233, 232, 231, 230, 229, 228, 227, 225, 224, 223, 222, 221, 234, 219, 218, 426, 216, 215, 214, 213, 212, 211, 210,
       208, 207, 248, 260, 259, 258, 257, 256, 255, 254, 253, 252, 251, 250, 249, 318, 247, 246, 245, 244, 242, 240, 239, 238, 237, 236, 235,  37,  54,  53,  52,  51,  50,  49,  47,  46,  45,  44,
        42,  38,  55,  36,  35,  34, 383, 382, 381, 380, 379, 378, 377, 376, 375,  24, 425, 424, 423, 422, 421, 420, 419, 418, 417, 416,  56, 384, 374,  32, 408, 434, 433, 432,  63,  61,  60,  59,
        58,  57, 427, 335, 348, 347, 346, 345, 344, 343, 341, 340, 339, 338, 337, 336, 349, 334, 332, 331, 330, 329, 328, 326, 325, 324, 323, 320, 319, 361, 373, 372, 371, 370, 369, 368, 367, 366,
       365, 364

In [206]:
np.argsort(score)[-5:] # Here we have the documents with the most matching words to the query. It doesn't mean that the answer is correct but it is the most probable. In our case the correct answer in the number 7

array([ 3,  7, 27,  0, 22])

In [207]:
df.iloc[7].text

'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.'

In [209]:
fields = ['section', 'question', 'text']
matrices = {} # X
vectorizers = {} # cv
for f in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df[f])
    matrices[f] = X
    vectorizers[f] = cv

In [210]:
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 1561 stored elements and shape (435, 36)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 1613 stored elements and shape (435, 135)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 10493 stored elements and shape (435, 726)>}

In [211]:
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [222]:
query

'I just singned up. Is it too late to join the course?'

In [226]:
n = len(df)
score = np.zeros(n)
query = 'I just singned up. Is it too late to join the course?'
# Now in our datasets we have 3 fields: section, question and text. We will search in all of them
# But we can boost the score of the question field because it is more imporant for the matching

boosts = {
    # 'section': 0.5,
    'question': 3, 
    'text': 0.5
}
for f in fields:
    q = vectorizers[f].transform([query])
    X = matrices[f]
    f_score = cosine_similarity(X, q).flatten()
    boost = boosts.get(f, 1.0)
    score += f_score * boost

In [227]:
score

array([3.3232046 , 3.49747389, 3.08083871, 2.9766751 , 3.49747389, 3.49747389, 2.43628975, 3.70528593, 2.72258503, 3.49747389, 3.49747389, 2.49351213, 0.49747389, 0.49747389, 0.49747389, 0.590317  ,
       0.49747389, 2.64463111, 0.55989871, 0.49747389, 0.49747389, 0.49747389, 0.77095961, 0.59219857, 0.49747389, 0.49747389, 0.49747389, 0.71504535, 0.60906738, 0.49747389, 0.49747389, 0.49747389,
       0.49747389, 1.71640633, 3.49747389, 1.99612322, 0.49747389, 0.49747389, 0.49747389, 0.52388778, 0.53673536, 1.98015558, 0.49747389, 0.53346444, 0.        , 0.        , 0.        , 0.        ,
       0.02281813, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.05655868, 0.        ,
       0.00877908, 0.        , 0.        , 0.        , 0.        , 0.05673575, 0.        , 0.        , 0.        , 0.        , 0.        , 0.04444667, 0.        , 0.        , 0.0526356 , 0.        ,
     

In [233]:
# This is needs to filter only for our course. Our df is already filtered for our course so we will have all 1s in the mask 
filters = {
    'course': 'data-engineering-zoomcamp'
}
for field, value in filters.items():
    mask = (df[field] == value).astype(int)
    score *= mask
mask

0      1
1      1
2      1
3      1
4      1
      ..
430    1
431    1
432    1
433    1
434    1
Name: course, Length: 435, dtype: int64

In [234]:

idx = np.argsort(score)[-5:]
df.iloc[idx]

Unnamed: 0,course,section,question,text
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."


In [235]:
df.columns

Index(['course', 'section', 'question', 'text'], dtype='object')

In [236]:
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X

In [237]:
transformers['text'].get_feature_names_out()

array(['01', '04', '05', '10', '100', '11', '12', '13', '16', '17', '2019', '2024', '22', '24', '403', '5431', '5432', '7077', '80', '8080', 'able', 'access', 'account', 'activate', 'add', 'added',
       'adding', 'additional', 'address', 'admin', 'airflow', 'alexey', 'allows', 'alternative', 'alternatively', 'anaconda', 'anaconda3', 'analytics', 'anand', 'ans', 'answer', 'apache', 'api',
       'app', 'appear', 'appears', 'append', 'application', 'apply', 'appname', 'apt', 'archives', 'argument', 'ask', 'assigned', 'attempting', 'auth', 'authentication', 'automatically', 'available',
       'avoid', 'azure', 'bad', 'base', 'based', 'bash', 'bashrc', 'best', 'better', 'bigquery', 'bin', 'binary', 'bit', 'blob', 'block', 'bq', 'branch', 'browser', 'bucket', 'build', 'builder',
       'built', 'called', 'capstone', 'case', 'cast', 'cause', 'cd', 'certificate', 'change', 'changed', 'changes', 'changing', 'channel', 'check', 'checking', 'choose', 'chown', 'ci', 'class',
       'clear', 'c

In [238]:
matrices['text']

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10493 stored elements and shape (435, 726)>

In [239]:
query = "I just singned up. Is it too late to join the course?"

In [240]:
q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()

In [241]:
mask = (df.course == 'data-engineering-zoomcamp').values
score = score * mask
score[:10]

array([0.48473178, 0.        , 0.        , 0.31854037, 0.        , 0.        , 0.        , 0.41562408, 0.        , 0.        ])

In [242]:
import numpy as np

# Set NumPy print options for better readability
np.set_printoptions(threshold=np.inf, edgeitems=3, linewidth=200, suppress=True)

<Token var=<ContextVar name='format_options' default={'edgeitems': 3, 'threshold': 1000, 'floatmode': 'maxprec', 'precision': 8, 'suppress': False, 'linewidth': 75, 'nanstr': 'nan', 'infstr': 'inf', 'sign': '-', 'formatter': None, 'legacy': 9223372036854775807, 'override_repr': None} at 0x7f3dfceccc70> at 0x7f3d7b0d4980>

In [243]:
idx = np.argsort(-score)[:10]
idx

array([ 22,   0,  27,   7,   3,  11, 395,  28, 411, 148])

In [244]:
score[idx]

array([0.54697146, 0.48473178, 0.43514292, 0.41562408, 0.31854037, 0.29620951, 0.28580355, 0.22318698, 0.21202195, 0.20653448])

In [245]:
df.iloc[idx].text

22     It's up to you which platform and environment ...
0      The purpose of this document is to capture fre...
27     You can do most of the course without a cloud....
7      Yes, we will keep all the materials after the ...
3      You don't need it. You're accepted. You can al...
11     No, you can only get a certificate if you fini...
395    Each submitted project will be evaluated by 3 ...
28     Yes, you can. Just remember to adapt all the i...
411    The display name listed on the leaderboard is ...
148    One service account is enough for all the serv...
Name: text, dtype: object

In [246]:
fields

['section', 'question', 'text']

In [247]:
query = "I just singned up. Is it too late to join the course?"

In [248]:
boost = {'question': 3.0}

score = np.zeros(len(df))

for f in fields:
    b = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
    score = score + b * s

In [249]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).values
    score = score * mask

In [40]:
idx = np.argsort(-score)[:10]
results = df.iloc[idx]
results.to_dict(orient='records')

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.'},
 {'course': 'data-eng

In [250]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [252]:
fields

['section', 'question', 'text']

In [253]:
index = TextSearch(text_fields=['section', 'question', 'text'])

In [254]:
index.fit(documents)

In [255]:
query

'I just singned up. Is it too late to join the course?'

In [256]:
index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

In [48]:
from sklearn.decomposition import TruncatedSVD

In [49]:
X = matrices['text']
cv = transformers['text']

In [50]:
svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

In [51]:
X_emb[0]

array([ 0.08800419, -0.07512153,  0.10092401,  0.05127205,  0.05138965,
       -0.05951887,  0.02004213,  0.0607822 , -0.20294215,  0.33437825,
        0.03862638, -0.08756796, -0.13102394, -0.02912721,  0.03607929,
        0.03620086])

In [53]:
query = 'I just singned up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)

In [81]:
Q_emb[0]

array([ 0.0435364 , -0.03070852,  0.04387019,  0.01232817,  0.0260592 ,
       -0.04886131,  0.01335269,  0.02430529, -0.12124854,  0.1794836 ,
        0.01503188,  0.06727616, -0.0731015 ,  0.04771703,  0.01171046,
        0.03114754])

In [82]:
np.dot(X_emb[0], Q_emb[0])

0.120735142515765

In [86]:
score = cosine_similarity(X_emb, Q_emb).flatten()

In [89]:
idx = np.argsort(-score)[:10]

In [92]:
list(df.loc[idx].text)

['If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu',
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on

In [93]:
from sklearn.decomposition import NMF

In [103]:
nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.30663126,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [104]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.        , 0.00114538, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.1713533 ,
       0.        , 0.        , 0.        , 0.        , 0.00066279,
       0.        ])

In [105]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

["The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'Please choose the closest one to your answer. Also do not post your answer in the course slack channel.',
 'No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.',
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of Nov

In [69]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

In [70]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')


In [71]:
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [72]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [73]:
hidden_states.shape

torch.Size([2, 15, 768])

In [75]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [78]:
sentence_embeddings.numpy()

# note that if use a GPU, first you need to move your tensors to CPU
# sentence_embeddings_cpu = sentence_embeddings.cpu()

array([[ 0.35999233, -0.16072303,  0.35452363, ...,  0.04289245,
         0.03482292, -0.03822247],
       [ 0.17849916, -0.500025  ,  0.25277564, ..., -0.11413109,
        -0.3360847 ,  0.4109514 ]], dtype=float32)

In [80]:
def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i:i+n]
        result.append(batch)
    return result

In [81]:
from tqdm.auto import tqdm

In [84]:
def compute_embeddings(texts, batch_size=8):
    text_batches = make_batches(texts, 8)
    
    all_embeddings = []
    
    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    
        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state
            
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)
    
    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [86]:
embeddings = {}

In [None]:
# fields = ['section', 'question', 'text']

for f in fields:
    print(f'computing embeddings for {f}...')
    embeddings[f] = compute_embeddings(df[f].tolist())

computing embeddings for section...


  0%|          | 0/119 [00:00<?, ?it/s]

computing embeddings for question...


  0%|          | 0/119 [00:00<?, ?it/s]

In [None]:
import pickle

In [None]:
with open('embeddings.bin', 'wb') as f_out:
    pickle.dump(embeddings)