# Importing libraries

In [2]:
import requests 
import json
import pandas as pd
import numpy as np

# Reading the file containing the data from all the courses in the form of a dictionary (Course -> Section -> Question -> Answer)

In [3]:
# Reading JSON file.
# The encoding is given explicitly because the FAQ text contains non-ASCII
# characters; relying on the platform default encoding can fail or mis-decode.
with open('data/documents.json', 'r', encoding='utf-8') as file:
    documents_raw = json.load(file)

# To store all the documents dictionary from all the courses
documents = []

# Looping through each course entry in the raw data
for course in documents_raw:
    course_name = course['course']
    # Looping through each of the QnA documents corresponding to each course
    for doc in course['documents']:
        # Adding an extra key "course" to the existing document
        # (the other keys used downstream are section, question, text)
        doc['course'] = course_name
        documents.append(doc)

In [4]:
# Tabulate the flattened documents: one row per QnA entry, fixed column order
df = pd.DataFrame(
    data=documents,
    columns=['course', 'section', 'question', 'text'],
)
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - See DE-zoomcamp prerequisites
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
1026,mlops-zoomcamp,Module 6: Best practices,Why do I get a ValueError: Invalid endpoint er...,Answer: Boto3 does not support underscores (_)...
1027,mlops-zoomcamp,Capstone Project,Why do I get a “ValueError: The truth value of...,Solution: Follow the tip: When you compare two...
1028,mlops-zoomcamp,Capstone Project,Is it a group project?,"No, the capstone is a solo project."
1029,mlops-zoomcamp,Capstone Project,"Do we submit 2 projects, what does attempt 1 a...",You only need to submit 1 project. \nIf the su...


In [5]:
# Preview only the rows of the "Data Engineering" course.
# The filtered frame is displayed but not assigned, so df itself is unchanged.
df[df['course'].eq("data-engineering-zoomcamp")]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - See DE-zoomcamp prerequisites
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
435,data-engineering-zoomcamp,Workshop 2 - RisingWave,Unable to Open Dashboard as xdg-open doesn’t o...,Refer to the solution given in the first solut...
436,data-engineering-zoomcamp,Workshop 2 - RisingWave,Resolving Python Interpreter Path Inconsistenc...,Example Error:\nWhen attempting to execute a P...
437,data-engineering-zoomcamp,Workshop 2 - RisingWave,How does windowing work in Sql?,Ans : Windowing in streaming SQL involves defi...
438,data-engineering-zoomcamp,Triggers in Mage via CLI,"Encountering the error ""ModuleNotFoundError: N...","Python 3.12.1, is not compatible with kafka-py..."


# Vector Space

- Turn Document into Vectors
- Create term-document matrix:
  > - Rows: Documents (each value is the number of times a word occurs in that document; 0 means the word is absent. With `binary=True` the values become 0/1, indicating absence/presence only)
  > - Columns: words/tokens
  > - This representation is called <b style="color:red">"bag of words"</b> - here we ignore the order of words, just focus on the words themselves. In many cases this is sufficient and gives pretty good results already.
  > - It is a sparse matrix

## Count Vectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Count vectorizer constructed with no arguments, i.e. library defaults
cv = CountVectorizer()

In [8]:
# Learn the vocabulary from the 'text' column of every document
cv.fit(df['text'])

In [9]:
# Tokens that make up the fitted vocabulary
cv.get_feature_names_out()

array(['00', '00000000e', '0002', ..., '要了解键盘快捷键', '要启用屏幕阅读器支持', '请按ctrl'],
      dtype=object)

In [10]:
# Number of distinct tokens in the fitted vocabulary
cv.get_feature_names_out().shape

(6993,)

So, there are 6993 distinct tokens in our data, most of which appear in only one document. So, we restrict the vocabulary to tokens that occur in at least a minimum number of documents.

<b> NOTE: </b> <span style="color:red"> Here document means each set of <b>{Course - > Section -> Question -> Answer}</b> </span>

In [11]:
# Keep only tokens that occur in at least 5 documents, and drop English stop words
cv = CountVectorizer(stop_words='english', min_df=5)

In [12]:
# Re-learn the vocabulary from the 'text' column with the filtered vectorizer
cv.fit(df['text'])

In [13]:
# Term-document matrix of the 'text' column; kept sparse in X,
# converted to a dense array below only for display
X = cv.transform(df['text'])
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [14]:
# Vocabulary after applying min_df=5 and English stop-word removal
cv.get_feature_names_out()

array(['01', '02', '03', ..., 'zip', 'zoomcamp', 'zoomcamps'],
      dtype=object)

In [15]:
# Size of the filtered vocabulary
cv.get_feature_names_out().shape

(1438,)

In [16]:
# Bag-of-words table: one row per document, one column per vocabulary token
df_docs = pd.DataFrame(
    data=X.toarray(),
    columns=cv.get_feature_names_out(),
)
df_docs

Unnamed: 0,01,02,03,04,05,06,09,10,100,11,...,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp,zoomcamps
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1027,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<b style="color:red; font-size:1.5em">The more frequent a term/word is, the less important it is, i.e., if a term/word appears in a large number of documents, it carries less importance. So, we make use of <span style="color:blue;">TF-IDF</span>, as it gives more importance to less frequent terms.</b>

## TF-IDF

<b style="color:orange; font-size:1.3em">We have TF (Term Frequency) and IDF (Inverse Document Frequency)</b>

<b>TF</b> = (Number of occurrences of a term in a document) <b>/</b> (Total number of terms in a document)

<b>IDF</b> = log<b>(</b> (Total number of documents) <b>/</b> (Number of documents in which a particular term has occurred) <b>)</b>

<b>TF</b> is calculated for all the terms in every document, and <b>IDF</b> is calculated for all the terms overall.

<b>TF-IDF</b> is calculated by multiplying TF and IDF, and higher the value for a term, the more important that term would be.

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# TF-IDF vectorizer with the same filters as before (min_df=5, English stop words);
# note that this rebinds cv
cv = TfidfVectorizer(stop_words='english', min_df=5)

In [19]:
# Fit on the 'text' column and build its TF-IDF weight matrix in a single step
X = cv.fit_transform(df['text'])
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 26616 stored elements and shape (1031, 1438)>

In [20]:
# Vocabulary tokens of the TF-IDF vectorizer, in the same order as the columns of X
names = cv.get_feature_names_out()
names

array(['01', '02', '03', ..., 'zip', 'zoomcamp', 'zoomcamps'],
      dtype=object)

In [21]:
# TF-IDF weights as a table (documents x tokens), rounded to 2 decimals for readability
df_docs = (
    pd.DataFrame(data=X.toarray(), columns=names)
    .round(2)
)
df_docs

Unnamed: 0,01,02,03,04,05,06,09,10,100,11,...,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp,zoomcamps
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.74,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.24,0.0,0.0,0.0,0.00,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.09,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0
1027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0
1028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0
1029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0


<b style="color:blue; font-size:1.3em">The above matrix shows the importance of every word in every document.</b>
> <b>Columns</b> are terms

> <b>Indexes</b> are documents

In [22]:
# Query whose similarity to the documents we want to measure
query = "Do I need to know python to sign up for the January course?"

# Project the query onto the vocabulary learned by the fitted vectorizer
q = cv.transform([query])

# Show the dense query vector together with its shape
(q.toarray(), q.shape)

(array([[0., 0., 0., ..., 0., 0., 0.]]), (1, 1438))

In [23]:
# Map every vocabulary token to its TF-IDF weight in the query
query_dict = dict(zip(names, q.toarray()[0]))

# Same mapping for document 0. Only row 0 is converted to a dense array;
# densifying the whole matrix just to read one row is unnecessary.
doc_dict = dict(zip(names, X[0].toarray().ravel()))

query_dict # Query weights and term mapping

{'01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '19': np.float64(0.0),
 '1st': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(0.0),
 '35': np.float64(0.0),
 '403': np.float64(0.0),
 '42': np.float64(0.0),
 '4566': np.float64(0.0),
 '50': np.float64(0.0),
 '5000': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '60': np.flo

In [24]:
# Token -> TF-IDF weight mapping for document 0
doc_dict

{'01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '19': np.float64(0.0),
 '1st': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(0.0),
 '35': np.float64(0.0),
 '403': np.float64(0.0),
 '42': np.float64(0.0),
 '4566': np.float64(0.0),
 '50': np.float64(0.0),
 '5000': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '60': np.flo

In [25]:
# Side-by-side table of weights: one row per token, columns 'query' and 'doc' (document 0)
df_qd = pd.DataFrame(
    data=[query_dict, doc_dict],
    index=['query', 'doc'],
).transpose()
df_qd

Unnamed: 0,query,doc
01,0.0,0.0
02,0.0,0.0
03,0.0,0.0
04,0.0,0.0
05,0.0,0.0
...,...,...
yml,0.0,0.0
youtube,0.0,0.0
zip,0.0,0.0
zoomcamp,0.0,0.0


In [26]:
# Element-wise product of the two weight columns, summed over all tokens
# (i.e. the dot product of the query vector and the document-0 vector)
df_qd['query'].mul(df_qd['doc']).sum()

np.float64(0.0875150772730388)

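<b style="color:blue; font-size:1.3em">The above multiplication and summing is the dot product of the two vectors. Because TfidfVectorizer L2-normalizes every vector by default, this dot product is equal to the Cosine Similarity</b>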

<b style="color:green; font-size:1.3em">We will compute the similarity score of the query with every document; the document with the highest score is the one most similar to our query</b>

In [27]:
# Similarity of the query with every document
# Method-1: matrix product of the document matrix with the transposed query vector
(X @ q.T).toarray()

array([[0.08751508],
       [0.        ],
       [0.        ],
       ...,
       [0.        ],
       [0.02234234],
       [0.08657901]])

In [28]:
# Method-2: scikit-learn's ready-made cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# One score per document, collapsed from a column into a 1-D array
score = cosine_similarity(X, q).ravel()
score

array([0.08751508, 0.        , 0.        , ..., 0.        , 0.02234234,
       0.08657901])

In [29]:
# Document indices ordered by score. np.argsort sorts in ascending order,
# so the LAST index is the document most similar to our query.
np.argsort(score)

array([530, 543, 542, ..., 811, 582, 451])

So, document 451 is the most similar to our query (np.argsort sorts in ascending order, so the last index has the highest score)

In [47]:
# Column labels and (rows, columns) of the frame; the row count is the number of documents
(df.columns, df.shape)

(Index(['course', 'section', 'question', 'text'], dtype='object'), (1031, 4))

<b style="color:red; font-size:1.3em">For now we have only found the similarity using the vectors created from 'text', but we need to do the same with 'section' and 'question' too, and then sum the similarity scores across all three fields. The document with the maximum summed score will be the most similar to our query</b>

# Finding similarity across all the columns and summing up the similarity score

In [31]:
# Fields that each get their own TF-IDF vectorizer and weight matrix
fields = ['section', 'question', 'text']

vectorizers = dict()  # field name -> fitted vectorizer
matrices = dict()     # field name -> TF-IDF weight matrix of that field

for field in fields:
    # Fit a fresh vectorizer on this field and build its weight matrix
    cv = TfidfVectorizer(min_df=3, stop_words='english')
    X = cv.fit_transform(df[field])

    # Keep both so queries can later be transformed and scored per field
    vectorizers[field], matrices[field] = cv, X

In [36]:
# Vocabulary learned from the 'section' field
vectorizers['section'].get_feature_names_out()

array(['10', 'analytics', 'best', 'capstone', 'classification', 'column',
       'course', 'cpp_type', 'data', 'dbt', 'decision', 'deep',
       'deploying', 'deployment', 'dlthub', 'docker', 'does',
       'dolocationid', 'double', 'engineering', 'ensemble', 'error',
       'evaluation', 'experiment', 'external_fhv_tripdata', 'general',
       'int64', 'introduction', 'kafka', 'kubernetes', 'learning',
       'machine', 'match', 'message', 'metrics', 'midterm',
       'miscellaneous', 'models', 'module', 'monitoring', 'networks',
       'neural', 'orchestration', 'parquet', 'practices', 'project',
       'projects', 'pyspark', 'question', 'questions', 'reading',
       'regression', 'related', 'risingwave', 'serverless', 'serving',
       'streaming', 'table', 'target', 'tensorflow', 'terraform',
       'tracking', 'trees', 'trips_data_all', 'type', 'warehousing',
       'workflow', 'workshop'], dtype=object)

In [37]:
# Vocabulary learned from the 'question' field
vectorizers['question'].get_feature_names_out()

array(['10', '11', '12', '127', '13', '2022', '2023', '2024', '22', '403',
       '404', '5000', '5432', '7d', '8080', 'aborted', 'access',
       'account', 'accuracy', 'adding', 'additional', 'address',
       'airflow', 'allowed', 'alternative', 'answer', 'apache', 'api',
       'app', 'apple', 'argument', 'array', 'artifacts', 'attempt',
       'attribute', 'attributeerror', 'auc', 'authentication', 'autolog',
       'available', 'aws', 'bad', 'bashrc', 'batch', 'bentoml', 'big',
       'bigquery', 'bin', 'binary', 'bind', 'bq', 'browser', 'bucket',
       'buckets', 'build', 'building', 'built', 'calculate', 'calling',
       'capstone', 'case', 'certain', 'certificate', 'change', 'changing',
       'character', 'chart', 'check', 'checking', 'chip', 'choice', 'ci',
       'class', 'classes', 'classification', 'cli', 'cloud', 'cluster',
       'code', 'codespaces', 'cohort', 'colab', 'column', 'columns',
       'com', 'command', 'commands', 'commit', 'compilation', 'compose',
     

In [35]:
# Vocabulary learned from the 'text' field
vectorizers['text'].get_feature_names_out()

array(['001', '01', '02', ..., 'zoomcamp', 'zoomcampqabot', 'zoomcamps'],
      dtype=object)

In [58]:
# Query to score against the documents.
# NOTE(review): "singned" looks like a typo for "signed"; it is left unchanged
# here because correcting it could change the scores shown below.
query = "I just singned up. Is it too late to join the course?"

# The query is vectorized separately for each of these fields
fields = ['section', 'question', 'text']

# field name -> 1-D array holding one similarity score per document
sim_score = dict()

for field in fields:
    X = matrices[field]
    q = vectorizers[field].transform([query])
    # Matrix product of document weights with the query vector, flattened to 1-D
    sim_score[field] = (X @ q.T).toarray().ravel()

In [61]:
# Per-document total: section score + question score + text score
sim_score_result = np.add(
    np.add(sim_score['section'], sim_score['question']),
    sim_score['text'],
)
sim_score_result

array([1.20940803, 1.02333617, 1.23874361, ..., 0.        , 0.        ,
       0.07183701])

In [63]:
# Document indices ordered by summed similarity score. np.argsort sorts in
# ascending order, so the LAST index is the document with the maximum score.
np.argsort(sim_score_result)

array([596, 580, 579, ...,   2, 819, 455])

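From above we can see that document 455 is the most similar to our query: np.argsort sorts in ascending order, so the last index (455) has the highest summed score, while the first index (596) has the lowest.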