# Download the data:

In [1]:
import requests 

docs_url = 'https://raw.githubusercontent.com/tejasjbansal/QueryGenie/main/notebooks/documents.json'
# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course payload into a single list of records, tagging each
# record with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
# Creating the dataframe:

import pandas as pd

# Build the working table from the flattened records; the explicit column
# list fixes the column order, with `course` first.
df = pd.DataFrame(
    data=documents,
    columns=['course', 'section', 'question', 'text'],
)
df.head(5)

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,Anaconda3-2024.02-1-Windows-x86_64.exe\nThe pu...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub/data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


# Implementing Basic Text Search

## Basics of Text Search

- Information Retrieval - The process of obtaining relevant information from large datasets based on user queries.
- Vector Spaces - A mathematical representation where text is converted into vectors (points in space) allowing for quantitative comparison.
- Bag of Words - A simple text representation model that treats each document as a collection of words, disregarding grammar and word order but keeping multiplicity (how many times each word occurs).
- TF-IDF (Term Frequency-Inverse Document Frequency) - A statistical measure used to evaluate how important a word is to a document in a collection or corpus. It increases with the number of times a word appears in the document but is offset by the frequency of the word in the corpus.

## Vectorization
- CountVectorizer
- TfidfVectorizer

# CountVectorizer:

- Simpler approach.
- Creates a "bag-of-words" representation of the text.
- Counts the number of times each word appears in a document.
- Treats all words equally, regardless of their importance in the broader context.

In [4]:
# Five one-line toy documents used to illustrate vectorization.
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course",
]

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary (English stop words removed) and count each term per
# document in a single pass.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

# One row per term, one column per document.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


# TfidfVectorizer (TF-IDF Vectorizer):

- More advanced approach.
- Also considers the "inverse document frequency" (IDF) of a word.
- Words that appear frequently across all documents are down-weighted, as they are less informative.
- Words that are specific to a particular document are up-weighted, as they are more distinctive.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same toy documents, now weighted by TF-IDF instead of raw counts.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

# One row per term, one column per document; rounded only for display.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs.round(2)

Unnamed: 0,0,1,2,3,4
15th,0.46,0.0,0.0,0.0,0.0
2024,0.46,0.0,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.46
course,0.37,0.0,0.0,0.0,0.37
date,0.0,0.0,0.5,0.0,0.0
github,0.0,0.58,0.0,0.0,0.0
google,0.0,0.0,0.0,0.0,0.46
homeworks,0.0,0.0,0.5,0.0,0.0
jan,0.46,0.0,0.0,0.0,0.0
listed,0.0,0.58,0.0,0.0,0.0


# Query-Document Similarity

In [8]:
# A free-text question to score against the toy documents.
query = "Do I need to know python to sign up for the January course?"

# Vectorize the query with the vectorizer fitted on docs_example; `transform`
# takes an iterable of documents, hence the one-element list.
q = cv.transform([query])
q.toarray()

array([[0.        , 0.        , 0.        , 0.62791376, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.77828292, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [11]:
# Label each weight of the query vector with its vocabulary term.
query_dict = {term: weight for term, weight in zip(names, q.toarray()[0])}
query_dict

{'15th': 0.0,
 '2024': 0.0,
 'cloud': 0.0,
 'course': 0.6279137616509933,
 'date': 0.0,
 'github': 0.0,
 'google': 0.0,
 'homeworks': 0.0,
 'jan': 0.0,
 'listed': 0.0,
 'participation': 0.0,
 'prerequisites': 0.0,
 'python': 0.7782829228046183,
 'registration': 0.0,
 'required': 0.0,
 'setup': 0.0,
 'start': 0.0,
 'starts': 0.0,
 'submit': 0.0}

In [17]:
# Term -> weight view of the second toy document (row index 1 of X).
doc_dict = {term: weight for term, weight in zip(names, X.toarray()[1])}
doc_dict

{'15th': 0.0,
 '2024': 0.0,
 'cloud': 0.0,
 'course': 0.0,
 'date': 0.0,
 'github': 0.5773502691896258,
 'google': 0.0,
 'homeworks': 0.0,
 'jan': 0.0,
 'listed': 0.5773502691896258,
 'participation': 0.0,
 'prerequisites': 0.5773502691896258,
 'python': 0.0,
 'registration': 0.0,
 'required': 0.0,
 'setup': 0.0,
 'start': 0.0,
 'starts': 0.0,
 'submit': 0.0}

In [18]:
# The more words in common - the better the matching score. Let's calculate it:

In [16]:
# Line the two term -> weight mappings up side by side, one row per term.
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T

# Multiply the weights term by term and add the products up.
df_qd['query'].mul(df_qd['doc']).sum()

0.0

In [23]:
# The score above is a dot product, so a single matrix multiplication scores
# the query against every document at once; the documents can then be ranked
# by these scores:
(X @ q.T).toarray()

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# In practice, we usually use cosine similarity, which divides the dot
# product by the norms of both vectors so that document length does not
# inflate the score:
cosine_similarity(X, q)

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

# Vectorizing all the documents

In [24]:
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

# Fit an independent TF-IDF vectorizer per text field and keep both the
# fitted vectorizer and the resulting document-term matrix, keyed by field.
# NOTE: the loop rebinds `cv` and `X`, which the toy example above also uses;
# after this cell they refer to the last field processed ('text').
for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X

# Only the last expression of a cell is displayed, so show the matrix summary.
matrices['text']

<997x2199 sparse matrix of type '<class 'numpy.float64'>'
	with 28221 stored elements in Compressed Sparse Row format>

In [39]:
# Search: score the `text` field of every document against a user question.

# NOTE(review): "singned" looks like a typo for "signed"; it is left as-is
# here because editing the query string could change the resulting scores.
query = "I just singned up. Is it too late to join the course?"

# Vectorize the query with the vectorizer fitted on the `text` column, then
# flatten the similarity result into a 1-D array of per-document scores.
q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()

In [29]:
# Restrict the ranking to one course: the score of every row from another
# course is multiplied by False (i.e. zeroed); no rows are dropped.
mask = df['course'].eq('data-engineering-zoomcamp').to_numpy()
score = mask * score

In [30]:
import numpy as np

# Positions of the ten best-scoring documents, best first: negating the
# scores turns the ascending argsort into a descending ranking.
idx = (-score).argsort()[:10]

In [38]:
# Get the docs: answer text of the top-ranked rows, best match first.
df['text'].iloc[idx]

0      Anaconda3-2024.02-1-Windows-x86_64.exe\nThe pu...
15     No, late submissions are not allowed. But if t...
22     It's up to you which platform and environment ...
27     You can do most of the course without a cloud....
38     You will have two attempts for a project. If t...
288    This error could result if you are using some ...
7      Yes, we will keep all the materials after the ...
3      You don't need it. You're accepted. You can al...
114    In the join queries, if we mention the column ...
11     No, you can only get a certificate if you fini...
Name: text, dtype: object