# Importing libraries

In [26]:
import requests 
import json
import pandas as pd
import numpy as np

# Reading the file containing the data from all the courses in the form of a dictionary (Course -> Section -> Question -> Answer)

In [27]:
# Reading JSON file.
# The encoding is passed explicitly: the FAQ text contains non-ASCII characters,
# and the platform's default text encoding is not guaranteed to be UTF-8.
with open('data/documents.json', 'r', encoding='utf-8') as file:
    documents_raw = json.load(file)

# To store all the document dictionaries from all the courses
documents = []

# Looping through the data of each course
for course in documents_raw:
    course_name = course['course']
    # Looping through each of the QnA documents corresponding to this course
    for doc in course['documents']:
        # Adding an extra key "course" to the existing document (keys: section, question, text).
        # The dictionary is updated in place, so the entries of documents_raw gain the key too.
        doc['course'] = course_name
        documents.append(doc)

In [28]:
# Building a pandas data frame from the flattened documents, with the columns in a fixed order
column_order = ['course', 'section', 'question', 'text']
df = pd.DataFrame(data=documents, columns=column_order)
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - See DE-zoomcamp prerequisites
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
1026,mlops-zoomcamp,Module 6: Best practices,Why do I get a ValueError: Invalid endpoint er...,Answer: Boto3 does not support underscores (_)...
1027,mlops-zoomcamp,Capstone Project,Why do I get a “ValueError: The truth value of...,Solution: Follow the tip: When you compare two...
1028,mlops-zoomcamp,Capstone Project,Is it a group project?,"No, the capstone is a solo project."
1029,mlops-zoomcamp,Capstone Project,"Do we submit 2 projects, what does attempt 1 a...",You only need to submit 1 project. \nIf the su...


In [29]:
# Previewing only the rows that belong to the "Data Engineering" course.
# NOTE: the filtered frame is only displayed, not assigned, so `df` still holds every course.
df[df.course=="data-engineering-zoomcamp"]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - See DE-zoomcamp prerequisites
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
435,data-engineering-zoomcamp,Workshop 2 - RisingWave,Unable to Open Dashboard as xdg-open doesn’t o...,Refer to the solution given in the first solut...
436,data-engineering-zoomcamp,Workshop 2 - RisingWave,Resolving Python Interpreter Path Inconsistenc...,Example Error:\nWhen attempting to execute a P...
437,data-engineering-zoomcamp,Workshop 2 - RisingWave,How does windowing work in Sql?,Ans : Windowing in streaming SQL involves defi...
438,data-engineering-zoomcamp,Triggers in Mage via CLI,"Encountering the error ""ModuleNotFoundError: N...","Python 3.12.1, is not compatible with kafka-py..."


# Vector Space

- Turn Document into Vectors
- Create term-document matrix:
  > - Rows: Documents (values indicate the absence/presence of a word in that document: 0/1 in the simplest form, while `CountVectorizer` below stores the number of occurrences by default)
  > - Columns: words/tokens
  > - This representation is called <b style="color:red">"bag of words"</b> - here we ignore the order of words, just focus on the words themselves. In many cases this is sufficient and gives pretty good results already.

## Count Vectorizer

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
# Vectorizer with default settings (no arguments passed)
cv = CountVectorizer()

In [32]:
# Learning the vocabulary from the "text" column of every document
cv.fit(df.text)

In [33]:
# Tokens learned by the vectorizer
cv.get_feature_names_out()

array(['00', '00000000e', '0002', ..., '要了解键盘快捷键', '要启用屏幕阅读器支持', '请按ctrl'],
      dtype=object)

In [34]:
# Number of distinct tokens in the learned vocabulary
cv.get_feature_names_out().shape

(6993,)

So, there are 6993 distinct tokens in our data, most of which occur in only one document. We therefore restrict the vocabulary to tokens that occur in a minimum number of documents.

<b> NOTE: </b> <span style="color:red"> Here, a document means each set of <b>{Course -> Section -> Question -> Answer}</b> </span>

In [59]:
# Restricting the vocabulary to the tokens that occur in at least 5 documents, and also removing the English stop words
cv = CountVectorizer(min_df=5, stop_words='english') 

In [60]:
# Re-learning the vocabulary with the restricted vectorizer
cv.fit(df.text)

In [66]:
# Sparse document-term matrix: one row per document, one column per vocabulary token
X = cv.transform(df.text)
# Converting to a dense array for display
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [61]:
# Tokens kept after applying the min_df and stop-word restrictions
cv.get_feature_names_out()

array(['01', '02', '03', ..., 'zip', 'zoomcamp', 'zoomcamps'],
      dtype=object)

In [62]:
# Size of the restricted vocabulary
cv.get_feature_names_out().shape

(1438,)

In [70]:
# Bag-of-words table: rows are documents, columns are the vocabulary tokens
vocabulary = cv.get_feature_names_out()
df_docs = pd.DataFrame(data=X.toarray(), columns=vocabulary)
df_docs

Unnamed: 0,01,02,03,04,05,06,09,10,100,11,...,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp,zoomcamps
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1027,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF-IDF