In [1]:
import pandas as pd
import requests
import logging
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
docs_url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"

# Download the documents JSON.
# - `timeout` keeps the cell from hanging forever on a stalled connection.
# - `raise_for_status()` turns HTTP 4xx/5xx responses into exceptions instead of
#   handing an error page to `.json()`.
# - On failure the error is logged and re-raised: continuing would either hit a
#   NameError on `docs_df` below (fresh kernel) or silently reuse a stale
#   `docs_df` left over from an earlier run.
try:
    response = requests.get(url=docs_url, timeout=30)
    response.raise_for_status()
    docs_json = response.json()
except (requests.RequestException, ValueError) as e:
    # ValueError covers JSON decoding failures on older `requests` versions.
    logging.error(f"Failed to downloads docs due to: {e}")
    raise

# Flatten every entry of each top-level item's "documents" list into one row,
# carrying the parent item's "course" value along as a column.
docs_df = pd.json_normalize(docs_json, "documents", ["course"])
# Keep a fixed column order for display and downstream cells.
docs_df = docs_df[["course", "section", "question", "text"]]

print(docs_df.head())
print(docs_df[docs_df["course"] == "data-engineering-zoomcamp"].head())

                      course                           section  \
0  data-engineering-zoomcamp  General course-related questions   
1  data-engineering-zoomcamp  General course-related questions   
2  data-engineering-zoomcamp  General course-related questions   
3  data-engineering-zoomcamp  General course-related questions   
4  data-engineering-zoomcamp  General course-related questions   

                                            question  \
0               Course - When will the course start?   
1  Course - What are the prerequisites for this c...   
2  Course - Can I still join the course after the...   
3  Course - I have registered for the Data Engine...   
4   Course - What can I do before the course starts?   

                                                text  
0  The purpose of this document is to capture fre...  
1  GitHub - DataTalksClub data-engineering-zoomca...  
2  Yes, even if you don't register, you're still ...  
3  You don't need it. You're accepted. You can

In [18]:
# Small hand-written corpus of five short sentences, handy for trying out
# text-vectorisation steps on something that fits on one screen.
docs_example = list(
    (
        "Course starts on 15th Jan 2024",
        "Prerequisites listed on GitHub",
        "Submit homeworks after start date",
        "Registration not required for participation",
        "Setup Google Cloud and Python before course",
    )
)


In [23]:
# Bag-of-words counts over the document texts.
# - stop_words="english" drops scikit-learn's built-in English stop-word list.
# - min_df=5 ignores terms that occur in fewer than 5 documents, which keeps the
#   vocabulary (and the dense matrix below) reasonably small.
cv = CountVectorizer(stop_words="english", min_df=5)
X = cv.fit_transform(docs_df["text"])

# `fit_transform` has already fitted `cv`, so the vocabulary can be read directly;
# calling `cv.fit(...)` a second time would only repeat the tokenisation work.
X_df = pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out())

# Transposed view: one row per term, one column per document.
print(X_df.T)

          0    1    2    3    4    5    6    7    8    9    ...  938  939  \
01          0    0    0    0    0    0    0    0    0    0  ...    0    0   
02          0    0    0    0    0    0    0    0    0    0  ...    0    0   
03          0    0    0    0    0    0    0    0    0    0  ...    0    0   
04          0    0    0    0    0    0    0    0    0    0  ...    0    0   
05          0    0    0    0    0    0    0    0    0    0  ...    0    0   
...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
yes         0    0    1    0    0    0    1    1    1    0  ...    0    0   
yml         0    0    0    0    0    0    0    0    0    0  ...    0    0   
youtube     0    0    0    0    0    0    0    0    0    1  ...    0    0   
zip         0    0    0    0    0    0    0    0    0    0  ...    0    0   
zoomcamp    0    1    0    0    0    2    0    0    0    0  ...    0    0   

          940  941  942  943  944  945  946  947  
01          0    0    0 