In [1]:
import json
import pandas as pd
import os

from qdrant_client import QdrantClient
from langchain.schema import Document
from qdrant_client.http import models

from tqdm.auto import tqdm

In [2]:
def get_files(start_path):
    """Recursively collect the path of every file below ``start_path``.

    Files named ".DS_Store" are left out. Each returned path is the directory
    yielded by ``os.walk`` joined with the file name, so it keeps the
    ``start_path`` prefix exactly as it was passed in.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(start_path)
        for name in names
        if name != ".DS_Store"
    ]

In [3]:
# Root folder walked for the per-page context files (read below with pd.read_csv).
CONTEXT_PARENT_FOLDER = "contexts"
# Root folder walked for the per-page metadata files (read below with json.load).
METADATA_PARENT_FOLDER = "Metadata Extraction/metadata"

# Name of the Qdrant collection this notebook creates, fills and queries.
COLLECTION_NAME = "georgebrown-v2"
# Size of the vectors stored in the collection (passed to models.VectorParams).
VECTOR_SIZE = 384
# Value passed as QdrantClient(path=...) when the client is constructed.
DB_FILE = "vector_db_v5"

In [4]:
# Paths of every file found under CONTEXT_PARENT_FOLDER; the ingestion loop iterates over these.
contexts = get_files(CONTEXT_PARENT_FOLDER)

In [5]:

# Paths of every file found under METADATA_PARENT_FOLDER; used to decide whether a context has metadata.
metadata = get_files(METADATA_PARENT_FOLDER)

In [6]:
contexts[:5]

['contexts/otherpages/How to Pay for College  George Brown College.csv',
 'contexts/otherpages/Industry Liaison Office  George Brown College.csv',
 'contexts/otherpages/Degree Programs  George Brown College.csv',
 'contexts/otherpages/News and Announcements for Current Students  George Brown College.csv',
 'contexts/otherpages/HS fact - programs with clinical placement  George Brown College.csv']

In [7]:
metadata[:5]

['Metadata Extraction/metadata/programpages2024/Financial Planning Program Postgraduate B407  George Brown College2024.json',
 'Metadata Extraction/metadata/programpages2024/Advanced Manufacturing Program T414  George Brown College2024.json',
 'Metadata Extraction/metadata/programpages2024/Honours Bachelor of Food Studies Program Bridging H318  George Brown College2024.json',
 'Metadata Extraction/metadata/programpages2024/Honours Bachelor of Behaviour Analysis Program Year 3 Bridge S303  George Brown College2024.json',
 'Metadata Extraction/metadata/programpages2024/Honours Bachelor of Business Administration Program Business Analytics B303  George Brown College2024.json']

In [8]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Qdrant


In [9]:
# Embedding model handed to the Qdrant vector store and used directly to embed queries.
# NOTE(review): model_name looks like a relative local path — confirm it resolves from the notebook's working directory.
embeddings = SentenceTransformerEmbeddings(model_name='sentence-transformer-finetuned/georgebrown-v2-embeddings')

In [10]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformer-finetuned/georgebrown-v2-embeddings', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False)

In [11]:

# Qdrant client constructed from a path (DB_FILE) rather than a server address.
qdrant_client = QdrantClient(path=DB_FILE)

In [12]:
# Create the collection with VECTOR_SIZE-dimensional vectors compared by cosine distance.
# NOTE(review): the client is opened with a path, so the store presumably persists between
# kernel restarts and this call may fail on a re-run once the collection exists — confirm,
# and decide whether a re-run should reuse or rebuild the collection.
qdrant_client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=models.VectorParams(size=VECTOR_SIZE, distance=models.Distance.COSINE),
)


True

In [13]:
# LangChain vector-store wrapper over the collection; documents are added and similarity searches run through it.
qdrant = Qdrant(qdrant_client, COLLECTION_NAME, embeddings)

In [14]:
def insertOtherPages(context_df):
    """Add every row of ``context_df`` to the vector store as a non-program page.

    Each document's text is the row's "Context" value, and every document is
    given the metadata ``{"is_program_page": False}``.
    """
    shared_payload = {"is_program_page": False}

    docs = []
    for _, record in context_df.iterrows():
        docs.append(Document(page_content=record["Context"], metadata=shared_payload))

    qdrant.add_documents(docs)


In [15]:
def insertProgramPages(context_df, metadata_json, year):
    """Add every row of ``context_df`` to the vector store as a program page.

    Args:
        context_df: DataFrame with a "Context" column; each row becomes one
            document whose text is that row's "Context" value.
        metadata_json: Mapping merged into each document's metadata on top of
            ``{"is_program_page": True}``; its keys win on collision.
        year: Not used by this function and not written to the stored
            metadata. NOTE(review): the caller computes it per page — confirm
            whether it was meant to be part of the payload.
    """
    # The payload is identical for every row of the page, so build it once.
    metadata = {"is_program_page": True}
    metadata.update(metadata_json)

    # Single pass over the rows. Each document gets its own shallow copy so a
    # later change to one document's metadata cannot leak into the others.
    documents = [
        Document(page_content=row["Context"], metadata=dict(metadata))
        for _, row in context_df.iterrows()
    ]

    # An empty frame yields no documents; skip the store call in that case.
    if documents:
        qdrant.add_documents(documents)

In [16]:
# Set of known metadata paths, built once for constant-time lookups in the loop.
metadata_paths = set(metadata)

for context_path in tqdm(contexts):
    context_df = pd.read_csv(context_path)

    # Derive the candidate metadata path: swap the parent folder and the extension.
    metadata_path = context_path.replace(CONTEXT_PARENT_FOLDER, METADATA_PARENT_FOLDER)
    metadata_path = metadata_path.replace(".csv", ".json")

    # Contexts without a matching metadata file are stored as non-program pages.
    if metadata_path not in metadata_paths:
        insertOtherPages(context_df)
        continue

    # The context manager closes the file even when the JSON fails to parse.
    with open(metadata_path, "r", encoding="utf-8") as fp:
        metadata_dict = json.load(fp)

    # Paths under "programpages2024" are tagged 2024; everything else is 2023.
    year = 2024 if "programpages2024" in context_path else 2023
    insertProgramPages(context_df, metadata_dict, year)
    
    
        

  0%|          | 0/1642 [00:00<?, ?it/s]

In [91]:
# Fetch one stored point directly so this cell does not rely on a `resd`
# variable that no cell defines at module level.
sample_points, _next_offset = qdrant_client.scroll(
    collection_name=COLLECTION_NAME,
    limit=1,
    with_payload=True,
)
# Show the stored metadata of that point (None when the collection is empty).
sample_points[0].payload.get("metadata") if sample_points else None

{'is_program_page': False}

In [21]:
def clean_contexts(
    query="There is no specific factual information related to programs offered by the college in the given text.",
    limit=1000,
    score_threshold=0.50,
):
    """Delete stored points whose vectors score close to ``query``.

    The query is embedded, the collection is searched once, and every point
    returned by that search is deleted by ID.

    Args:
        query: Text whose embedding is used as the search vector.
        limit: Maximum number of points returned by the search; at most this
            many points are deleted per call.
        score_threshold: Score threshold forwarded to the search call.

    Returns:
        list: IDs of the points selected for deletion (empty when the search
        returned nothing).
    """
    vec = embeddings.embed_query(query)
    resd = qdrant_client.search(
        collection_name=COLLECTION_NAME,
        query_vector=vec,
        limit=limit,
        score_threshold=score_threshold,
    )

    invalid_contexts = [point.id for point in resd]

    # Only issue the delete when the search actually matched something.
    if invalid_contexts:
        qdrant_client.delete(
            collection_name=COLLECTION_NAME,
            points_selector=models.PointIdsList(
                points=invalid_contexts,
            ),
        )

    # Hand the IDs back so the caller can inspect or log what was removed.
    return invalid_contexts

In [22]:
# Run the cleanup pass over the collection; the name holds whatever clean_contexts() returns.
deleted_contexts = clean_contexts()

In [31]:
from qdrant_client.http import models as rest

query = "Courses available in Applied AI program"

# Only points whose stored metadata has is_program_page == True may match.
is_program_page = models.FieldCondition(
    key="metadata.is_program_page",
    match=models.MatchValue(value=True),
)
q_filter = models.Filter(must=[is_program_page])

# Similarity search through the LangChain wrapper, restricted by the filter above.
found_docs = qdrant.similarity_search(query, filter=q_filter)

In [32]:
found_docs

[Document(page_content=" The text provides information about various courses offered under different programs at the college, primarily in the field of Data Science, Machine Learning, Deep Learning, and Information Systems Business Analysis. Here's the extracted information:\n\nPrograms:\n- T177 Computer Programming and Analysis\n- T405 Information Systems Business Analysis Program (with Experiential Learning Capstone)\n- T430 Mobile Application Development and Strategy\n- T445 Cloud Computing Technologies Program (Postgraduate)\n- Computer Science (university level)\n- Computer Engineering (university level)\n\nCourses in Semester 1:\n1. AASD 4000 - Machine Learning I\n2. AASD 4001 - Applied Mathematical Concepts for Machine Learning\n3. AASD 4002 - Foundations of Data Management\n4. AASD 4003 - Ethics and Law for Data Science\n5. AASD 4004 - Machine Learning II\n6. AASD 4008 - Big Data Tools and Techniques I\n7. AASD 4006 - Data Visualization Techniques\n8. AASD 4007 - Design Thinkin

In [41]:
# Scroll through the notebook's collection using the client created earlier;
# no variable named `client` is defined in this notebook.
# NOTE(review): the "diet[].food" / "diet[].likes" keys look like they come from a
# documentation example and do not correspond to any payload field written in this
# notebook — confirm whether this cell is still needed or should filter on real fields.
qdrant_client.scroll(
    collection_name=COLLECTION_NAME,
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="diet[].food", match=models.MatchValue(value="meat")
            ),
            models.FieldCondition(
                key="diet[].likes", match=models.MatchValue(value=True)
            ),
        ],
    ),
)