In [1]:
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch
from torch.nn import CosineSimilarity
import pandas as pd
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss


### This notebook is to conduct user query based filtering. Meaning given a user query, we will go through three filtering process.

1. Perform similarity check for course info and wiki article title using FAISS (course info and wiki articles embedding with SentenceTransformer)
2. Use trained two tower model to filter wiki articles based on user query + course info (course info was summarized to keywords using KWbert, then merged with user query to doing embedding using two tower model)
3. Use pre-trained rerank model RankGPT to do re-ranking and pick the most important articles 

## Prepare Data

In [2]:
file_path = 'Combined_course_data.csv'
course = pd.read_csv(file_path)
course

Unnamed: 0,Title,Description,Subject
0,Introduction to Business Analytics,This course provides students with an introduc...,Computer Science
1,Business Analytics Immersion Programme,This course aims to equip students with a firs...,Computer Science
2,Econometrics Modeling for Business Analytics,This course provides the foundations to econom...,Computer Science
3,Data Management and Visualisation,This course aims to provide students with prac...,Computer Science
4,Feature Engineering for Machine Learning,This course covers topics that are important f...,Computer Science
...,...,...,...
1911,Introduction to Hyperledger Sovereign Identity...,"To the surprise of absolutely no one, trust is...",Computer Science
1912,A System View of Communications: From Signals ...,Have you ever wondered how information is tran...,Computer Science
1913,Scripting and Programming Foundations,Computer programs are abundant in many people'...,Computer Science
1914,Using GPUs to Scale and Speed-up Deep Learning,Training acomplex deep learning model with a v...,Data Science


In [3]:
file_path = 'wikidata.csv'
wikidata = pd.read_csv(file_path)
wikidata

Unnamed: 0,text,url,title
0,"Becurtovirus is a genus of viruses, in the fam...",https://en.wikipedia.org/wiki/Becurtovirus,Becurtovirus
1,Cyprinivirus is a genus of viruses in the orde...,https://en.wikipedia.org/wiki/Cyprinivirus,Cyprinivirus
2,"Glossinavirus is a genus of viruses, in the fa...",https://en.wikipedia.org/wiki/Glossinavirus,Glossinavirus
3,"Ichtadenovirus is a genus of viruses, in the f...",https://en.wikipedia.org/wiki/Ichtadenovirus,Ichtadenovirus
4,"Lambdatorquevirus is a genus of viruses, in th...",https://en.wikipedia.org/wiki/Lambdatorquevirus,Lambdatorquevirus
...,...,...,...
131044,A non-blanching rash (NBR) is a skin rash that...,https://en.wikipedia.org/wiki/Non-blanching%20...,Non-blanching rash
131045,"In organic chemistry, the term cyanomethyl (cy...",https://en.wikipedia.org/wiki/Cyanomethyl,Cyanomethyl
131046,Remaiten is malware which infects Linux on emb...,https://en.wikipedia.org/wiki/Remaiten,Remaiten
131047,Gradient-enhanced kriging (GEK) is a surrogate...,https://en.wikipedia.org/wiki/Gradient-enhance...,Gradient-enhanced kriging


In [4]:
course_transformed = pd.DataFrame({
    "content": course.apply(lambda row: ' | '.join([f"{col}: {row[col]}" for col in course.columns]), axis=1)
})

# Transform `wikidata` DataFrame to a single-column format
wikidata_transformed = pd.DataFrame({
    "content": wikidata.apply(lambda row: ' | '.join([f"{col}: {row[col]}" for col in wikidata.columns]), axis=1)
})

# Display the transformed tables
print("Transformed Course Data:")
print(course_transformed.head())

print("\nTransformed Wikidata:")
print(wikidata_transformed.head())

Transformed Course Data:
                                             content
0  Title: Introduction to Business Analytics | De...
1  Title: Business Analytics Immersion Programme ...
2  Title: Econometrics Modeling for Business Anal...
3  Title: Data Management and Visualisation | Des...
4  Title: Feature Engineering for Machine Learnin...

Transformed Wikidata:
                                             content
0  text: Becurtovirus is a genus of viruses, in t...
1  text: Cyprinivirus is a genus of viruses in th...
2  text: Glossinavirus is a genus of viruses, in ...
3  text: Ichtadenovirus is a genus of viruses, in...
4  text: Lambdatorquevirus is a genus of viruses,...


## Load wiki_title_embedding

In [5]:
wiki_embeddings_file = 'wiki_title_embeddings.npy'
wiki_title_embeddings = np.load(wiki_embeddings_file)
wiki_title_embeddings

array([[-0.01740786,  0.00442912, -0.09215238, ..., -0.02191604,
         0.07291625, -0.02235293],
       [-0.10091388,  0.0783674 , -0.04533364, ..., -0.1075331 ,
         0.04686709,  0.07207245],
       [-0.10018466, -0.00640676, -0.0114509 , ..., -0.14957273,
         0.06115797,  0.02614287],
       ...,
       [-0.03868212,  0.05411112,  0.00084907, ...,  0.01953804,
        -0.01381   , -0.04266216],
       [-0.09186076, -0.1078757 ,  0.04518463, ..., -0.042975  ,
        -0.03663828,  0.01403402],
       [-0.06280275,  0.0021886 , -0.00058878, ..., -0.0114022 ,
        -0.0395432 , -0.0105731 ]], dtype=float32)

## 1. Perform similarity check for course info and wiki article title using FAISS (13k -> 500)

In [6]:
course_info = course_transformed['content'][17]
course_info

'Title: Data-Driven Marketing | Description: In today’s environment, marketing or business analysts require tools and techniques to both quantify the strategic value of marketing initiatives, and to maximize marketing campaign performance. This course aims to teach students concepts, methods and tools to demonstrate the return on investment (ROI) of marketing activities and to leverage on data and marketing analytics to make better and more informed marketing decisions. Course topics covered include marketing performance management, marketing metrics, data management, market response and diffusion models, market and customer segmentation models, analytic marketing and value driven segmentation, digital media marketing analytics, etc. Students will have access to | Subject: Computer Science'

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')
course_embedding = model.encode(course_info)

# Step 3: Use FAISS to retrieve top 500 relevant Wikipedia titles based on the course embedding
dimension = wiki_title_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(wiki_title_embeddings)

# Retrieve the top 50 most relevant Wikipedia titles
num_candidates = 500
_, top_k_indices = faiss_index.search(np.array([course_embedding]), num_candidates)

# Filter the top 30 Wikipedia entries
top_500_wikidata = pd.DataFrame()
top_500_wikidata = wikidata_transformed.iloc[top_k_indices[0]].reset_index(drop=True)

In [8]:
top_500_wikidata

Unnamed: 0,content
0,text: User behavior analytics (UBA) is a cyber...
1,"text: Data mining, the process of discovering ..."
2,text: A rally is a period of sustained increas...
3,text: The AIDA model is just one of a class of...
4,text: The category development index (CDI) mea...
...,...
495,text: Demand priority is a media-access method...
496,"text: In systems science, a sampled-data syst..."
497,"text: For bees, their forage or food supply co..."
498,text: Information quality (InfoQ) is the poten...


## 2. Use trained two tower model to filter wiki articles based on user query + course info (500 -> 50)

In [9]:
import pandas as pd
from keybert import KeyBERT
import torch
from transformers import AutoTokenizer, AutoModel
from torch.nn import CosineSimilarity

# Initialize models and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
query_model = AutoModel.from_pretrained("query_model_scidocs").to(device)
document_model = AutoModel.from_pretrained("document_model_scidocs").to(device)
tokenizer = AutoTokenizer.from_pretrained("tokenizer_scidocs")
cosine_sim = CosineSimilarity(dim=1)
kw_model = KeyBERT()

# Define your utility functions
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64).to(device)
    embeddings = model(**inputs).last_hidden_state[:, 0, :]  # CLS token embedding
    return embeddings

def extract_keywords(content, model):
    keywords = model.extract_keywords(content, keyphrase_ngram_range=(3, 3), stop_words='english',
                                      use_maxsum=True, nr_candidates=20, top_n=5)
    merged_keywords = " ".join([kw[0] for kw in keywords])
    return merged_keywords

def calculate_similarity(query_text, document_text):
    query_embedding = encode_text(query_text, tokenizer, query_model)
    document_embedding = encode_text(document_text, tokenizer, document_model)
    similarity_score = cosine_sim(query_embedding, document_embedding).item()
    return similarity_score

def refine_user_query_1(query, kw):
    return query + "which has following keywords:" + kw


In [10]:
# Mock user's query
user_query = "Can you help me make a study plan for the course: Data-Driven Marketing?"  

In [11]:
merged_keywords = extract_keywords(course_info, kw_model)
user_query = refine_user_query_1(user_query, merged_keywords)
user_query

'Can you help me make a study plan for the course: Data-Driven Marketing?which has following keywords:marketing description today investment roi marketing marketing analytics make marketing decisions course leverage data marketing'

In [None]:
top_50_candidates = []



query_embedding = encode_text(user_query, tokenizer, query_model)


for _, row in top_500_wikidata.iterrows():
    # Step 2: Extract keywords from the document content
    
    
    # Step 3: Embed the merged keywords using the document model
    doc_embedding = encode_text(row['content'], tokenizer, document_model)
    
    # Step 4: Calculate similarity score between the course embedding and document embedding
    similarity_score = cosine_sim(query_embedding, doc_embedding).item()
    
    # Store title, keywords, and similarity score
    top_50_candidates.append({
        "Content": row['content'],
        #"Keywords": merged_keywords,
        "Similarity Score": similarity_score
    })

In [None]:
top_50_df = pd.DataFrame(top_50_candidates).sort_values(by="Similarity Score", ascending=False).head(50)
top_50_df

Unnamed: 0,Content,Similarity Score
0,text: User behavior analytics (UBA) is a cyber...,0.989267
268,text: To classify postoperative outcomes for e...,0.988383
421,text: Software installed in medical devices is...,0.988083
364,text: Unsupervised learning is a type of algo...,0.987775
342,text: This is a list of mathematics-based meth...,0.987758
8,text: Usage data is the most effective way of ...,0.987555
34,text: Learning pathway is described as the cho...,0.987185
318,text: Macrotasking is a type of crowdsourcing ...,0.987081
21,text: Utilising the DW/BI system is the final ...,0.987036
370,text: Data portability is a concept to protect...,0.986632


## 3. 3. Use pre-trained rerank model RankGPT to do re-ranking and pick the most important articles (50 -> 5)

In [None]:
import openai
from rank_gpt import permutation_pipeline
import pandas as pd

# Set up your OpenAI API key
openai.api_key = ""


# Convert top_5_df into the required format for `permutation_pipeline`
item = {
    "query": user_query,
    "hits": [{"content": content} for content in top_50_df["Content"]]  # Assuming `top_5_df` has a "Content" column
}

# Use `permutation_pipeline` to re-rank the top 5 articles and get the reordered list
new_item = permutation_pipeline(item, rank_start=0, rank_end=5, model_name='gpt-3.5-turbo', api_key=openai.api_key)

# Extract the top 3 articles from the re-ranked output
top_5_articles = [hit["content"] for hit in new_item["hits"][:5]]

top_5_articles

['text: User behavior analytics (UBA) is a cybersecurity process about detection of insider threats, targeted attacks, and financial fraud that tracks a system\'s users. UBA looks at patterns of human behavior, and then analyzes them to detect anomalies that indicate potential threats. Big data platforms like Apache Hadoop are increasing UBA functionality by allowing them to analyze petabytes worth of data to detect insider threats and advanced persistent threats.\n\nPurpose \nUBA\'s purpose, according to Johna Till Johnson of Nemertes Research, is that "Security systems provide so much information that it\'s tough to uncover information that truly indicates a potential for real attack. Analytics tools help make sense of the vast amount of data that SIEM, IDS/IPS, system logs, and other tools gather. UBA tools use a specialized type of security analytics that focuses on the behavior of systems and the people using them. UBA technology first evolved in the field of marketing, to help co

In [None]:
def construct_document(course_info, top_articles):
    # Template to create a more readable and coherent document structure
    document = f"Course Overview:\n{course_info}\n\n"
    for i, article in enumerate(top_articles, 1):
        document += f"Related Article {i}:\n"
        document += f"Title: Article {i}\n"
        document += f"Content: {article}\n"
        if i < len(top_articles):
            document += "\n\nIn addition, the following article provides insights:\n\n"
    return document

final_document = construct_document(course_info, top_5_articles)
print(final_document)


Course Overview:
Title: Data-Driven Marketing | Description: In today’s environment, marketing or business analysts require tools and techniques to both quantify the strategic value of marketing initiatives, and to maximize marketing campaign performance. This course aims to teach students concepts, methods and tools to demonstrate the return on investment (ROI) of marketing activities and to leverage on data and marketing analytics to make better and more informed marketing decisions. Course topics covered include marketing performance management, marketing metrics, data management, market response and diffusion models, market and customer segmentation models, analytic marketing and value driven segmentation, digital media marketing analytics, etc. Students will have access to | Subject: Computer Science

Related Article 1:
Title: Article 1
Content: text: User behavior analytics (UBA) is a cybersecurity process about detection of insider threats, targeted attacks, and financial fraud 

In [18]:
def save_text_to_file(text, filename="final_document.txt"):
    with open(filename, "w", encoding="utf-8") as file:
        file.write(text)
    print(f"Document saved as {filename}")

# Example usage
save_text_to_file(final_document)


Document saved as final_document.txt
