In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [10]:

df= pd.read_pickle("data/course_catalog_with_embeddings.pkl")
df.head()

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL,Description Embeddings,Title Embeddings
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[tensor(-0.1893), tensor(0.4135), tensor(-0.14...","[tensor(0.1341), tensor(0.0716), tensor(-0.119..."
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[tensor(-0.2007), tensor(0.4261), tensor(0.143...","[tensor(0.0525), tensor(0.3550), tensor(-0.113..."
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[tensor(-0.0163), tensor(0.5243), tensor(-0.14...","[tensor(-0.2652), tensor(0.4275), tensor(-0.41..."
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[tensor(0.5637), tensor(0.1932), tensor(-0.447...","[tensor(0.2329), tensor(0.0559), tensor(-0.580..."
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[tensor(-0.1039), tensor(0.4214), tensor(-0.43...","[tensor(0.2310), tensor(-0.0024), tensor(-0.41..."


In [3]:
# desc_embeddings = dict(zip(df['Code'], df['Description Embeddings']))
# title_embeddings = dict(zip(df['Code'], df['Title Embeddings']))

In [14]:
def filter(df, upper_div, lower_div, graduate, include, exclude):
    """
    Optimized filter function for a DataFrame based on level of study and department inclusion/exclusion.

    Parameters:
    - df: DataFrame to filter.
    - upper_div: Boolean, True to include Upper Division levels.
    - lower_div: Boolean, True to include Lower Division levels.
    - graduate: Boolean, True to include Graduate levels.
    - include: List of departments to include.
    - exclude: List of departments to exclude.

    Returns:
    - Optimized filtered DataFrame based on the specified criteria.
    """
    # Create a boolean series for each level condition

    conditions = pd.Series(False, index=df.index)
    if upper_div:
        conditions |= (df['Level'] == 'Upper Division')
    if lower_div:
        conditions |= (df['Level'] == 'Lower Division')
    if graduate:
        conditions |= (df['Level'] == 'Graduate')
    
    # Apply level filtering
    df = df[conditions]
    
    # Apply department inclusion and exclusion
    if include:
        df = df[df['Department'].isin(include)]
    if exclude:
        df = df[~df['Department'].isin(exclude)]
    
    return df

In [15]:
filter(False, False, True, ['POLI', 'ECON'], ['MATH'])

UnboundLocalError: cannot access local variable 'df' where it is not associated with a value

In [None]:
def parse_query(q):
    encoded = model.encode(q, convert_to_tensor=True)
    return encoded

In [None]:
def cos_sim(q_tensor, tensor_dict):
    scores = {}
    for id, tensor in tensor_dict.items():
        magnitude_A = q_tensor.norm()
        magnitude_B = tensor.norm()
        similarity = torch.dot(q_tensor.squeeze(), tensor) / (magnitude_A * magnitude_B)
        # only output scores that are high enough
        if similarity >= 0.2:
            scores[id] = similarity.item()
    return scores

In [14]:
def emb_search(query, k, df):
    """
    Search for the top k most similar items in df to the query using cosine similarity.
    
    Parameters:
    - query: The search query string.
    - k: Number of top similar items to return.
    - df: DataFrame with a 'Description Embedding' column containing embeddings.
    - model: The model used to encode the query.
    
    Returns:
    Top k most similar items
    """
    model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v4')

    # Encode the query to get the query embedding
    query_emb = model.encode(query)  # Assuming the model has an 'encode' method
    
    # Get the embeddings from the dataframe
    desc_emb_list = np.vstack(df['Description Embeddings'])
    title_emb_list = np.vstack(df['Title Embeddings'])
    
    # Calculate cosine similarities
    desc_similarities = cosine_similarity([query_emb], desc_emb_list)[0]
    title_similarities = cosine_similarity([query_emb], title_emb_list)[0]
    combined_similarities = desc_similarities + title_similarities
    
    # Get the indices of the top k most similar embeddings
    top_k_indices = np.argsort(combined_similarities)[-k:][::-1]

    # Return the top k most similar items from df
    top_k_results = df.iloc[top_k_indices][['Code', 'Department', 'Title', 'Description', 'Prerequisites', 'URL']]

    return top_k_results.to_numpy()


In [15]:
emb_search('game theory', 10, df)

array([['ECON 109', 'ECON', 'Game Theory',
        'Introduction to game theory. Analysis of people’s decisions when the consequences of the decisions depend on what other people do. This course features applications in economics, political science, and law.',
        'ECON 100C or MATH 31CH or MATH 109 or (CSE 20 and MATH 20 C).',
        'https://cape.ucsd.edu/responses/Results.aspx?Name=&CourseNumber=ECON+109'],
       ['POLI 118', 'POLI', 'Game Theory in Political Science',
        'This course introduces students to game theory and its uses in political science. Topics covered include the concepts of Nash equilibrium, dominant strategies, subgame perfection and backwards induction, and the applications of those concepts to the study of voting, electoral competition, public goods provision, legislatures, and collective action. An emphasis is placed on developing students’ analytical reasoning and problem-solving skills through weekly problem sets and in-class exercises.',
        '