In [None]:
#May need to install the following
pip install sentence-transformers
pip install tabulate

In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tabulate import tabulate
import os
import pickle

# Location of the pickle file that holds the school embeddings
school_embeddings_path = 'school_embeddings.pkl'

# Read the schools dataset from the Excel workbook
schools_df = pd.read_excel('schools.xlsx')

# Simulated user input (swap in real input prompts when used in practice)
user_preferences = dict([
    ('Languages Taught', 'Finnish, Swedish'),
    ('Programmes Offered', 'Physical Education'),
    ('Special Education Classes', 'Yes'),
    ('City', 'Helsinki'),
])

# Flatten the preference values, in insertion order, into a single
# comma-separated query string
search_query = ', '.join(user_preferences[field] for field in user_preferences)

# Sentence-transformer model used to embed the search query below
model = SentenceTransformer('all-MiniLM-L12-v2')

# The query differs from run to run, so it is always encoded afresh
query_embedding = model.encode(search_query)

# Function to save embeddings
def save_embeddings(path, embeddings):
    """Pickle ``embeddings`` to ``path`` without ever leaving a partial file there.

    The data is written to a sibling ``<path>.tmp`` file first and only moved
    onto ``path`` (via ``os.replace``) once the dump has succeeded. Opening
    ``path`` directly in ``'wb'`` mode would truncate it immediately, so a
    failed or interrupted dump would leave an empty/partial file that a later
    "does the file exist?" check would mistake for a valid cache.

    Args:
        path: Destination file path for the pickle.
        embeddings: Any picklable object to store.

    Raises:
        OSError: If the temporary file cannot be written or moved into place.
        pickle.PicklingError: If ``embeddings`` cannot be pickled.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(embeddings, f)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the leftovers of a failed dump.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Function to load embeddings
def load_embeddings(path):
    """Return the object stored in the pickle file at ``path``.

    Note: unpickling can execute arbitrary code, so this should only be
    pointed at files from a trusted source.

    Args:
        path: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    with open(path, 'rb') as cache_file:
        stored = pickle.load(cache_file)
    return stored

# Reuse the cached school embeddings when they are readable and still line up
# with the dataset; otherwise (re)build and save them.
# NOTE: the cache is a pickle file, and unpickling can execute arbitrary code.
# Only keep cache files that this notebook wrote itself.
school_embeddings = None
if os.path.exists(school_embeddings_path):
    try:
        cached_embeddings = load_embeddings(school_embeddings_path)
    except (EOFError, pickle.UnpicklingError):
        # Truncated or corrupted cache file: fall through and rebuild it.
        cached_embeddings = None
    # The ranking below maps embedding row i to schools_df row i, so a cache
    # with a different number of rows would select the wrong schools (or
    # raise). A row-count match does not detect edited cell contents; delete
    # the cache file after changing the workbook.
    if cached_embeddings is not None and len(cached_embeddings) == len(schools_df):
        school_embeddings = cached_embeddings
        print('Loaded school embeddings...')
    else:
        print('Cached school embeddings do not match the dataset; rebuilding...')

if school_embeddings is None:
    # One descriptive string per school. Blank cells become '' and every value
    # is cast to str so that ', '.join cannot fail on NaN or non-text cells.
    school_texts = (
        schools_df[['Languages Taught', 'Programmes Offered', 'Special Education Classes', 'City']]
        .fillna('')
        .astype(str)
        .agg(', '.join, axis=1)
        .tolist()
    )
    school_embeddings = model.encode(school_texts)
    save_embeddings(school_embeddings_path, school_embeddings)
    print('School embeddings created and saved...')


# Compute semantic similarity (cosine similarity) between the query and every school
similarity_scores = util.pytorch_cos_sim(query_embedding, school_embeddings)

# Parameters for filtering
threshold = 0.80  # similarity threshold (decrease for more results)
top_k = 5  # number of top schools to select

# Rank schools from most to least similar and keep those above the threshold.
# The positions are converted to plain Python ints so that .iloc receives
# integers rather than 0-d tensors.
scores = similarity_scores[0]
sorted_indices = (-scores).argsort().tolist()
above_threshold = (scores > threshold).tolist()
filtered_indices = [i for i in sorted_indices if above_threshold[i]]

# Slicing already copes with fewer than top_k matches
top_school_indices = filtered_indices[:top_k]

if not top_school_indices:
    print("No schools match your preferences. Please adjust your criteria.")
else:
    recommended_schools = schools_df.iloc[top_school_indices]
    print("Recommended schools based on your preferences:")
    print(tabulate(recommended_schools, headers='keys', tablefmt='grid'))


Loaded school embeddings...
Recommended schools based on your preferences:
+----+-----------------------------------------+--------------------+---------------------------------------------+-----------------------------+----------+
|    | School Name                             | Languages Taught   | Programmes Offered                          | Special Education Classes   | City     |
| 19 | Mäkelänrinne Upper Secondary School     | Finnish            | General, Physical education                 | Yes                         | Helsinki |
+----+-----------------------------------------+--------------------+---------------------------------------------+-----------------------------+----------+
| 11 | Brändö gymnasium Upper Secondary School | Swedish            | General upper secondary, Physical education | Yes                         | Helsinki |
+----+-----------------------------------------+--------------------+---------------------------------------------+-------------------------

In [1]:
"""Using GPT-4V"""
import os
import requests
import base64
import matplotlib.pyplot as plt
import requests
from PIL import Image
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tabulate import tabulate
import pickle

schools_df = pd.read_excel('schools.xlsx')  # load schools dataset

# Unique terms in the dataset. Required to ensure correct output from the LLM.
# Blank cells are dropped and every value is cast to str so that these lists
# can be passed to ', '.join(...) when the system prompt is assembled; a NaN or
# non-text entry would make that join raise a TypeError.
languages_taught = schools_df['Languages Taught'].dropna().astype(str).unique().tolist()
programmes_offered = schools_df['Programmes Offered'].dropna().astype(str).unique().tolist()
special_education_classes = schools_df['Special Education Classes'].dropna().astype(str).unique().tolist()
cities = schools_df['City'].dropna().astype(str).unique().tolist()

# Configuration
# The API key is read from the GPT4V_KEY environment variable so the secret
# never has to be typed into (and saved with) the notebook. When the variable
# is unset the value falls back to the empty placeholder, and the request will
# be rejected by the service. The GPT-4V model was used for this experiment.
GPT4V_KEY = os.environ.get("GPT4V_KEY", "")

headers = {
    "Content-Type": "application/json",
    "api-key": GPT4V_KEY,
}

# Chat-completions endpoint of the GPT-4V deployment. Set the GPT4V_ENDPOINT
# environment variable to target a different resource/deployment.
GPT4V_ENDPOINT = os.environ.get(
    "GPT4V_ENDPOINT",
    "https://hhazure-openai-dev.openai.azure.com/openai/deployments/GPT4Vision/chat/completions?api-version=2023-07-01-preview",
)

# Your input message
input_message = "Hi. I am a fitness instructor, currently working in Hamberg, Germany. \
I want to visit a suitable Finnish school in Helsinki that offers physical education programmes in Finnish or Swedish, \
preferably with special education classes. My visit will be one week long."

# System prompt that restricts the extracted terms to the vocabulary found in
# the dataset. It is built from adjacent string literals, each ending in an
# explicit space, so that no two words get fused at a line break (the previous
# backslash continuation produced "preferences fora particular language").
system_message = (
    "A teacher applies for a school visit through an application form. "
    "In a simple description, he mentions his preferences for "
    "a particular language taught in the school, the programme of his interest, special education class, and city. "
    "Extract the required languages, programmes, special education class ('yes' or 'not available'), "
    "and city from the input message. The terms for all these four should come from the following lists: "
    f"Languages Taught: {', '.join(languages_taught)}; "
    f"Programmes Offered: {', '.join(programmes_offered)}; "
    f"Special Education Classes: {', '.join(special_education_classes)}; "
    f"Cities: {', '.join(cities)}. "
    "Do not make up any term. If you don't find a matching term, return 'not available' for its response. "
    "Your output should only be the extracted terms separated by a comma."
)

# Chat-completions request body: the system prompt constrains the vocabulary,
# the user message carries the free-text application.
payload = {
    "messages": [
        {
            "role": "system",
            "content": system_message
        },
        {
            "role": "user",
            "content": input_message
        }
    ],
    "max_tokens": 800
}

# Send the request and decode the body.
# - `timeout` keeps the cell from hanging forever when the service does not
#   answer (requests waits indefinitely by default).
# - Decoding happens inside the try block: a body that is not valid JSON
#   raises ValueError (or a subclass of it), which is reported the same way as
#   a transport or HTTP error.
try:
    response = requests.post(GPT4V_ENDPOINT, headers=headers, json=payload, timeout=60)
    response.raise_for_status()  # Raises an HTTPError for an unsuccessful status code.
    response_json = response.json()
except (requests.RequestException, ValueError) as e:
    raise SystemExit(f"Failed to make the request. Error: {e}")

# Extract the model's answer. Stop here when there is none: embedding a
# placeholder sentence instead would silently produce meaningless
# recommendations further down.
choices = response_json.get('choices') or []
search_query = (choices[0].get('message') or {}).get('content') if choices else None
if not search_query:
    raise SystemExit("The model returned no content to use as a search query.")

# Location of the pickle file that holds the school embeddings
school_embeddings_path = 'school_embeddings2.pkl'

# Sentence-transformer model used to embed the search query below
model = SentenceTransformer('all-MiniLM-L12-v2')

# Only the search query is encoded at this point
query_embedding = model.encode(search_query)

# Function to save embeddings
def save_embeddings(path, embeddings):
    """Pickle ``embeddings`` to ``path`` without ever leaving a partial file there.

    The data is written to a sibling ``<path>.tmp`` file first and only moved
    onto ``path`` (via ``os.replace``) once the dump has succeeded. Opening
    ``path`` directly in ``'wb'`` mode would truncate it immediately, so a
    failed or interrupted dump would leave an empty/partial file that a later
    "does the file exist?" check would mistake for a valid cache.

    Args:
        path: Destination file path for the pickle.
        embeddings: Any picklable object to store.

    Raises:
        OSError: If the temporary file cannot be written or moved into place.
        pickle.PicklingError: If ``embeddings`` cannot be pickled.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(embeddings, f)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the leftovers of a failed dump.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Function to load embeddings
def load_embeddings(path):
    """Return the object stored in the pickle file at ``path``.

    Note: unpickling can execute arbitrary code, so this should only be
    pointed at files from a trusted source.

    Args:
        path: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    with open(path, 'rb') as cache_file:
        stored = pickle.load(cache_file)
    return stored

# Reuse the cached school embeddings when they are readable and still line up
# with the dataset; otherwise (re)build and save them.
# NOTE: the cache is a pickle file, and unpickling can execute arbitrary code.
# Only keep cache files that this notebook wrote itself.
school_embeddings = None
if os.path.exists(school_embeddings_path):
    try:
        cached_embeddings = load_embeddings(school_embeddings_path)
    except (EOFError, pickle.UnpicklingError):
        # Truncated or corrupted cache file: fall through and rebuild it.
        cached_embeddings = None
    # The ranking below maps embedding row i to schools_df row i, so a cache
    # with a different number of rows would select the wrong schools (or
    # raise). A row-count match does not detect edited cell contents; delete
    # the cache file after changing the workbook.
    if cached_embeddings is not None and len(cached_embeddings) == len(schools_df):
        school_embeddings = cached_embeddings
        print('Loaded school embeddings...')
    else:
        print('Cached school embeddings do not match the dataset; rebuilding...')

if school_embeddings is None:
    # One descriptive string per school. Blank cells become '' and every value
    # is cast to str so that ', '.join cannot fail on NaN or non-text cells.
    school_texts = (
        schools_df[['Languages Taught', 'Programmes Offered', 'Special Education Classes', 'City']]
        .fillna('')
        .astype(str)
        .agg(', '.join, axis=1)
        .tolist()
    )
    school_embeddings = model.encode(school_texts)
    save_embeddings(school_embeddings_path, school_embeddings)
    print('School embeddings created and saved...')

# Compute semantic similarity between the query and every school
similarity_scores = util.pytorch_cos_sim(query_embedding, school_embeddings)

# Parameters for filtering
threshold = 0.80  # similarity threshold
top_k = 5  # number of top schools to select

# Rank schools from most to least similar and keep those above the threshold.
# The positions are converted to plain Python ints so that .iloc receives
# integers rather than 0-d tensors.
scores = similarity_scores[0]
sorted_indices = (-scores).argsort().tolist()
above_threshold = (scores > threshold).tolist()
filtered_indices = [i for i in sorted_indices if above_threshold[i]]

# Slicing already copes with fewer than top_k matches
top_school_indices = filtered_indices[:top_k]

if not top_school_indices:
    print("No schools match your preferences. Please adjust your criteria.")
else:
    recommended_schools = schools_df.iloc[top_school_indices]
    print("Recommended schools based on your preferences:")
    print(tabulate(recommended_schools, headers='keys', tablefmt='grid'))


Loaded school embeddings...
Recommended schools based on your preferences:
+----+-----------------------------------------+--------------------+---------------------------------------------+-----------------------------+----------+
|    | School Name                             | Languages Taught   | Programmes Offered                          | Special Education Classes   | City     |
| 19 | Mäkelänrinne Upper Secondary School     | Finnish            | General, Physical education                 | Yes                         | Helsinki |
+----+-----------------------------------------+--------------------+---------------------------------------------+-----------------------------+----------+
| 11 | Brändö gymnasium Upper Secondary School | Swedish            | General upper secondary, Physical education | Yes                         | Helsinki |
+----+-----------------------------------------+--------------------+---------------------------------------------+-------------------------