In [1]:
import pickle
import marshal
import types
import torch
import transformers

# Function to get embedding for a new input question
def get_question_embedding(text, mtokenizer, mmodel, device):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    The pooling is weighted by the tokenizer's attention mask, so positions
    the tokenizer added as padding do not contribute to the average. For an
    input with no padding (every mask entry is 1) this is the ordinary mean
    over the token axis.

    Args:
        text: Input passed straight to ``mtokenizer`` (a question string in
            this notebook).
        mtokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors when called with
            ``return_tensors="pt"``.
        mmodel: Callable taking ``(input_ids, attention_mask=...)`` and
            returning an object with a ``last_hidden_state`` tensor.
            NOTE(review): the model is not moved to ``device`` here, so it
            presumably must already live there - confirm with the caller.
        device: Device the tokenized tensors are moved to before the forward
            pass.

    Returns:
        A NumPy array with the pooled embedding. A leading batch axis of
        size 1 is squeezed away; larger batches keep their batch axis.
    """
    inputs = mtokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = mmodel(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # Mask-aware average pooling: zero out padded positions, then divide
        # by the number of real tokens instead of the padded sequence length.
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        # Guard against an all-zero mask so the division never hits 0.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled = (last_hidden_state * mask).sum(dim=1) / token_counts

        sentence_embedding = pooled.squeeze(0).cpu().numpy()
    return sentence_embedding

def find_similar_questions(input_text, mtokenizer, mmodel,  pca_class, my_cosine, embed, device, train_sample, top_n=5):
    """Return the ``name`` column of the rows most similar to ``input_text``.

    The text is embedded with ``get_question_embedding``, projected with
    ``pca_class.transform``, and scored against ``embed`` with ``my_cosine``.
    Rows of ``train_sample`` are then selected positionally, highest score
    first.

    Args:
        input_text: Query text to embed.
        mtokenizer, mmodel, device: Forwarded to ``get_question_embedding``.
        pca_class: Object whose ``transform`` accepts the embedding reshaped
            to a single row.
        my_cosine: Callable ``(query, embed)`` whose result is indexed with
            ``[0]`` to obtain the score vector.
            NOTE(review): presumably one score per row of ``train_sample``,
            in the same order - confirm how ``embed`` was built.
        embed: Stored embeddings passed as the second argument of
            ``my_cosine``.
        train_sample: DataFrame with a ``name`` column, indexed positionally
            via ``iloc``.
        top_n: Maximum number of rows to return. ``0`` yields an empty frame.

    Returns:
        A one-column (``name``) DataFrame with at most ``top_n`` rows,
        ordered from most to least similar.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    input_embedding = get_question_embedding(input_text, mtokenizer, mmodel, device)

    # Reduce the dimensionality of the input embedding to match the stored embeddings
    input_embedding_reduced = pca_class.transform(input_embedding.reshape(1, -1))

    # Score the query against every stored embedding
    similarities = my_cosine(input_embedding_reduced, embed)

    # Sort descending first, then truncate. Slicing with [-top_n:] would return
    # *all* rows for top_n == 0 because -0 == 0.
    ranked_indices = similarities[0].argsort()[::-1]
    top_indices = ranked_indices[:top_n]

    similar_questions = train_sample.iloc[top_indices][['name']]
    return similar_questions

# Device selection is left to the caller; to prefer a GPU when available use:
#   'cuda' if torch.cuda.is_available() else 'cpu'

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load artifacts that were produced by this project and come from a
# trusted location.
EMBED_DIR = "./../../../pre_trained_model/embedding"


def _load_pickle(filename):
    """Unpickle ``filename`` from ``EMBED_DIR``, closing the file handle afterwards."""
    with open(f"{EMBED_DIR}/{filename}", 'rb') as fh:
        return pickle.load(fh)


my_model = _load_pickle("embed_model.pkl")
my_tokenizer = _load_pickle("embed_tokenizer.pkl")
my_cosine = _load_pickle("embed_cosine.pkl")
my_pca = _load_pickle("embed_pca.pkl")
my_embed = _load_pickle("embed_embeddings.pkl")
my_train_sample = _load_pickle("embed_train_sample.pkl")

ModuleNotFoundError: No module named 'torch'

In [2]:
# Query the stored question embeddings for the 20 nearest matches on CPU.
input_question = "three sum"
find_similar_questions(
    input_question,
    my_tokenizer,
    my_model,
    my_pca,
    my_cosine,
    my_embed,
    'cpu',
    my_train_sample,
    top_n=20,
)



Unnamed: 0,name
1791,Divide Intervals Into Minimum Number of Groups
1510,Longest Subsequence Repeated k Times
1472,Remove Stones to Minimize the Total
1062,Minimum Subsequence in Non-Increasing Order
1252,Minimum Initial Energy to Finish Tasks
1392,Minimum Interval to Include Each Query
1754,Design a Food Rating System
2281,Replace Question Marks in String to Minimize I...
1619,Minimum Difference in Sums After Removal of El...
1321,Maximum Score From Removing Stones
