In [27]:
import urllib.request, json

file_path = '../LaMP_2.json'


with open(file_path, 'r') as json_file:
    data = json.load(json_file)


ratio = 0.6
start = 0
end = 5#len(data)

### Download & preparing data

In [28]:
from langchain.schema import Document

In [29]:
def get_docs(idx):
  content = []
  for i in range(len(data[idx]['profile'])):
      content.append(data[idx]['profile'][i]['text'])
  documents = [Document(page_content=text) for text in content]
  return documents

### BM25 retriever

In [30]:
from rank_bm25 import BM25Okapi
from collections import namedtuple

DocumentWithScore = namedtuple('DocumentWithScore', ['document', 'score'])

def bm25(documents, query, ratio):
    tokenized_corpus = [(doc.page_content).split(" ") for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split(" ")
    bm25_scores = bm25.get_scores(tokenized_query)

    onlydocs = [x.page_content for x in documents]
    documents_with_scores = [
        DocumentWithScore(document=doc, score=score)
        for doc, score in zip(onlydocs, bm25_scores)
    ]

    sorted_documents_with_scores = sorted(documents_with_scores, key=lambda x: x.score, reverse=True)

    top_n_documents_with_scores = sorted_documents_with_scores[:int(len(documents)*ratio)]

    document_instances = [Document(page_content=doc.document) for doc in top_n_documents_with_scores]
    return document_instances, top_n_documents_with_scores

### TF-IDF Retriever

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import namedtuple

DocumentWithScore = namedtuple('DocumentWithScore', ['document', 'score'])

def tfidf_retrieval_sklearn(documents, query, ratio):
    top_n = int(len(documents)*ratio)
    docs = [(doc.page_content) for doc in documents]
    vectorizer = TfidfVectorizer()

    tfidf_matrix = vectorizer.fit_transform(docs)

    query_vector = vectorizer.transform([query])

    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

    top_n_indices = similarity_scores.argsort()[0][-top_n:][::-1]

    top_n_documents = [docs[i] for i in top_n_indices]
    document_instances = [Document(page_content=doc) for doc in top_n_documents]

    top_n_documents_with_scores = [
        DocumentWithScore(document=docs[i], score=similarity_scores[0][i])
        for i in top_n_indices
    ]

    return document_instances, top_n_documents_with_scores

### Contriever msmarco

In [32]:
import torch
from transformers import AutoTokenizer, AutoModel
from collections import namedtuple

DocumentWithScore = namedtuple('DocumentWithScore', ['document', 'score'])
def contrievermsmarco(documents, query, ratio):
    content = [doc.page_content for doc in documents]
    top_n = int(len(content) * ratio)

   
    tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco')
    model = AutoModel.from_pretrained('facebook/contriever-msmarco')

    
    content_with_query = [query] + content

    
    inputs = tokenizer(content_with_query, padding=True, truncation=True, return_tensors='pt')

    
    outputs = model(**inputs)

   
    def mean_pooling(token_embeddings, mask):
        token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
        sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
        return sentence_embeddings

   
    query_embedding = mean_pooling(outputs[0][:1], inputs['attention_mask'][:1]).squeeze()

    
    document_embeddings = mean_pooling(outputs[0][1:], inputs['attention_mask'][1:]).squeeze()

    
    similarity_scores = torch.matmul(query_embedding, document_embeddings.T).tolist()

    
    document_with_scores = list(zip(content, similarity_scores))

   
    document_with_scores.sort(key=lambda x: x[1], reverse=True)
    document_with_scores = document_with_scores[:top_n]
    
    document_instances =  [Document(page_content=doc) for doc, _ in document_with_scores]
    top_n_documents_with_scores = [DocumentWithScore(document=doc, score=score) for doc, score in document_with_scores]
    
    return document_instances, top_n_documents_with_scores


### DistilBertTokenizer

In [33]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

def distil_bert(documents, query, ratio):
  
  tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
  model = DistilBertModel.from_pretrained('distilbert-base-uncased')

  encoded_query = tokenizer(query, return_tensors='pt')

  
  with torch.no_grad():
      query_embeddings = model(**encoded_query).last_hidden_state.mean(dim=1)


  document_texts = [doc.page_content for doc in documents]

  encoded_documents = tokenizer(document_texts, return_tensors='pt', padding=True, truncation=True)

  with torch.no_grad():
      document_embeddings = model(**encoded_documents).last_hidden_state.mean(dim=1)

  similarity_scores = torch.nn.functional.cosine_similarity(query_embeddings, document_embeddings, dim=1)

  ranked_documents = sorted(zip(documents, similarity_scores.tolist()), key=lambda x: x[1], reverse=True)


  top_n_documents_with_scores = ranked_documents[:int(len(documents)*ratio)]
  
  document_instances = [doc_tuple[0] for doc_tuple in top_n_documents_with_scores]
  
  top_n_documents_with_scores = [DocumentWithScore(document=doc.page_content, score=score)for doc, score in top_n_documents_with_scores]

  return document_instances, top_n_documents_with_scores


### Normalize Scores

In [34]:
from collections import namedtuple

DocumentWithScore = namedtuple('DocumentWithScore', ['document', 'score'])
def normalize_scores(document_with_scores):
    scores = [score for _, score in document_with_scores]
   
    score_range = max(scores) - min(scores)
    if score_range == 0:
        normalized_scores = [1.0] * len(scores)
    else:
        normalized_scores = [(score - min(scores)) / score_range for score in scores]

    normalized_document_with_scores = [ DocumentWithScore(document=doc, score=normalized_score) for (doc, _), normalized_score in zip(document_with_scores, normalized_scores)]

    return normalized_document_with_scores

### Combining retrievers and getting data

In [35]:
def filter_acc_ratio(ratio, result):
  top_index = int(len(result) * ratio)
  top_documents = result[:top_index]
  return top_documents

In [36]:
def combine(results):
  combined_list = []
  for sublist in results:
    combined_list.extend(sublist)
  return combined_list

In [37]:
from collections import OrderedDict
def union(documents_list):
  union_of_docs = list(OrderedDict.fromkeys(document.page_content for document in documents_list))
  return union_of_docs

### Putting retrieved profiles back in data


In [38]:
def put_retrieved_profiles(union_of_docs, idx):
  filtered_profiles = [profile for profile in data[idx]['profile'] if profile['text'] in union_of_docs]
  copy_of_data = (data[idx]).copy()
  copy_of_data['profile'] = filtered_profiles
  return copy_of_data

### LM

In [39]:

input_text = "Which category does this article relate to among the following categories? Just answer with the category name without further explanation. categories: [women, religion, politics, style & beauty, entertainment, culture & arts, sports, science & technology, travel, business, crime, education, healthy living, parents, food & drink] article: It's hard to find a restaurant that doesn't now place a little card at your table inquiring if the establishment was: (a) really awful; (b) tolerable; (c) sublime. You can use the following list to understand how to categorise: [{'text': 'The three make a trip of atypical opera themes, but no new opera brought the Met as much controversy as Klinghoffer.',   'category': 'culture & arts'.}, {'text': Though I might not subscribe to every prayer in the Siddur, I always use the synagogue time for my own prayer of thanks for being alive and the multitudes of blessings I enjoy.  I want to let the Lord know I haven't forgotten them.  Then, leaving the hall, the yarmulke still in place on my head, I head home feeling a little purer., 'category': 'religion', },{ 'text': 'I expected him to do well and show me some of his work. The money could be renewed for the second semester and the next year of college. Though I imagined that Mel would want to maintain periodic contact -- at least it was what I hoped -- I was wrong.', 'category': 'religion',}{ 'text': 'Although my mother swept away any feeling for her native land, I saw my trips as partly for her, maybe an effort to reconnect her to a land that only I wanted her to reconnect with.', 'category': 'travel'}]"
input_text_2 = "Which category does this article relate to among the following categories? Just answer with the category name without further explanation. categories: [women, religion, politics, style & beauty, entertainment, culture & arts, sports, science & technology, travel, business, crime, education, healthy living, parents, food & drink] article: It's hard to find a restaurant that doesn't now place a little card at your table inquiring if the establishment was: (a) really awful; (b) tolerable; (c) sublime. "


In [40]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

In [41]:
def LM(original_data):

  input_data = {
    'input': original_data['input'],
    'profile': [
        {'text': profile['text'], 'category': profile['category']} for profile in original_data['profile']
    ]
}

  question = input_data['input']
  context = ' '.join([f"{profile['category']}: {profile['text']}" for profile in input_data['profile']])
  input_text = f"question: {question} context: {context}"

 
  inputs = tokenizer(input_text, return_tensors="pt")
  outputs = model.generate(**inputs)

  
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return generated_text

### Majority Vote

In [42]:
def majority_vote(ratio, st, end):
  mv_output = []
  for idx in range(st, end):
    
    documents = get_docs(idx)
    query = (data[idx]['input'])
    index_of_query= (data[idx]['input']).find("article:")

    query = (data[idx]['input'])[index_of_query + len("article:"):].strip()
    bm25_result= bm25(documents, query, ratio)
    bm25_result_docs = bm25_result[0]
    tfidf_result = tfidf_retrieval_sklearn(documents, query, ratio)
    tfidf_result_docs = tfidf_result[0]
    contri_result = contrievermsmarco(documents,query, ratio)
    contri_result_docs = contri_result[0]
    all_top_docs = [bm25_result_docs, tfidf_result_docs, contri_result_docs]
    all_top_docs = combine(all_top_docs)
    all_top_docs = union(all_top_docs)
    new_data_mv = put_retrieved_profiles(all_top_docs, idx)
    val = LM(new_data_mv)
    mv_output.append(val)
  return mv_output

### Borda count

In [43]:
def borda_count(ranked_lists):
    borda_scores = {}

   
    for i, ranked_list in enumerate(ranked_lists):
        for j, document in enumerate(ranked_list):
            borda_scores[document] = borda_scores.get(document, 0) + (len(ranked_list) - j)

    
    final_ranking = sorted(borda_scores.keys(), key=lambda x: borda_scores[x], reverse=True)

    return final_ranking

In [44]:
def borda_count_fn(ratio, st, end):
  bc_output = []
  for idx in range(st, end):
    documents = get_docs(idx)

    query = (data[idx]['input'])
    index_of_query= (data[idx]['input']).find("article:")

    query = (data[idx]['input'])[index_of_query + len("article:"):].strip()
    bm25_result= bm25(documents, query, ratio)
    bm25_result_docs = bm25_result[0]
    tfidf_result = tfidf_retrieval_sklearn(documents, query, ratio)
    tfidf_result_docs = tfidf_result[0]
    contri_result = contrievermsmarco(documents,query, ratio)
    contri_result_docs = contri_result[0]
    bm25_result_onlyDocs = [(doc.page_content) for doc in bm25_result_docs]
    tfidf_result_onlyDocs = [(doc.page_content) for doc in tfidf_result_docs]
    contri_result_onlyDocs = [(doc.page_content) for doc in contri_result_docs]

    all_top_docs = [bm25_result_onlyDocs, tfidf_result_onlyDocs, contri_result_onlyDocs]
    result = borda_count(all_top_docs)
    
    new_data_bc = put_retrieved_profiles(result, idx)
    val = LM(new_data_bc)
    bc_output.append(val)
  return bc_output

### Scoring based

#### Parameters: average, weighted_average, max, min, product, rank_based,linear_combination


In [45]:
def scoring_based(ratio, st, end, parameter):
  scor_output = []
  for idx in range(st, end):
    documents = get_docs(idx)

    query = (data[idx]['input'])
    index_of_query= (data[idx]['input']).find("article:")

    query = (data[idx]['input'])[index_of_query + len("article:"):].strip()
    bm25_result= bm25(documents, query, ratio)
    tfidf_result = tfidf_retrieval_sklearn(documents, query, ratio)
    contri_result = contrievermsmarco(documents,query, ratio)
    bm25_results_docScore = normalize_scores(bm25_result[1])
    tfidf_results_docScore = normalize_scores(tfidf_result[1])
    contri_results_docScore = normalize_scores(contri_result[1])

    weight_bm25 = 0.4
    weight_tfidf = 0.3
    weight_contri = 0.3

    average_scores = [(doc.score + tfidf_results_docScore[i].score + contri_results_docScore[i].score) / 3 for i, doc in enumerate(bm25_results_docScore)]
    weighted_average_scores = [(doc.score * weight_bm25 + tfidf_results_docScore[i].score * weight_tfidf + contri_results_docScore[i].score * weight_contri) / 3 for i, doc in enumerate(bm25_results_docScore)]
    max_scores = [max(doc.score, tfidf_results_docScore[i].score, contri_results_docScore[i].score) for i, doc in enumerate(bm25_results_docScore)]
    min_scores = [min(doc.score, tfidf_results_docScore[i].score, contri_results_docScore[i].score) for i, doc in enumerate(bm25_results_docScore)]
    product_scores = [doc.score * tfidf_results_docScore[i].score * contri_results_docScore[i].score for i, doc in enumerate(bm25_results_docScore)]
    rank_based_scores = [
      sum(
          [
              sorted([doc.score, tfidf_results_docScore[i].score, contri_results_docScore[i].score]).index(doc.score),
              sorted([doc.score, tfidf_results_docScore[i].score, contri_results_docScore[i].score]).index(tfidf_results_docScore[i].score),
              sorted([doc.score, tfidf_results_docScore[i].score, contri_results_docScore[i].score]).index(contri_results_docScore[i].score)
          ]
      ) / 3
      for i, doc in enumerate(bm25_results_docScore)
  ]

    linear_combination_scores = [(doc.score * weight_bm25 + tfidf_results_docScore[i].score * weight_tfidf + contri_results_docScore[i].score * weight_contri) for i, doc in enumerate(bm25_results_docScore)]

    average_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, average_scores)]
    weighted_average_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, weighted_average_scores)]
    max_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, max_scores)]
    min_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, min_scores)]
    product_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, product_scores)]
    rank_based_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, rank_based_scores)]
    linear_combination_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, linear_combination_scores)]



    sorted_average_scored_documents = sorted(average_scored_documents, key=lambda x: x.score, reverse=True)

    sorted_weighted_average_scored_documents = sorted(weighted_average_scored_documents, key=lambda x: x.score, reverse=True)

    sorted_max_scored_documents = sorted(max_scored_documents, key=lambda x: x.score, reverse=True)

    sorted_min_scored_documents = sorted(min_scored_documents, key=lambda x: x.score, reverse=True)

    sorted_product_scored_documents = sorted(product_scored_documents, key=lambda x: x.score, reverse=True)

    sorted_rank_based_scored_documents = sorted(rank_based_scored_documents, key=lambda x: x.score, reverse=True)

    sorted_linear_combination_scored_documents = sorted(linear_combination_scored_documents, key=lambda x: x.score, reverse=True)



    sorted_average_scored_page_content = [doc_with_score.document for doc_with_score in sorted_average_scored_documents]

    sorted_weighted_average_scored_page_content = [doc_with_score.document for doc_with_score in sorted_weighted_average_scored_documents]

    sorted_max_scored_page_content = [doc_with_score.document for doc_with_score in sorted_max_scored_documents]

    sorted_min_scored_page_content = [doc_with_score.document for doc_with_score in sorted_min_scored_documents]

    sorted_product_scored_page_content = [doc_with_score.document for doc_with_score in sorted_product_scored_documents]

    sorted_rank_based_scored_page_content = [doc_with_score.document for doc_with_score in sorted_rank_based_scored_documents]

    sorted_linear_combination_scored_page_content = [doc_with_score.document for doc_with_score in sorted_linear_combination_scored_documents]

    mapping = {
    "average": sorted_average_scored_page_content,
    "weighted_average": sorted_weighted_average_scored_page_content,
    "max": sorted_max_scored_page_content,
    "min": sorted_min_scored_page_content,
    "product": sorted_product_scored_page_content,
    "rank_based": sorted_rank_based_scored_page_content,
    "linear_combination": sorted_linear_combination_scored_page_content
    }

    result = mapping[parameter] # change this accordingly
    new_data_scor = put_retrieved_profiles(result, idx)
    val = LM(new_data_scor)
    scor_output.append(val)
  return scor_output

### Re ranking

In [46]:
def re_rank_documents(bm25_results, query, percentage=0.5):
  num_relevant_documents = int(len(bm25_results) * percentage)

  relevant_bm25_documents = bm25_results[:num_relevant_documents]

  if(len(relevant_bm25_documents) == 1):
     re_ranked_results = relevant_bm25_documents
  else:
    relevant_bm25_documents = [Document(page_content=item.document) for item in relevant_bm25_documents]
    contriever_results_for_queries = contrievermsmarco(relevant_bm25_documents, query, 1.0)
    re_ranked_results = contriever_results_for_queries[1] 
   
  re_ranked_results.sort(key=lambda x: x.score, reverse=True)
  re_ranked_results_docs = [doc.document for doc in re_ranked_results]
  return re_ranked_results_docs


In [47]:
def re_rank(ratio, st, end):
  rr_output = []
  for idx in range(st, end):
    documents = get_docs(idx)

    query = (data[idx]['input'])
    index_of_query= (data[idx]['input']).find("article:")

    query = (data[idx]['input'])[index_of_query + len("article:"):].strip()
    bm25_result= bm25(documents, query, 0.6)
    bm25_result_docs = bm25_result[1]
    result = re_rank_documents(bm25_result_docs, query, 0.6)
    
    new_data_rr = put_retrieved_profiles(result, idx)
    val = LM(new_data_rr)
    rr_output.append(val)
  return rr_output

### True Output

In [48]:
with urllib.request.urlopen("https://ciir.cs.umass.edu/downloads/LaMP/LaMP_2/train/train_outputs.json") as url:
    true_output = json.load(url)
    true_output = true_output['golds']

### Ensemble together

In [49]:
def all_ensemble(rati0, st, end, parameter):
    scor_output = []
    bc_output = []
    mv_output = []
    for idx in range(st, end):
        documents = get_docs(idx)

        query = (data[idx]['input'])
        index_of_query= (data[idx]['input']).find("article:")

        query = (data[idx]['input'])[index_of_query + len("article:"):].strip()
        bm25_result= bm25(documents, query, ratio)
        tfidf_result = tfidf_retrieval_sklearn(documents, query, ratio)
        contri_result = contrievermsmarco(documents,query, ratio)
        bm25_results_docScore = normalize_scores(bm25_result[1])
        tfidf_results_docScore = normalize_scores(tfidf_result[1])
        contri_results_docScore = normalize_scores(contri_result[1])
        bm25_result_docs = bm25_result[0]
        tfidf_result_docs = tfidf_result[0]
        contri_result_docs = contri_result[0]
        bm25_result_onlyDocs = [(doc.page_content) for doc in bm25_result_docs]
        tfidf_result_onlyDocs = [(doc.page_content) for doc in tfidf_result_docs]
        contri_result_onlyDocs = [(doc.page_content) for doc in contri_result_docs]

        all_top_docs = [bm25_result_onlyDocs, tfidf_result_onlyDocs, contri_result_onlyDocs]
        bc_result = borda_count(all_top_docs)
        new_data_bc = put_retrieved_profiles(bc_result, idx)
        bc_val = LM(new_data_bc)
        bc_output.append(bc_val)

        
        all_top_docs = [bm25_result_docs, tfidf_result_docs, contri_result_docs]
        all_top_docs = combine(all_top_docs)
        all_top_docs = union(all_top_docs)
       
        new_data_mv = put_retrieved_profiles(all_top_docs, idx)
        mv_val = LM(new_data_mv)
        mv_output.append(mv_val)

        weight_bm25 = 0.4
        weight_tfidf = 0.3
        weight_contri = 0.3

        average_scores = [(doc.score + tfidf_results_docScore[i].score + contri_results_docScore[i].score) / 3 for i, doc in enumerate(bm25_results_docScore)]
        weighted_average_scores = [(doc.score * weight_bm25 + tfidf_results_docScore[i].score * weight_tfidf + contri_results_docScore[i].score * weight_contri) / 3 for i, doc in enumerate(bm25_results_docScore)]
        max_scores = [max(doc.score, tfidf_results_docScore[i].score, contri_results_docScore[i].score) for i, doc in enumerate(bm25_results_docScore)]
        min_scores = [min(doc.score, tfidf_results_docScore[i].score, contri_results_docScore[i].score) for i, doc in enumerate(bm25_results_docScore)]
        product_scores = [doc.score * tfidf_results_docScore[i].score * contri_results_docScore[i].score for i, doc in enumerate(bm25_results_docScore)]
        rank_based_scores = [
        sum(
            [
                sorted([doc.score, tfidf_results_docScore[i].score, contri_results_docScore[i].score]).index(doc.score),
                sorted([doc.score, tfidf_results_docScore[i].score, contri_results_docScore[i].score]).index(tfidf_results_docScore[i].score),
                sorted([doc.score, tfidf_results_docScore[i].score, contri_results_docScore[i].score]).index(contri_results_docScore[i].score)
            ]
        ) / 3
        for i, doc in enumerate(bm25_results_docScore)
    ]

        linear_combination_scores = [(doc.score * weight_bm25 + tfidf_results_docScore[i].score * weight_tfidf + contri_results_docScore[i].score * weight_contri) for i, doc in enumerate(bm25_results_docScore)]

        average_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, average_scores)]
        weighted_average_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, weighted_average_scores)]
        max_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, max_scores)]
        min_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, min_scores)]
        product_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, product_scores)]
        rank_based_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, rank_based_scores)]
        linear_combination_scored_documents = [DocumentWithScore(document=doc.document, score=score) for doc, score in zip(bm25_results_docScore, linear_combination_scores)]



        sorted_average_scored_documents = sorted(average_scored_documents, key=lambda x: x.score, reverse=True)

        sorted_weighted_average_scored_documents = sorted(weighted_average_scored_documents, key=lambda x: x.score, reverse=True)

        sorted_max_scored_documents = sorted(max_scored_documents, key=lambda x: x.score, reverse=True)

        sorted_min_scored_documents = sorted(min_scored_documents, key=lambda x: x.score, reverse=True)

        sorted_product_scored_documents = sorted(product_scored_documents, key=lambda x: x.score, reverse=True)

        sorted_rank_based_scored_documents = sorted(rank_based_scored_documents, key=lambda x: x.score, reverse=True)

        sorted_linear_combination_scored_documents = sorted(linear_combination_scored_documents, key=lambda x: x.score, reverse=True)



        sorted_average_scored_page_content = [doc_with_score.document for doc_with_score in sorted_average_scored_documents]

        sorted_weighted_average_scored_page_content = [doc_with_score.document for doc_with_score in sorted_weighted_average_scored_documents]

        sorted_max_scored_page_content = [doc_with_score.document for doc_with_score in sorted_max_scored_documents]

        sorted_min_scored_page_content = [doc_with_score.document for doc_with_score in sorted_min_scored_documents]

        sorted_product_scored_page_content = [doc_with_score.document for doc_with_score in sorted_product_scored_documents]

        sorted_rank_based_scored_page_content = [doc_with_score.document for doc_with_score in sorted_rank_based_scored_documents]

        sorted_linear_combination_scored_page_content = [doc_with_score.document for doc_with_score in sorted_linear_combination_scored_documents]

        mapping = {
        "average": sorted_average_scored_page_content,
        "weighted_average": sorted_weighted_average_scored_page_content,
        "max": sorted_max_scored_page_content,
        "min": sorted_min_scored_page_content,
        "product": sorted_product_scored_page_content,
        "rank_based": sorted_rank_based_scored_page_content,
        "linear_combination": sorted_linear_combination_scored_page_content
        }

        scor_result = mapping[parameter] # change this accordingly
        new_data_scor = put_retrieved_profiles(scor_result, idx)
        scor_val = LM(new_data_scor)
        scor_output.append(scor_val)
    return scor_output, bc_output, mv_output

### MAIN

In [50]:
scor_op, bc_op, mv_op = all_ensemble(ratio, start, end, "average")
rr_op = re_rank(ratio, start, end)

Token indices sequence length is longer than the specified maximum sequence length for this model (777 > 512). Running this sequence through the model will result in indexing errors


In [51]:
scor_op_wtavg = scoring_based(ratio, start, end, "weighted_average")
scor_op_max = scoring_based(ratio, start, end, "max")
scor_op_min = scoring_based(ratio, start, end, "min")
scor_op_prod = scoring_based(ratio, start, end, "product")
scor_op_rb = scoring_based(ratio, start, end, "rank_based")
scor_op_lc = scoring_based(ratio, start, end, "linear_combination")

In [None]:
dbert_output = []
for idx in range(start, end):
  documents = get_docs(idx)

  query = (data[idx]['input'])
  index_of_query= (data[idx]['input']).find("article:")

  query = (data[idx]['input'])[index_of_query + len("article:"):].strip()
  result = distil_bert(documents, query, ratio)
  new_data_bc = put_retrieved_profiles(result, idx)
  val = LM(new_data_bc)
  dbert_output.append(val)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
def process_item(item):
    
    processed_item = item.replace('[', '').replace(']', '')
    return processed_item

def matches(op, actual, start, end):
  cnt = 0
  for i in range(start, end):
    op[i] = process_item(op[i])
    if(op[i] == actual[i]):
      cnt+=1
  return cnt

In [None]:
ids = []
for i in range(start, end):
  ids.append(true_output[i]['id'])

true_op = []
for i in range(start, end):
  true_op.append(true_output[i]['output'])
  
all_models = [mv_op, bc_op, scor_op, rr_op, dbert_output, true_op]
all_models_str = ["mv_op", "bc_op", "scor_op", "rr_op", "dbert_output", "true_op"]

for i in range(len(all_models)):
  op = {
      "task": "LaMP_2",
      "golds": [{"id": id_value, "output": output_value} for id_value, output_value in zip(ids, all_models[i])]
  }
  with open(f'{all_models_str[i]}_d2_p1.json', 'w') as json_file:
      json.dump(op, json_file, indent=2)


OSError: [Errno 63] File name too long: "['religion', 'style & beauty', 'style & beauty', 'style & beauty', 'style & beauty', 'style & beauty', '[political]', 'religion', '[women]', '[women]', 'religion', 'religion', '[women]', 'style & beauty --', 'style & beauty --', 'style & beauty --', 'style & beauty --', 'style & beauty --', 'style & beauty --', 'style & beauty --', 'style & beauty', 'style & beauty --', 'style & beauty --', 'style & beauty --', 'style & beauty --']_d2_p1.json"

In [1]:
# print("Majority Vote Accuracy: ", accuracy_score(true_op, mv_op), "%")
# print("Borda Count Accuracy: ",  accuracy_score(true_op, bc_op), "%")
# print("Re ranking Accuracy: ", accuracy_score(true_op, rr_op), "%")
# print("Scoring Based Accuracy: ",  accuracy_score(true_op, scor_op), "%")
# print("Scoring Based wt avg: ", accuracy_score(true_op, scor_op_wtavg), "%")
# print("Scoring Based max: ", accuracy_score(true_op, scor_op_max), "%")
# print("Scoring Based min: ", accuracy_score(true_op, scor_op_min), "%")
# print("Scoring Based lc: ",accuracy_score(true_op, scor_op_lc), "%")
# print("Scoring Based prod: ", accuracy_score(true_op, scor_op_prod), "%")
# print("Scoring Based rb: ", accuracy_score(true_op, scor_op_rb), "%")
# print("Distil Bert Accuracy: ", accuracy_score(true_op, dbert_output))



Majority Vote Accuracy: 62.90%
Borda Count Accuracy: 64.59%
Re ranking Accuracy: 67.80%
Scoring Based Accuracy: 68.16%
Scoring Based wt avg: 70.40%
Scoring Based max: 46.70%
Scoring Based min: 68.90%
Scoring Based lc: 64.20%
Scoring Based prod: 55.84%
Scoring Based rb: 63.70%
Distil Bert Accuracy: 73.20
