# BERT Ranking

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
# Load the pre-trained BERT tokenizer and encoder from the same checkpoint
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Read the whole document collection; the context manager closes the file
# handle as soon as the text has been read.
with open('docs.txt') as f:
    doc_str = f.read()

# Records are delimited by the ".I" marker, so splitting on it yields one raw
# chunk per document (blank chunks are filtered out when the chunks are parsed).
docs = doc_str.split(".I")

In [4]:
# Turn each raw ".I" chunk into an {"I": id, "W": text} record
docs_data = []
for chunk in docs:
    if not chunk.strip():
        continue  # chunk holds nothing but whitespace
    # The id sits before the ".W" line, the body text after it
    fields = chunk.split(".W\n")
    docs_data.append({"I": fields[0].strip(), "W": fields[1].strip()})

df_docs = pd.DataFrame(docs_data)
df_docs.head()

Unnamed: 0,I,W
0,1,correlation between maternal and fetal plasma ...
1,2,changes of the nucleic acid and phospholipid l...
2,3,surfactant in fetal lamb tracheal fluid . ...
3,4,placental and cord blood lipids.. comparison i...
4,5,free fatty acid concentration in maternal plas...


In [5]:
# Read the whole query collection; the context manager closes the file
# handle as soon as the text has been read.
with open('queries.txt') as f:
    query_str = f.read()

# Records are delimited by the ".I" marker, so splitting on it yields one raw
# chunk per query (blank chunks are filtered out when the chunks are parsed).
queries = query_str.split(".I")

In [6]:
# Turn each raw ".I" chunk into an {"I": id, "W": text} record
queries_data = []
for chunk in queries:
    if not chunk.strip():
        continue  # chunk holds nothing but whitespace
    # The id sits before the ".W" line, the query text after it
    fields = chunk.split(".W\n")
    queries_data.append({"I": fields[0].strip(), "W": fields[1].strip()})

df_queries = pd.DataFrame(queries_data)
df_queries.head()

Unnamed: 0,I,W
0,1,"the crystalline lens in vertebrates, including..."
1,2,the relationship of blood and cerebrospinal fl...
2,3,electron microscopy of lung or bronchi.
3,4,tissue culture of lung or bronchial neoplasms.
4,5,the crossing of fatty acids through the placen...


In [7]:
# Read the relevance judgements; the context manager closes the file handle
# as soon as the text has been read.
with open('relevance.txt') as f:
    relevance_str = f.read()

# One judgement per line; strip first so a trailing newline does not yield an empty entry.
relevance = relevance_str.strip().split("\n")

In [8]:
# Every relevance line is a whitespace-separated record of four numbers
rows = [[float(field) for field in line.strip().split()] for line in relevance]

# Keep only the (query, doc) pair, as integer ids; the last two columns are unused
df_relevance = (
    pd.DataFrame(rows, columns=["query", "doc", "col3", "col4"])
    .drop(columns=['col3', 'col4'])
    .astype(int)
)

# Ids were parsed as strings, so cast them before they serve as join keys
df_docs['I'] = df_docs['I'].astype(int)
df_queries['I'] = df_queries['I'].astype(int)

# Attach the document text to each judgement first ...
df_rele_doc = df_relevance.merge(df_docs, left_on='doc', right_on='I')

# ... then the query text, renaming the suffixed text columns the merge produces
df = (
    df_rele_doc
    .merge(df_queries, left_on='query', right_on='I')
    .rename(columns={'W_x': 'docs', 'W_y': 'queries'})
)

final_df = df[['docs', 'queries', 'doc', 'query']]

In [9]:
# Accumulates one ranked list of document texts per query (filled by the ranking loop)
rank_list = []

In [10]:
# Reset here so that re-running this cell never appends a second copy of every ranking.
rank_list = []


def _cls_embedding(text):
    """Return the final-layer [CLS] vector BERT produces for `text`.

    The text is tokenized with special tokens and explicitly truncated to
    512 tokens, then passed through the model with gradients disabled.
    """
    input_ids = torch.tensor(
        tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512)
    ).unsqueeze(0)  # add a batch dimension of size 1
    with torch.no_grad():
        last_hidden_state = model(input_ids)[0]
    return last_hidden_state[0, 0, :]  # position 0 holds the [CLS] token


# Iterate over the queries that actually have judged documents, in ascending
# query id. Grouping avoids assuming ids are exactly 1..N and avoids an
# IndexError for a query with no rows in final_df.
for _, query_rows in final_df.groupby('query'):
    query = query_rows['queries'].iloc[0]
    documents = query_rows['docs'].to_list()

    # Score each document by the dot product of its [CLS] vector with the query's
    query_vector = _cls_embedding(query)
    similarity_scores = [torch.dot(query_vector, _cls_embedding(doc)).item() for doc in documents]

    # Sort the documents by descending similarity; ties keep their original order
    ranking = [doc for _, doc in sorted(zip(similarity_scores, documents), key=lambda pair: pair[0], reverse=True)]

    rank_list.append(ranking)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [11]:
# Concatenate the per-query rankings into a single flat list, keeping their order
ranked_docs = []
for per_query_ranking in rank_list:
    ranked_docs.extend(per_query_ranking)

In [12]:
# Total number of ranked documents across all queries
len(ranked_docs)

696

In [13]:
# Wrap the flat ranking in a one-column frame and give that column a readable name
ranked_docs_df = pd.DataFrame(ranked_docs)
ranked_docs_df = ranked_docs_df.rename({0: 'ranked_docs'}, axis='columns')

In [15]:
# The two frames are joined side by side purely by row position, so they must
# have the same length; otherwise concat would silently pad the shorter one with NaN.
assert len(final_df) == len(ranked_docs_df), (
    f"row count mismatch: final_df has {len(final_df)} rows, "
    f"ranked_docs_df has {len(ranked_docs_df)}"
)

# Reset both indexes so the alignment is positional regardless of how each frame was built.
final_ranking_df = pd.concat(
    [final_df.reset_index(drop=True), ranked_docs_df.reset_index(drop=True)],
    axis=1,
)

len(final_ranking_df)

696

In [16]:
# Persist the combined table as CSV, leaving out the DataFrame index column
final_ranking_df.to_csv('venwiz_bert_rank.csv', index=False)

In [None]:
# Preview the combined table (the frame is named final_ranking_df; `final_ranking` is undefined)
final_ranking_df.head()