# spaCy

In [1]:
import pandas as pd
import numpy as np
import spacy

In [2]:
# Load the pre-trained spaCy pipeline once; `nlp` is the callable used further
# down to turn query and document text into vectors (via `.vector`).
nlp = spacy.load(name='en_core_web_lg')

In [3]:
# Read the whole document collection into memory. The context manager closes
# the file handle as soon as the read finishes (the bare open() never did).
with open('docs.txt') as f:
    doc_str = f.read()

# Every record is introduced by a ".I" marker, so splitting on it yields one
# chunk per record (plus whatever precedes the first marker).
docs = doc_str.split(".I")

In [4]:
# Convert every non-blank chunk of `docs` into an {"I", "W"} record: the piece
# before the ".W\n" marker is stored under "I", the piece right after it under
# "W", both stripped of surrounding whitespace.
docs_data = []
for chunk in docs:
    if not chunk.strip():
        continue  # ignore chunks that contain only whitespace
    pieces = chunk.split(".W\n")
    docs_data.append({"I": pieces[0].strip(), "W": pieces[1].strip()})

df_docs = pd.DataFrame(docs_data)
df_docs.head()

Unnamed: 0,I,W
0,1,correlation between maternal and fetal plasma ...
1,2,changes of the nucleic acid and phospholipid l...
2,3,surfactant in fetal lamb tracheal fluid . ...
3,4,placental and cord blood lipids.. comparison i...
4,5,free fatty acid concentration in maternal plas...


In [5]:
# Read the whole query collection into memory. The context manager closes the
# file handle as soon as the read finishes (the bare open() never did).
with open('queries.txt') as f:
    query_str = f.read()

# Every record is introduced by a ".I" marker, so splitting on it yields one
# chunk per record (plus whatever precedes the first marker).
queries = query_str.split(".I")

In [6]:
# Split each non-blank chunk of `queries` on the ".W\n" marker, then keep the
# first piece as "I" and the second piece as "W" (both whitespace-stripped).
query_pieces = (chunk.split(".W\n") for chunk in queries if chunk.strip())
queries_data = [
    {"I": pieces[0].strip(), "W": pieces[1].strip()}
    for pieces in query_pieces
]

df_queries = pd.DataFrame(queries_data)
df_queries.head()

Unnamed: 0,I,W
0,1,"the crystalline lens in vertebrates, including..."
1,2,the relationship of blood and cerebrospinal fl...
2,3,electron microscopy of lung or bronchi.
3,4,tissue culture of lung or bronchial neoplasms.
4,5,the crossing of fatty acids through the placen...


In [7]:
# Read the relevance judgements. The context manager closes the file handle
# as soon as the read finishes (the bare open() never did).
with open('relevance.txt') as f:
    relevance_str = f.read()

# One judgement per line; strip first so leading/trailing blank lines do not
# produce empty entries at either end of the list.
relevance = relevance_str.strip().split("\n")

In [8]:
# Parse every relevance line into its whitespace-separated numeric fields.
rows = [[float(field) for field in line.strip().split()] for line in relevance]

# Four fields are expected per line; only the first two (query id, document id)
# are kept, converted to integers.
df_relevance = (
    pd.DataFrame(rows, columns=["query", "doc", "col3", "col4"])
    .drop(columns=['col3', 'col4'])
    .astype(int)
)

# The identifiers were parsed as text; make them integers so they can be
# joined against the integer ids in df_relevance.
for frame in (df_docs, df_queries):
    frame['I'] = frame['I'].astype(int)

# Attach the document text to every (query, doc) pair ...
df_rele_doc = df_relevance.merge(df_docs, left_on='doc', right_on='I')

# ... then the query text. Both sides carry "I" and "W" columns at this point,
# so pandas suffixes them with _x (document side) and _y (query side).
df = (
    df_rele_doc
    .merge(df_queries, left_on='query', right_on='I')
    .rename(columns={'W_x': 'docs', 'W_y': 'queries'})
)

final_df = df[['docs', 'queries', 'doc', 'query']]

In [11]:
# Accumulators filled by the ranking loop: document texts in ranked order and,
# position for position, the similarity score of each one.
sorted_docs, relevance_scores = [], []

In [12]:
# For every query id, rank the documents paired with it in `final_df` by the
# cosine similarity between spaCy's query vector and each document vector, and
# append the ranked texts and scores to the two accumulators.

# Empty the accumulators in place first so that re-running this cell does not
# append a second copy of every result.
sorted_docs.clear()
relevance_scores.clear()

for i in range(1, len(df_queries) + 1):
    query_rows = final_df.loc[final_df['query'] == i]
    if query_rows.empty:
        # No (query, doc) pairs for this id: nothing to rank. Without this
        # guard the `.iloc[0]` lookup below raises IndexError.
        continue

    query = query_rows['queries'].iloc[0]
    documents = query_rows['docs'].to_list()

    # Embed the documents using spaCy's document vectors
    doc_vectors = [nlp(doc).vector for doc in documents]

    # Embed the query using spaCy's document vectors
    query_vector = nlp(query).vector
    # The query norm does not depend on the document, so compute it once.
    query_norm = np.linalg.norm(query_vector)

    # Calculate the cosine similarity between the query and each document
    similarities = []
    for doc_vector in doc_vectors:
        denominator = query_norm * np.linalg.norm(doc_vector)
        if denominator == 0:
            # An all-zero vector would give 0/0 = NaN, and NaN sort keys make
            # the ranking order undefined. Score it 0 instead, using the
            # vector's own scalar type so the score dtype stays uniform.
            similarities.append(query_vector.dtype.type(0))
        else:
            similarities.append(np.dot(query_vector, doc_vector) / denominator)

    # Rank the documents based on their similarity to the query (best first;
    # sorted() is stable, so ties keep their original relative order)
    ranked_docs = sorted(zip(documents, similarities), key=lambda pair: pair[1], reverse=True)

    for document, score in ranked_docs:
        sorted_docs.append(document)
        relevance_scores.append(score)

In [13]:
# One-column frame of the ranked document texts; the default column label 0
# is renamed to 'sorted_docs'.
sorted_docs_df = pd.DataFrame(sorted_docs).rename({0: 'sorted_docs'}, axis='columns')

In [14]:
# One-column frame of the similarity scores; the default column label 0 is
# renamed to 'relevance_scores'.
relevance_scores_df = pd.DataFrame(relevance_scores)
relevance_scores_df = relevance_scores_df.rename(columns={0: 'relevance_scores'})

In [15]:
# Place the three frames side by side; pandas lines the rows up by index label.
# NOTE(review): final_df keeps the row order produced by the merges, while
# sorted_docs_df / relevance_scores_df are in the order the ranking loop
# appended them (per query, highest similarity first). The 'doc'/'docs' values
# in a row therefore need not describe the same document as 'sorted_docs' in
# that row - confirm this side-by-side layout is what is intended.
ranking_df = pd.concat(
    [final_df, sorted_docs_df, relevance_scores_df],
    axis='columns',
)

In [16]:
# Persist the combined ranking table as CSV, leaving out the index column.
output_path = 'venwiz_spacy_rank.csv'
ranking_df.to_csv(output_path, index=False)