## Libraries and Data Downloading

In [1]:
import os
import glob
import pickle
import pandas as pd

# Downloading the Clinical trials 2021 collection [Start Here]

The file is about 2 GB; downloading it takes roughly 20 seconds.

In [2]:
!gdown --id 1oi3mnz6PQVt-tEMR6IQnqC0ab9IZ1iXx

Access denied with the following error:



 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=1oi3mnz6PQVt-tEMR6IQnqC0ab9IZ1iXx 



# Load the Dataframe

Loading the ~2 GB file takes about 18 seconds and yields 375,580 documents.

In [3]:
import pandas as pd
import pickle

# Location of the pickled collection on disk
pickle_file_path = '/content/extracted_information.pkl'

# The file is read as a sequence of pickled batches: pickle.load is called
# repeatedly until EOFError signals the end of the stream, and every batch is
# appended to one flat list of records.
with open(pickle_file_path, 'rb') as file:
    extracted_data = []
    more_batches = True
    while more_batches:
        try:
            document = pickle.load(file)
        except EOFError:
            more_batches = False
        else:
            extracted_data.extend(document)

# Build the DataFrame from the collected records
df = pd.DataFrame(extracted_data)

# Preview the first rows and report how many documents were loaded
display(df.head())
document_total = len(df)
print(f'Total number of documents: {document_total}')

FileNotFoundError: [Errno 2] No such file or directory: '/content/extracted_information.pkl'

# Analyze the data


In [None]:
# Show every column (and row) of the summary, but only for this one display.
# pd.set_option would change the setting for the whole session, so every later
# DataFrame output (e.g. the search results) would be rendered untruncated.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    display(df.describe())

### Count how many times a term appears in the dataframe

In [None]:
import re

# Replace 'target_term' with the term you want to search for
target_term = 'target_term'

# Series.str.count interprets its argument as a regular expression, so the term is
# escaped to count it literally (characters such as '(', '+' or '.' would otherwise
# change the match or raise an error).
# Rows without a description give NaN counts; sum() skips them but then returns a
# float, so the total is cast to int before it is reported.
term_count = int(df['detailed_description'].str.count(re.escape(target_term)).sum())
print(f"The term '{target_term}' appears {term_count} times in the 'detailed_description'.")

### Display unique values of a DataFrame

In [None]:
# Collect the distinct values of the 'study_type' column and print them under a heading
unique_values = df['study_type'].unique()
print("Unique values in 'study_type':", unique_values, sep="\n")

### Display some information

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
# Resources needed by word_tokenize and the English stop-word list.
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource
# instead of 'punkt', so both are fetched to work across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

def plot_top_words_frequency(data_frame, column_name, top_n=13):
    """Plot a horizontal bar chart of the most frequent words in a text column.

    The column's values are joined, lower-cased and tokenized; English stop
    words and punctuation-only tokens are discarded before counting.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the text to analyse.
    column_name : str
        Name of the column whose values are tokenized.
    top_n : int, default 13
        Maximum number of words shown in the chart.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``; if no word is left after
        filtering, a message is printed instead.
    """
    # Missing values are floats (NaN) and would make str.join raise TypeError,
    # so drop them and coerce the remaining values to str before joining.
    texts = data_frame[column_name].dropna().astype(str)
    text_data = " ".join(texts)

    # Tokenization and preprocessing
    tokens = word_tokenize(text_data.lower())
    stop_words = set(stopwords.words("english"))
    punctuation = set(string.punctuation)
    # Testing every character (instead of `token in string.punctuation`, which
    # is a substring test) also removes multi-character punctuation tokens such
    # as "``", "''" or "..." that the tokenizer emits.
    tokens = [
        token for token in tokens
        if token not in stop_words
        and not all(char in punctuation for char in token)
    ]

    # Keep the top N most common words
    top_words = Counter(tokens).most_common(top_n)
    if not top_words:
        # Empty column, or nothing but stop words/punctuation: nothing to draw.
        print(f"No words to plot for '{column_name}'.")
        return

    # Split the (word, count) pairs into the two plotting axes
    words = [word for word, _ in top_words]
    frequencies = [freq for _, freq in top_words]

    # Set seaborn style
    sns.set(style="whitegrid")

    # Horizontal bars: counts on x, words on y
    plt.figure(figsize=(10, 6))
    sns.barplot(x=frequencies, y=words, palette="Blues_d")
    plt.xlabel("Frequency")
    plt.ylabel("Words")
    plt.title(f"Top {top_n} Most Common Words in '{column_name}'")
    plt.tight_layout()

    # Show the plot
    plt.show()

# Plot the most frequent words of the 'study_design_info_primary_purpose' column
plot_top_words_frequency(df, 'study_design_info_primary_purpose')

# Indexing the collection

Installing PyTerrier

In [None]:
!pip install python-terrier
import pyterrier as pt
pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

if not pt.started():
  pt.init()

from pyterrier.measures import *

Creating a field to index per document

In [None]:
# List the DataFrame's column names (useful for choosing the field to index)
print(df.columns)


In [None]:
# Expose the trial identifier under the name "docno", which the indexing cell
# passes to the indexer as the document identifier.
# The bare `df[['detailed_description', 'nct_id']]` expression that used to precede
# this line was removed: its result was discarded, and it raised KeyError when the
# cell was re-run after the rename. rename() itself ignores labels that are absent,
# so this cell can now be executed repeatedly.
df = df.rename(columns={"nct_id": "docno"})
# Uncomment to work with only the first 1000 documents while experimenting
#df=df.head(1000)

In the following cell, you can index the dataframe's documents. The index, with all its data structures, is written into a directory called `index_sampledocs`.

[10 minutes to index the whole Collection]

In [None]:
# Build a Terrier index from the DataFrame.

## Where the index is written (./index_sampledocs); overwrite=True replaces an existing index there
## stemmer=None is passed, so no stemming is requested for this index.
## NOTE(review): `stopwords=True` does not appear among the values PyTerrier documents for this
## argument (None, a TerrierStopwords value, or a list of words) - confirm it behaves as intended.
indexer = pt.DFIndexer("./index_sampledocs", overwrite=True, stemmer= None, stopwords=True) # stemming disabled (alternative: Porter stemmer)

## Index the 'detailed_description' text, with 'docno' passed alongside it for each document;
## the call returns a reference to the index that was written
index_ref = indexer.index(df["detailed_description"], df["docno"])
print(index_ref.toString())

## Printing the files related to the index
!ls -lh index_sampledocs/

Printing some statistics

In [None]:
# Open the index through its reference and print the collection statistics
index = pt.IndexFactory.of(index_ref)
print(index.getCollectionStatistics().toString())
# Print the keys stored in the index's meta index
print(index.getMetaIndex().getKeys())

Ok, so this object refers to Terrier's [`Index`](http://terrier.org/docs/current/javadoc/org/terrier/structures/Index.html) type. Check the linked Javadoc – you will see that this Java object has methods such as:
 - `getCollectionStatistics()`
 - `getInvertedIndex()`
 - `getLexicon()`

In [None]:
# Same call as in the statistics cell above; it re-binds `index` to the index behind index_ref
index = pt.IndexFactory.of(index_ref)

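Terrier removes standard stopwords and applies Porter's stemmer by default. Note that the indexer in this notebook is created with `stemmer=None`, so terms are not stemmed here.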

Further:
 - `Nt` is the number of unique documents that each term occurs in – this is useful for calculating IDF.
 - `TF` is the total number of occurrences – some weighting models use this instead of Nt.
 - The numbers in the `@{}` are a pointer – they tell Terrier where the postings are for that term in the inverted index data structure.



## Getting term statistics:
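One can use the square bracket notation to look up a single term in Terrier's lexicon; the cell below instead iterates over the lexicon and prints terms together with their statistics: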


In [None]:
from itertools import islice

print(index.getCollectionStatistics().toString())

# The loop used to print every entry of the lexicon, which floods the notebook
# output for an index of the full collection. Only the first entries are shown
# as a sample; raise LEXICON_PREVIEW_SIZE to inspect more.
LEXICON_PREVIEW_SIZE = 20
for kv in islice(index.getLexicon(), LEXICON_PREVIEW_SIZE):
    print("%s -> %s" % (kv.getKey(), kv.getValue().toString()))

### Searching an Index

One way to search in PyTerrier is called `BatchRetrieve`. BatchRetrieve is configured by specifying an index and a weighting model (`TF_IDF` in our example).

MODELS: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html

In [None]:
# Build a retriever over the index that scores documents with the TF_IDF weighting model
br = pt.BatchRetrieve(index, wmodel="TF_IDF")
# search() is used because a single, manually provided query is run
br.search("injury")

So the `search()` method returns a dataframe with columns:
 - `qid`: this is by default "1", since it's our first and only query
 - `docid`: Terrier's internal integer for each document
 - `docno`: the external (string) unique identifier for each document
 - `score`: since we use the `TF_IDF` weighting model, this is the TF-IDF relevance score of the document for the query terms (higher means a better match)
 - `rank`: A handy attribute showing the descending order by score
 - `query`: the input query

### Experiment with Query Language

- Based on the query language supported by Terrier, create 2-3 ad hoc queries (by hand).
- https://github.com/terrier-org/terrier-core/blob/5.x/doc/querylanguage.md

- Use different relevance models and different query languages to
search for your queries in the collection.
- http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html

# Loading queries and qrels

We will use ir_datasets to obtain the queries and the qrels
https://ir-datasets.com/

In [None]:
"""pip install beautifulsoup4"""


In [None]:
"""import xml.etree.ElementTree as ET
import pandas as pd

# XML data
xml_data = '''
<topics task="2021 TREC Clinical Trials">
    <topic number="1"> A 19-year-old male came to clinic with some sexual concern. He recently engaged in a relationship and is worried about the satisfaction of his girlfriend. He has a "baby face" according to his girlfriend's statement and he is not as muscular as his classmates. On physical examination, there is some pubic hair and poorly developed secondary sexual characteristics. He is unable to detect coffee smell during the examination, but the visual acuity is normal. Ultrasound reveals the testes volume of 1-2 ml. The hormonal evaluation showed serum testosterone level of 65 ng/dL with low levels of GnRH. </topic>
    <topic number="2"> A 32-year-old woman comes to the hospital with vaginal spotting. Her last menstrual period was 10 weeks ago. She has regular menses lasting for 6 days and repeating every 29 days. Medical history is significant for appendectomy and several complicated UTIs. She has multiple male partners, and she is inconsistent with using barrier contraceptives. Vital signs are normal. Serum β-hCG level is 1800 mIU/mL, and a repeat level after 2 days shows an abnormal rise to 2100 mIU/mL. Pelvic ultrasound reveals a thin endometrium with no gestational sac in the uterus. </topic>
</topics>
'''

# Parse XML data
root = ET.fromstring(xml_data)

# Extract topics into a DataFrame
topics_data = []
for topic_elem in root.findall('.//topic'):
    number = topic_elem.get('number')
    text = topic_elem.text.strip()
    topics_data.append({'number': number, 'text': text})

# Create DataFrame
topics_df = pd.DataFrame(topics_data)

# Display the DataFrame
print(topics_df)
"""

In [None]:
"""# Queries
#!pip install --upgrade ir_datasets
dataset = pt.get_dataset(topics_df)
queries = dataset.get_topics(variant='text')
display(queries)

#Qrels
!gdown --id 1RYHxr2sM9Hd2C2iRI_NXzO4RY71Adu-p

path_to_qrels = 'clinical_qrels22.txt'
qrels = pd.read_csv(path_to_qrels, names=['qid','Q0','docno','label'],sep=" ",header=None)
qrels = qrels.drop(columns=['Q0'])
qrels["qid"] = qrels["qid"].astype(str)
qrels["docno"] = qrels["docno"].astype(str)
display(qrels.head(2))
"""

In [None]:
# Queries
!pip install --upgrade ir_datasets
dataset = pt.get_dataset('irds:clinicaltrials/2021/trec-ct-2022')
queries = dataset.get_topics(variant='text')
display(queries)

#Qrels
!gdown --id 1RYHxr2sM9Hd2C2iRI_NXzO4RY71Adu-p

path_to_qrels = 'clinical_qrels22.txt'
qrels = pd.read_csv(path_to_qrels, names=['qid','Q0','docno','label'],sep=" ",header=None)
qrels = qrels.drop(columns=['Q0'])
qrels["qid"] = qrels["qid"].astype(str)
qrels["docno"] = qrels["docno"].astype(str)
display(qrels.head(2))

# Putting everything together

In [None]:
## Set a retrieval model or pipeline
# `bm25` was passed to the experiment without being defined anywhere in the notebook,
# so the cell failed with NameError. It is built here over the Terrier index, in the
# same way as the TF_IDF retriever used in the search example.
# NOTE(review): the topic texts are long free-text descriptions; if Terrier's query
# parser rejects their punctuation, consider `pt.rewrite.tokenise() >> bm25` - confirm.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

## Evaluation Measures
#### Relevance labels of 2 are considered as relevant
metrics_eval = [
    RR(rel=2)@1000,
    P(rel=2)@1, P(rel=2)@5, P(rel=2)@10, P(rel=2)@25, P(rel=2)@30, P(rel=2)@75,
    Rprec(rel=2),
    R(rel=2)@10, R(rel=2)@25,
]

## Perform retrieval and evaluate
# baseline=0 designates the first system in the list as the baseline.
# perquery=False reports aggregated results rather than per-query values.
results_all_des = pt.Experiment(
    [bm25],
    queries,
    qrels,
    eval_metrics=metrics_eval,
    names=["bm25"],
    baseline=0,
    perquery=False,
)
display(results_all_des)

## ui

In [None]:
pip install flask


In [None]:
from flask import Flask, render_template, request
import pandas as pd
import pickle

# Flask application object that the route decorators below attach to
app = Flask(__name__)

# Load the DataFrame that the search handler queries.
# NOTE: 'path_to_your_pickle_file.pkl' is a placeholder and must be replaced with a real path.
# NOTE(review): this reads a single pickled object and uses it as a DataFrame, whereas the
# loading cell near the top of the notebook reads a series of pickled batches into a list
# first - confirm which format the file used here has.
# NOTE: these assignments rebind the notebook-level names `pickle_file_path` and `df`.
# SECURITY: pickle.load can execute arbitrary code; only open files from a trusted source.
pickle_file_path = 'path_to_your_pickle_file.pkl'
with open(pickle_file_path, 'rb') as file:
    df = pickle.load(file)

# Function to perform search
def perform_search(query):
    """Return the rows of ``df`` whose 'detailed_description' contains ``query``.

    The match is a case-insensitive, literal substring test.

    Parameters
    ----------
    query : str or None
        Text typed by the user. ``None`` or a blank string yields no results.

    Returns
    -------
    pandas.DataFrame
        The matching rows of the module-level ``df`` (possibly empty).
    """
    # A missing or blank query would otherwise raise (None) or match every row ('').
    if query is None or not query.strip():
        return df.iloc[0:0]

    # regex=False: the query is untrusted user input and must not be interpreted as a
    # regular expression (invalid patterns raise, and special characters change the match).
    # na=False: rows without a description count as non-matching instead of producing
    # NaN entries that cannot be used as a boolean mask.
    matches = df['detailed_description'].str.contains(query, case=False, regex=False, na=False)
    return df[matches]

# NOTE: defining `index` here rebinds the notebook-level name `index`, which earlier
# cells use for the Terrier index object; re-run the indexing cells before using it again.
@app.route('/')
def index():
    """Serve the landing page by rendering the 'index.html' template."""
    return render_template('index.html')

@app.route('/search', methods=['GET'])
def search():
    """Run a search for the ``query`` URL parameter and render the results page.

    The 'results.html' template receives the query text and the matching rows
    as a list of dicts (one per row).
    """
    # Default to an empty string so a request without ?query= never hands None to the search
    query = request.args.get('query', '')
    results = perform_search(query)
    return render_template('results.html', query=query, results=results.to_dict(orient='records'))

if __name__ == '__main__':
    # debug=True turns on Flask's auto-reloader by default, and the reloader tries to
    # restart the process, which does not work inside a notebook kernel. It is disabled
    # explicitly while the debugger stays on.
    # NOTE: the debugger allows code execution from the browser - use it for local development only.
    app.run(debug=True, use_reloader=False)
