In [1]:
# Importing libraries
import nltk
import pandas as pd
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize

In [2]:
# Load PubMedQA dataset from parquet files
def load_pubmedqa_dataset(parquet_files):
    df_list = []
    for file in parquet_files:
        df_list.append(pd.read_parquet(file))
    return pd.concat(df_list, ignore_index=True)

In [3]:
# PubMedQA parquet file
parquet_file = ['0000.parquet']

In [4]:
# Load PubMedQA dataset
pubmedqa_df = load_pubmedqa_dataset(parquet_file)

In [5]:
pubmedqa_df.head(10)

Unnamed: 0,pubid,question,context,long_answer,final_decision
0,25429730,Are group 2 innate lymphoid cells ( ILC2s ) in...,{'contexts': ['Chronic rhinosinusitis (CRS) is...,"As ILC2s are elevated in patients with CRSwNP,...",yes
1,25433161,Does vagus nerve contribute to the development...,{'contexts': ['Phosphatidylethanolamine N-meth...,Neuronal signals via the hepatic vagus nerve c...,yes
2,25445714,Does psammaplin A induce Sirtuin 1-dependent a...,{'contexts': ['Psammaplin A (PsA) is a natural...,PsA significantly inhibited MCF-7/adr cells pr...,yes
3,25431941,Is methylation of the FGFR2 gene associated wi...,{'contexts': ['This study examined links betwe...,We identified a novel biologically plausible c...,yes
4,25432519,Do tumor-infiltrating immune cell profiles and...,{'contexts': ['Tumor microenvironment immunity...,Breast cancer immune cell subpopulation profil...,yes
5,25440440,Is hidradenitis suppurativa a systemic disease...,{'contexts': ['Hidradenitis suppurativa (HS) i...,Control subjects were not validated for absenc...,yes
6,25444977,Does reference range for serum and salivary te...,{'contexts': ['The interassay variability foun...,"In men from the Mediterranean region, values o...",yes
7,25429648,Are secretory phospholipases A2 secreted from ...,{'contexts': ['Secretory phospholipases A2 (sP...,sPLA2 are secreted from ciliated cells and app...,yes
8,25424148,Is admission hyperglycemia associated with fai...,{'contexts': ['Hyperglycemia on admission is a...,"In patients with STEMI who undergo FT, admissi...",yes
9,25447560,Do systematic Reviews Published in Emergency M...,{'contexts': ['Publication bias compromises th...,Systematic reviews published in emergency medi...,no


In [6]:
pubmedqa_df.shape

(211269, 5)

In [7]:
pubmedqa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211269 entries, 0 to 211268
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   pubid           211269 non-null  int32 
 1   question        211269 non-null  object
 2   context         211269 non-null  object
 3   long_answer     211269 non-null  object
 4   final_decision  211269 non-null  object
dtypes: int32(1), object(4)
memory usage: 7.3+ MB


In [8]:
pubmedqa_df.describe()

Unnamed: 0,pubid
count,211269.0
mean,20341320.0
std,5302124.0
min,112569.0
25%,16987950.0
50%,21360560.0
75%,24741630.0
max,27868460.0


In [10]:
pubmedqa_df["question"].value_counts()

question
Does clotting factor concentrate given to prevent bleeding and bleeding-related complications in people with hemophilia A or B?                                                  3
Is serum human hepatocyte growth factor ( hHGF ) elevated in patients with metastatic gastric carcinoma?                                                                         2
Do the abdominal compartment syndrome in patients with burn injury?                                                                                                              2
Is microcirculation in the foot augmented by neuromuscular stimulation via the common peroneal nerve in different lower limb postures : a potential treatment for leg ulcers?    2
Are inflammatory markers unrelated to physical activity , performance , and functioning in hemodialysis?                                                                         2
                                                                                                

In [11]:
pubmedqa_df["long_answer"].value_counts()

long_answer
Single-center study.                                                                                                                                                                                                                                                                                                  8
Retrospective study.                                                                                                                                                                                                                                                                                                  5
Single-center, retrospective study.                                                                                                                                                                                                                                                                                   4
H                                                   

In [12]:
# Exploring the Questions and Answers columns
def questions_answers_stats(df):
    # Number of examples
    num_examples = len(df)
    
    # Question statistics
    question_lengths = df['question'].apply(lambda x: len(word_tokenize(x)))
    min_question_length = min(question_lengths)
    max_question_length = max(question_lengths)
    avg_question_length = np.mean(question_lengths)
    
    # Answer statistics
    answer_lengths = df['long_answer'].apply(lambda x: len(word_tokenize(x)))
    min_answer_length = min(answer_lengths)
    max_answer_length = max(answer_lengths)
    avg_answer_length = np.mean(answer_lengths)
    
    # Print statistics
    print("Number of examples:", num_examples)
    print("Question Statistics:")
    print("  Minimum question length:", min_question_length)
    print("  Maximum question length:", max_question_length)
    print("  Average question length:", avg_question_length)
    print("Answer Statistics:")
    print("  Minimum answer length:", min_answer_length)
    print("  Maximum answer length:", max_answer_length)
    print("  Average answer length:", avg_answer_length)

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Computing basic statistics
print("Basic Statistics:")
questions_answers_stats(pubmedqa_df)

Basic Statistics:
Number of examples: 211269
Question Statistics:
  Minimum question length: 4
  Maximum question length: 110
  Average question length: 16.345086122431592
Answer Statistics:
  Minimum answer length: 1
  Maximum answer length: 649
  Average answer length: 40.96555102736322


In [15]:
# Exploring the context column
def contexts(df):
    # Compute length of each context
    context_lengths = df['context'].apply(lambda x: len(x['contexts'][0].split()))

    # Descriptive statistics of context lengths
    print("Descriptive statistics of context lengths:")
    print("Minimum length:", min(context_lengths))
    print("Maximum length:", max(context_lengths))
    print("Mean length:", np.mean(context_lengths))

In [16]:
contexts(pubmedqa_df)

Descriptive statistics of context lengths:
Minimum length: 1
Maximum length: 661
Mean length: 50.267313235732644
