In [267]:
!pip install pandas numpy



In [268]:
import gzip
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO

import numpy as np
import pandas as pd
import requests

In [269]:
def _joined_text(elem):
    """Return all text inside ``elem``, including text nested in child tags.

    Returns None when ``elem`` is None or contains no non-whitespace text.
    """
    if elem is None:
        return None
    text = ''.join(elem.itertext()).strip()
    return text or None


def _abstract_text(article_elem):
    """Join every ``Abstract/AbstractText`` section of an article with spaces.

    Returns None when the article has no abstract text at all.
    """
    sections = (
        _joined_text(section)
        for section in article_elem.findall('.//Abstract/AbstractText')
    )
    joined = ' '.join(section for section in sections if section)
    return joined or None


def parse_articles_from_gz(number):
    """Download one PubMed baseline file and extract its articles.

    Args:
        number: File number; it is zero-padded to four digits to build the URL.

    Returns:
        A list of ``(pmid, title, abstract)`` tuples, one per ``PubmedArticle``
        element. Any field that is missing or empty is None.

    Errors are handled best-effort: any exception raised while downloading or
    parsing is printed, and whatever was collected up to that point is
    returned (possibly an empty list).
    """
    number_str = str(number).zfill(4)
    url = f"https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n{number_str}.xml.gz"
    print(f"Downloading {url} ...")
    articles = []

    try:
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()
        with gzip.GzipFile(fileobj=BytesIO(response.content)) as f:
            for _event, elem in ET.iterparse(f, events=('end',)):
                if elem.tag != 'PubmedArticle':
                    continue

                # itertext() keeps text that follows inline markup such as
                # <i>/<sub>, which a plain ``.text`` lookup would drop, and
                # _abstract_text keeps every section of a structured abstract.
                articles.append((
                    _joined_text(elem.find('.//PMID')),
                    _joined_text(elem.find('.//ArticleTitle')),
                    _abstract_text(elem),
                ))

                # Release the article's subtree once it has been read.
                elem.clear()
    except Exception as e:
        print(f"Failed to process {url}: {e}")

    return articles


In [270]:
def collect_all_articles(numbers=None, max_workers=10):
    """Download and parse several PubMed baseline files concurrently.

    Args:
        numbers: Iterable of file numbers to fetch. Defaults to 1 through 1274.
        max_workers: Size of the thread pool used for the downloads.

    Returns:
        One flat list with the tuples returned by ``parse_articles_from_gz``
        for every requested file, in the same order as ``numbers``.
    """
    if numbers is None:
        numbers = range(1, 1275)  # From 0001 to 1274

    all_articles = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so the combined
        # list (and any file written from it) is the same on every run.
        for articles in executor.map(parse_articles_from_gz, numbers):
            all_articles.extend(articles)

    return all_articles


In [None]:
# Fetch every baseline file, tabulate the parsed tuples, and save them as CSV.
all_articles = collect_all_articles()

article_columns = ["PMID", "ArticleTitle", "Abstract"]
df = pd.DataFrame(data=all_articles, columns=article_columns)
print(f"Total articles collected: {len(df)}")

# index=False keeps the row index out of the file.
df.to_csv(path_or_buf='pubmed_articles_parallel.csv', index=False)

In [279]:
from preprocess_utils import get_topics_pids_df

# Input directories handed to get_topics_pids_df for each split.
training_data_paths = [
    "data/DATA2019/Training/DTA",
    "data/DATA2019/Training/Intervention",
]
test_data_paths = [
    "data/DATA2019/Testing/DTA",
    "data/DATA2019/Testing/Intervention",
]

initial_training_df = get_topics_pids_df(training_data_paths)
initial_test_df = get_topics_pids_df(test_data_paths)

# PID is cast to str so it can be joined against the string PMID column later.
for topics_df in (initial_training_df, initial_test_df):
    topics_df['PID'] = topics_df['PID'].astype(str)

Could not parse topic CD011134


In [280]:
# Read PMID as text directly: letting pandas infer a numeric dtype and casting
# afterwards turns the ids into '123.0' (and missing ids into 'nan') whenever
# the column contains a missing value.
articles1 = pd.read_csv('data/articles/articles_test.csv', sep=',', dtype={'PMID': str})
articles2 = pd.read_csv('data/articles/random_articles.csv', sep=',', dtype={'PMID': str})

# A PMID present in both files would otherwise appear twice here and multiply
# rows in any later left merge on PMID, so only its first occurrence is kept.
articles = (
    pd.concat([articles1, articles2], ignore_index=True)
    .drop_duplicates(subset='PMID')
    .reset_index(drop=True)
)

In [281]:
def _attach_article_text(topics_df):
    """Join topic/PID rows with ``articles`` and mark every kept row relevant.

    Rows are left-merged on PID == PMID, rows without a title or abstract are
    dropped, and the article columns are exposed as ``article_title`` and
    ``abstract`` next to a constant ``relevance`` of 1.
    """
    merged = pd.merge(topics_df, articles, left_on='PID', right_on='PMID', how='left')
    with_text = merged.dropna(subset=['ArticleTitle', 'Abstract'])
    labelled = with_text.assign(
        relevance=1,
        article_title=with_text['ArticleTitle'],
        abstract=with_text['Abstract'],
    )
    # The original article columns are no longer needed once copied.
    return labelled.drop(columns=['PMID', 'ArticleTitle', 'Abstract'])


train_df = _attach_article_text(initial_training_df)
test_df = _attach_article_text(initial_test_df)

In [None]:
# We need to generate misfitting articles: articles whose PMID is not among
# the PIDs of the labelled frame they are added to.

def produce_irrelevant_citations(df, article_pool=None, random_state=42):
    """Append one randomly chosen non-matching article per row of ``df``.

    Every row of ``df`` is copied (in shuffled order) and its ``PID``,
    ``article_title`` and ``abstract`` are replaced with those of a sampled
    article whose PMID does not occur in ``df['PID']``; the copy gets
    ``relevance`` 0. This naively assumes a randomly picked article is not
    relevant to the topic it is paired with.

    Args:
        df: Labelled frame with at least a ``PID`` column.
        article_pool: Frame with ``PMID``, ``ArticleTitle`` and ``Abstract``
            columns to sample from. Defaults to the notebook-level ``articles``.
        random_state: Seed for both sampling steps, so repeated runs produce
            the same output. Pass None for unseeded sampling.

    Returns:
        A new frame: the rows of ``df`` followed by the same number of
        negative rows, with a fresh 0..n-1 index.

    Raises:
        ValueError: If the pool holds fewer eligible articles than ``df`` has rows.
    """
    if article_pool is None:
        article_pool = articles

    # Require a title as well as an abstract: a missing title on negatives
    # only would let a model separate the classes by that alone.
    possible_misfits = article_pool[
        ~article_pool['PMID'].isin(df['PID'])
        & article_pool['ArticleTitle'].notna()
        & article_pool['Abstract'].notna()
    ]

    n_rows = len(df)
    if len(possible_misfits) < n_rows:
        raise ValueError(
            f"Need {n_rows} irrelevant articles but only "
            f"{len(possible_misfits)} candidates are available"
        )

    # One generator shared by both draws keeps them reproducible yet independent.
    rng = np.random.RandomState(random_state)
    irrelevant = df.sample(n=n_rows, random_state=rng).reset_index(drop=True)
    misfits = possible_misfits.sample(n=n_rows, random_state=rng).reset_index(drop=True)

    # Both frames now share a 0..n-1 index; assign by position to make that explicit.
    irrelevant['article_title'] = misfits['ArticleTitle'].to_numpy()
    irrelevant['abstract'] = misfits['Abstract'].to_numpy()
    irrelevant['relevance'] = 0
    irrelevant['PID'] = misfits['PMID'].to_numpy()

    return pd.concat([df, irrelevant], ignore_index=True)


# Add the sampled negatives to each split and write it out, train first.
for labelled_df, out_path in ((train_df, 'data/train.csv'), (test_df, 'data/test.csv')):
    produce_irrelevant_citations(labelled_df).to_csv(out_path, index=False)