In [1]:
import json
import math
import os
import pickle
import random
import re
import time
from collections import defaultdict
from random import sample

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.utils import shuffle
from tqdm.notebook import tqdm

In [2]:
# Root directory that the data and credential paths in this notebook are built from.
filepath = '.'

# Read the tool name / e-mail that the E-utilities requests send as `tool` and `email`.
credentials_path = filepath + '/credentials/pubmed_credentials.json'
with open(credentials_path, mode='r') as file:
    pubmed_credentials = json.loads(file.read())

# Create the pharmacy dataset

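Create the dataset as a `defaultdict` keyed by PMID. No default factory is supplied, so it behaves like a plain dict: the entry for a PMID has to be created explicitly before values are assigned to it.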

In [3]:
# No default_factory is given, so this container behaves like a plain dict:
# looking up a PMID that was never inserted still raises KeyError.
pharm_dataset = defaultdict(None)

Loop through the files from the Impact Pharmacie website and isolate the PMIDs with a regex. Each PMID becomes a dict key whose value is a defaultdict holding the URPP rating; the design, field and setting files then add one 0/1 flag per file.

In [4]:
# Raw string so that `\S`, `\s` and `\/` reach the regex engine untouched instead of
# being parsed as (invalid) Python string escapes. The compiled pattern is unchanged.
pmid_pattern = re.compile(r'(?<=<pmid>PMID)\S+(?=\s*<\/pmid>)')


def _read_pmids(path):
    """Return every PMID matched by `pmid_pattern` in the UTF-8 text file at `path`."""
    with open(path, mode='r', encoding='utf-8') as f:
        return pmid_pattern.findall(f.read())


def _flag_pmids_by_file(dataset, directory):
    """Add one 0/1 flag per file found in `directory` to the records of `dataset`.

    The flag name is the file name up to its first '.'. For each file, every record
    already present in `dataset` gets the flag set to 0, then every PMID found in the
    file gets it set to 1. A PMID that is not yet in `dataset` is inserted with only
    that flag.

    NOTE(review): a PMID first inserted here does not receive 0 for the flags of
    files processed before it, nor a 'code_urpp' value. This matches the previous
    behaviour; confirm whether those gaps should be back-filled.
    """
    # Sorted so the processing order does not depend on the order the OS lists files in.
    for filename in sorted(os.listdir(directory)):
        label = filename.split('.')[0]
        for record in dataset.values():
            record[label] = 0
        for pmid in _read_pmids(directory + '/' + filename):
            # setdefault replaces the former bare `except:` used to detect a missing PMID.
            dataset.setdefault(pmid, defaultdict())[label] = 1


# URPP rating: the score file name (up to its first '.') is the rating of the PMIDs it lists.
# If a PMID appears in several score files, the last file in sorted order wins.
score_dir = filepath + '/data/first_gen/score'
for filename in sorted(os.listdir(score_dir)):
    for pmid in _read_pmids(score_dir + '/' + filename):
        pharm_dataset[pmid] = defaultdict()
        pharm_dataset[pmid]['code_urpp'] = filename.split('.')[0]

# Same treatment for the three label families, in the original order.
for category in ('design', 'field', 'setting'):
    _flag_pmids_by_file(pharm_dataset, filepath + '/data/first_gen/' + category)

Query the PubMed eFetch API to obtain the pubmed data for each PMID (represented as Beautiful soup).
eFetch has a maximum of 3 queries per second so space out the queries using time.sleep

In [5]:
# Fetch the PubMed XML record of every PMID and store it, as a string, under 'pmdata'.
efetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

# Iterate over a snapshot of the keys because the records are modified inside the loop.
for pmid in tqdm(list(pharm_dataset)):
    params = {'db':'pubmed', 'id':pmid, 'retmode':'xml', 'tool':pubmed_credentials['pubmed_tool_name'], 'email':pubmed_credentials['pubmed_tool_email']}
    try:
        # The request is inside the try so that a network failure marks this PMID
        # as failed instead of aborting the whole loop; the timeout avoids hanging forever.
        r = requests.get(url=efetch_url, params=params, timeout=30)
        # Without this, an HTTP error page (e.g. rate limiting) would be stored as data.
        r.raise_for_status()
        # NOTE(review): no parser is named, so bs4 picks whichever one is installed.
        pharm_dataset[pmid]['pmdata'] = str(BeautifulSoup(r.content))
    except Exception as error:
        # `Exception` rather than a bare except, so Ctrl-C can still stop the loop.
        pharm_dataset[pmid]['pmdata'] = None
        print('ERROR on call for pmid {}'.format(pmid), repr(error))
    # Stay under the eFetch rate limit mentioned above.
    time.sleep(0.35)

  0%|          | 0/2478 [00:00<?, ?it/s]

Verify the data (number of successful PubMedArticleSet vs None)

In [6]:
# Count how many records carry fetched PubMed data versus a failed fetch (None).
print('Length of dataset: {}'.format(len(pharm_dataset)))
n_pubmed = 0
n_none = 0
for pmid, data in pharm_dataset.items():
  # Test the fetched payload: the record itself is a dict and is never None,
  # so comparing `data` to None could never report an error.
  if data.get('pmdata') is not None:
    n_pubmed += 1
  else:
    n_none += 1
print('Number of successfully retrieved PubMed sets: {}'.format(n_pubmed))
print('Number of errors: {}'.format(n_none))

Length of dataset: 2478
Number of successfully retrieved PubMed sets: 2478
Number of errors: 0


Split in train/val and test sets

In [7]:
# Hold out a random `test_set_ratio` share of the pharmacy PMIDs as the test set.
test_set_ratio = 0.2
# Fixed seed so that re-running the notebook on the same keys yields the same split.
split_seed = 42
ids_to_split = list(pharm_dataset.keys())
k_test = math.ceil(len(ids_to_split) * test_set_ratio)
# A set makes the membership test below O(1) instead of a scan of the sampled list.
test_ids = set(random.Random(split_seed).sample(ids_to_split, k_test))
pharm_ds_train = defaultdict()
pharm_ds_test = defaultdict()
for pmid, data in pharm_dataset.items():
  if pmid in test_ids:
    pharm_ds_test[pmid] = data
  else:
    pharm_ds_train[pmid] = data
print('Number of items in pharmacy dataset: {}'.format(len(pharm_dataset)))
print('Number of items in train/val dataset: {}'.format(len(pharm_ds_train)))
print('Number of items in test dataset: {}'.format(len(pharm_ds_test)))

Number of items in pharmacy dataset: 2478
Number of items in train/val dataset: 1982
Number of items in test dataset: 496


Save the datasets

In [8]:
# Pickle the full pharmacy dataset and both of its splits, one file per object.
for pickle_stem, payload in (
    ('pharm_dataset', pharm_dataset),
    ('pharm_ds_train', pharm_ds_train),
    ('pharm_ds_test', pharm_ds_test),
):
    with open(filepath + '/data/first_gen/' + pickle_stem + '.pkl', mode='wb') as file:
        pickle.dump(payload, file)

# Impact Pharmacie Query Dataset

In [None]:
# IMPACT PHARMACIE QUERIES
# "impact"[title] OR "effect"[title] OR "role"[title] OR "interventions"[title] AND ("pharmacists"[All Fields] OR "pharmacists"[title] OR "pharmacist"[title] OR "pharmacy"[title]) AND "2020/03/01 15.00"[MHDA] : "2021/05/12 15.00"[MHDA]
# search_query = 'impact[title] OR effect[title] OR role[title] OR interventions[title] AND (pharmacists[All Fields] OR pharmacists[title] OR pharmacist[title] OR pharmacy[title])'

# Query actually sent to PubMed: the OR of three pharmacy-related field searches.
search_terms = ['pharmacists[All Fields]', 'pharmacist[All Fields]', 'pharmacy[title]']
search_query = ' OR '.join(search_terms)

Get the results from the impact pharmacie query

In [None]:
# QUERY MODEL
# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&mindate=2016&maxdate=2019&retstart=0&retmax=100000

# Date window sent to eSearch as `mindate` / `maxdate`.
sampling_start_year = 1988
sampling_end_year = '2021/06/01'

params = {'db':'pubmed', 'term':search_query, 'mindate':sampling_start_year, 'maxdate':sampling_end_year, 'tool':pubmed_credentials['pubmed_tool_name'], 'email':pubmed_credentials['pubmed_tool_email']}
# Timeout so a stalled connection cannot hang the notebook.
r = requests.get(url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi', params=params, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
r.raise_for_status()
count_tag = BeautifulSoup(r.content).find('count')
if count_tag is None:
    # Clearer than the AttributeError that `None.get_text()` would raise.
    raise ValueError('eSearch response did not contain a <count> element')
# Kept as a string, as before; the paging cell converts it with int().
n_results = count_tag.get_text()
print(n_results)

45178


Get the PMIDs (max 100 000 per API call)

In [None]:
# Page through the eSearch results and collect every returned PMID as a string.
# NOTE(review): NCBI's current eSearch documentation may cap `retmax` (and the
# reachable `retstart`) for PubMed below 100 000 - confirm before re-running.
esearch_page_size = 100000
pmids = []
n_calls = math.ceil(int(n_results) / esearch_page_size)
for i in tqdm(range(n_calls)):
  params = {'db':'pubmed', 'term':search_query, 'mindate':sampling_start_year, 'maxdate':sampling_end_year, 'retstart':i*esearch_page_size, 'retmax':esearch_page_size, 'tool':pubmed_credentials['pubmed_tool_name'], 'email':pubmed_credentials['pubmed_tool_email']}
  r = requests.get(url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi', params=params, timeout=60)
  # Stop on an HTTP error rather than silently collecting zero ids from an error page.
  r.raise_for_status()
  results = BeautifulSoup(r.content)
  # `id_tag` instead of `id`, so the builtin id() is not shadowed for the rest of the session.
  pmids.extend(id_tag.get_text() for id_tag in results.find_all('id'))
  time.sleep(0.35)
print('Number of pmids retrieved: {}'.format(len(pmids)))

  0%|          | 0/1 [00:00<?, ?it/s]

Number of pmids retrieved: 45178


Check how many PMIDs are in Impact Pharmacie but not returned by the search query

In [None]:
# Impact Pharmacie PMIDs that the search query failed to return.
# Membership is tested against a set: scanning the `pmids` list once per key is quadratic.
returned_pmids = set(pmids)
pmids_not_returned = [pmid for pmid in pharm_dataset.keys() if pmid not in returned_pmids]
print('Number of PMIDs in Impact Pharmacie that are not returned by search query: {}'.format(len(pmids_not_returned)))
# sample() raises ValueError when asked for more items than exist, so cap the size at 5.
print('Sample of PMIDs not returned: {}'.format(sample(pmids_not_returned, min(5, len(pmids_not_returned)))))

Number of PMIDs in Impact Pharmacie that are not returned by search query: 70
Sample of PMIDs not returned: ['11575979', '23581450', '12298108', '10199962', '11119944']


Build the list of PMIDs returned by the query but not in Impact Pharmacie

In [None]:
# Keep, in query order, the returned PMIDs that are not keys of the pharmacy dataset.
returned_no_impact_pmids = list(filter(lambda pmid: pmid not in pharm_dataset, pmids))

Verify that the list has no Impact Pharmacie PMIDs (must output True)

In [None]:
# Report both sizes, then print True only if the two PMID collections share no element.
print('Number of PMIDs in pharmacy dataset: {}'.format(len(pharm_dataset)))
print('Number of PMIDs in returned no impact dataset: {}'.format(len(returned_no_impact_pmids)))

set_pharm = set(pharm_dataset)
set_noretimpact = set(returned_no_impact_pmids)
print(len(set_pharm & set_noretimpact) == 0)

Number of PMIDs in pharmacy dataset: 2473
Number of PMIDs in returned no impact dataset: 42775
True


Build the dataset

In [None]:
# Fetch the PubMed XML record of every non-Impact PMID and store it under 'pmdata'.
efetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

noimpact_dataset = defaultdict()
for pmid in tqdm(returned_no_impact_pmids):
  noimpact_dataset[pmid] = defaultdict()
  params = {'db':'pubmed', 'id':pmid, 'retmode':'xml', 'tool':pubmed_credentials['pubmed_tool_name'], 'email':pubmed_credentials['pubmed_tool_email']}
  try:
      # The request is inside the try so that a network failure marks this PMID
      # as failed instead of aborting the whole loop; the timeout avoids hanging forever.
      r = requests.get(url=efetch_url, params=params, timeout=30)
      # Without this, an HTTP error page (e.g. rate limiting) would be stored as data.
      r.raise_for_status()
      # NOTE(review): no parser is named, so bs4 picks whichever one is installed.
      noimpact_dataset[pmid]['pmdata'] = str(BeautifulSoup(r.content))
  except Exception as error:
      # `Exception` rather than a bare except, so Ctrl-C can still stop this long loop.
      noimpact_dataset[pmid]['pmdata'] = None
      print('ERROR on call for pmid {}'.format(pmid), repr(error))
  # Pause between calls to space out the eFetch requests.
  time.sleep(0.35)

  0%|          | 0/42775 [00:00<?, ?it/s]

Verify

In [None]:
# Count how many records carry fetched PubMed data versus a failed fetch (None).
print('Length of dataset: {}'.format(len(noimpact_dataset)))
n_pubmed = 0
n_none = 0
for pmid, data in noimpact_dataset.items():
  # Test the fetched payload: the record itself is a dict and is never None,
  # so comparing `data` to None could never report an error.
  if data.get('pmdata') is not None:
    n_pubmed += 1
  else:
    n_none += 1
print('Number of successfully retrieved PubMed sets: {}'.format(n_pubmed))
print('Number of errors: {}'.format(n_none))

Length of dataset: 42775
Number of successfully retrieved PubMed sets: 42775
Number of errors: 0


Split

In [None]:
# Hold out a random `test_set_ratio` share of the non-Impact PMIDs as the test set.
test_set_ratio = 0.2
# Fixed seed so that re-running the notebook on the same keys yields the same split.
split_seed = 42
ids_to_split = list(noimpact_dataset.keys())
k_test = math.ceil(len(ids_to_split) * test_set_ratio)
# A set makes the membership test below O(1); with tens of thousands of ids the
# former list lookup made this loop quadratic.
test_ids = set(random.Random(split_seed).sample(ids_to_split, k_test))
noimpact_ds_train = defaultdict()
noimpact_ds_test = defaultdict()
for pmid, data in noimpact_dataset.items():
  if pmid in test_ids:
    noimpact_ds_test[pmid] = data
  else:
    noimpact_ds_train[pmid] = data
print('Number of items in no impact dataset: {}'.format(len(noimpact_dataset)))
print('Number of items in train/val dataset: {}'.format(len(noimpact_ds_train)))
print('Number of items in test dataset: {}'.format(len(noimpact_ds_test)))

Number of items in no impact dataset: 42775
Number of items in train/val dataset: 34220
Number of items in test dataset: 8555


Save

In [None]:
# Pickle the full no-impact dataset and both of its splits, one file per object.
for pickle_stem, payload in (
    ('noimpact_dataset', noimpact_dataset),
    ('noimpact_ds_train', noimpact_ds_train),
    ('noimpact_ds_test', noimpact_ds_test),
):
    with open(filepath + '/data/first_gen/' + pickle_stem + '.pkl', mode='wb') as file:
        pickle.dump(payload, file)

# Convert to dataframes

Load the datasets

In [9]:
def load_dataset(filepath):
  """Unpickle and return the object stored in the file at `filepath`.

  Only use this on pickles produced by this notebook: unpickling untrusted
  files can execute arbitrary code.
  """
  with open(filepath, mode='rb') as pickled:
    return pickle.load(pickled)

# Reload the four train/test pickles written by the earlier cells, in the same order.
first_gen_dir = filepath + '/data/first_gen/'
pharm_train_ds, pharm_test_ds, noimpact_train_ds, noimpact_test_ds = [
    load_dataset(first_gen_dir + pickle_stem + '.pkl')
    for pickle_stem in ('pharm_ds_train', 'pharm_ds_test', 'noimpact_ds_train', 'noimpact_ds_test')
]

Process the PubMed Data into text strings containing the title and abstract

In [10]:
# For every record, derive 'pubyear' and 'text' (title followed by the labelled
# abstract sections) from the stored PubMed XML. Records are updated in place.
for dataset in [pharm_train_ds, pharm_test_ds, noimpact_train_ds, noimpact_test_ds]:
  for pmid, data in dataset.items():
    if data['pmdata'] is None:
      # The fetch failed for this PMID: no year and no text. 'pubyear' is set here too
      # so that every record ends up with the same keys.
      data['pubyear'] = None
      data['text'] = ''
      continue

    # NOTE(review): no parser is named, so bs4 picks whichever one is installed.
    element_pmdata = BeautifulSoup(data['pmdata'])

    # Year of the first <pubdate> that actually has a <year> child. Pubdates without
    # one are skipped instead of aborting the scan through a swallowed exception.
    year_tags = (pubdate.find('year') for pubdate in element_pmdata.find_all('pubdate'))
    pubyear = [year_tag.get_text() for year_tag in year_tags if year_tag is not None]
    data['pubyear'] = pubyear[0] if pubyear else None

    # First article title, or '' when the record has none.
    title_tag = element_pmdata.find('articletitle')
    title = title_tag.get_text() if title_tag is not None else ''

    # Each abstract section is rendered as "<label> <text>"; a missing label becomes ''.
    abstract_tags = element_pmdata.find_all('abstracttext')
    labelsandtext = ' '.join(
      ' '.join([abstract_tag.get('label', ''), abstract_tag.get_text()])
      for abstract_tag in abstract_tags
    )
    data['text'] = title + ' ' + labelsandtext

Verify the data

In [11]:
# For each split, report its size and how many records have a non-empty 'text'.
datasets_to_check = {
  'Pharmacy train dataset': pharm_train_ds,
  'Pharmacy test dataset': pharm_test_ds,
  'No impact train dataset': noimpact_train_ds,
  'No impact test dataset': noimpact_test_ds,
}
for dataset_name, dataset in datasets_to_check.items():
  print('Verification of dataset: {}'.format(dataset_name))
  print('Number of elements in dataset: {}'.format(len(dataset)))
  # '' and ' ' are the two values treated as "no text".
  text_data_available = sum(1 for data in dataset.values() if data['text'] not in ('', ' '))
  print('Number of elements with text data: {}'.format(text_data_available))
  print('')

Verification of dataset: Pharmacy train dataset
Number of elements in dataset: 1982
Number of elements with text data: 1982

Verification of dataset: Pharmacy test dataset
Number of elements in dataset: 496
Number of elements with text data: 496

Verification of dataset: No impact train dataset
Number of elements in dataset: 34220
Number of elements with text data: 34191

Verification of dataset: No impact test dataset
Number of elements in dataset: 8555
Number of elements with text data: 8547



Exclude elements with no text

In [12]:
def filter_dataset(dataset):
  """Return a new mapping holding only the records of `dataset` that have text.

  A record is dropped when its 'text' value is a string that is empty or made
  only of whitespace. This still covers the '' and ' ' cases and also catches
  longer whitespace runs, e.g. an empty title joined to an empty abstract.

  Args:
    dataset: mapping of key -> record, where each record has a 'text' entry.

  Returns:
    A defaultdict (without default factory) holding the kept records, in their
    original order. The record objects are shared with `dataset`, not copied.
  """
  return_dataset = defaultdict()
  for key, data in dataset.items():
    text = data['text']
    # Only strings are tested for blankness; any other value is kept, as before.
    if isinstance(text, str) and not text.strip():
      continue
    return_dataset[key] = data
  return return_dataset


# Replace each split by its filtered version (records without text removed).
pharm_train_ds, pharm_test_ds, noimpact_train_ds, noimpact_test_ds = [
  filter_dataset(split)
  for split in (pharm_train_ds, pharm_test_ds, noimpact_train_ds, noimpact_test_ds)
]

# After filtering, both counts printed for a split are expected to match.
filtered_datasets = {
  'Pharmacy train dataset': pharm_train_ds,
  'Pharmacy test dataset': pharm_test_ds,
  'No impact train dataset': noimpact_train_ds,
  'No impact test dataset': noimpact_test_ds,
}
for dataset_name, dataset in filtered_datasets.items():
  print('Verification of dataset: {}'.format(dataset_name))
  print('Number of elements in dataset: {}'.format(len(dataset)))
  text_data_available = sum(1 for data in dataset.values() if data['text'] != '')
  print('Number of elements with text data: {}'.format(text_data_available))
  print('')

Verification of dataset: Pharmacy train dataset
Number of elements in dataset: 1982
Number of elements with text data: 1982

Verification of dataset: Pharmacy test dataset
Number of elements in dataset: 496
Number of elements with text data: 496

Verification of dataset: No impact train dataset
Number of elements in dataset: 34191
Number of elements with text data: 34191

Verification of dataset: No impact test dataset
Number of elements in dataset: 8547
Number of elements with text data: 8547



In [13]:
# Build one dataframe per split (index = PMID), label it, then merge and shuffle.
# Fixed seed so the row order of the shuffled frames is reproducible.
shuffle_seed = 42

# Both pharmacy frames use the keys of the first training record as columns,
# so that train and test share the same column layout.
pharm_columns = list(list(pharm_train_ds.values())[0].keys())

pharm_train_df = pd.DataFrame.from_dict(pharm_train_ds, orient='index', columns=pharm_columns)
pharm_train_df['eligible_impact'] = 1

pharm_test_df = pd.DataFrame.from_dict(pharm_test_ds, orient='index', columns=pharm_columns)
pharm_test_df['eligible_impact'] = 1

# The no-impact frames keep only three columns; 'code_urpp' is then set to 0 for every row.
noimpact_train_df = pd.DataFrame.from_dict(noimpact_train_ds, orient='index', columns=['code_urpp','text','pubyear'])
noimpact_train_df['eligible_impact'] = 0
noimpact_train_df['code_urpp'] = 0

noimpact_test_df = pd.DataFrame.from_dict(noimpact_test_ds, orient='index', columns=['code_urpp','text','pubyear'])
noimpact_test_df['eligible_impact'] = 0
noimpact_test_df['code_urpp'] = 0

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_df = pd.concat([pharm_train_df, noimpact_train_df])
train_df = shuffle(train_df, random_state=shuffle_seed)
test_df = pd.concat([pharm_test_df, noimpact_test_df])
test_df = shuffle(test_df, random_state=shuffle_seed)

Verify the dataframes

In [14]:
# Show the first five rows of the shuffled, combined training frame.
train_df.head(5)

Unnamed: 0,code_urpp,casecontrol,cohortpro,cohortretro,metaanalysis,prepost,quasirando,rando,sysreview,anticoag,...,transplant,vaccin,ambulatory,community,hospital,mixed,pmdata,pubyear,text,eligible_impact
27496275,0,,,,,,,,,,...,,,,,,,,2016.0,Are state laws granting pharmacists authority ...,0
20484206,0,,,,,,,,,,...,,,,,,,,2010.0,New USP chapter addresses physical environment...,0
26131046,c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",,Pattern of drug therapy problems and intervent...,1
31532503,0,,,,,,,,,,...,,,,,,,,2019.0,Pharmacokinetic considerations in pediatric ph...,0
29069531,0,,,,,,,,,,...,,,,,,,,2018.0,Factors associated with the variability of cal...,0


In [15]:
# Number of training rows per publication year, most frequent first.
train_df['pubyear'].value_counts()

2020    2808
2019    2332
2018    2044
2016    1791
2017    1784
2021    1710
2015    1569
2014    1544
2013    1308
2012    1265
2011    1145
2010    1028
2009     918
2008     825
2007     816
2006     698
2005     624
2003     559
2004     516
2002     475
2000     455
2001     443
1994     434
1996     415
1995     409
1999     401
1992     389
1998     380
1993     371
1997     332
1991     324
1990     322
1989     315
1988     262
1987      16
Name: pubyear, dtype: int64

In [16]:
# Show the last five rows of the shuffled, combined training frame.
train_df.tail(5)

Unnamed: 0,code_urpp,casecontrol,cohortpro,cohortretro,metaanalysis,prepost,quasirando,rando,sysreview,anticoag,...,transplant,vaccin,ambulatory,community,hospital,mixed,pmdata,pubyear,text,eligible_impact
1442556,0,,,,,,,,,,...,,,,,,,,1992,Choosing condoms. Condoms for men are current...,0
34019631,0,,,,,,,,,,...,,,,,,,,2021,"When difficult conversations occur, will you b...",0
25068718,0,,,,,,,,,,...,,,,,,,,2014,Abstracts from the Academy of Managed Care Pha...,0
20652830,0,,,,,,,,,,...,,,,,,,,2010,Texas pharmacists' opinions on reporting serio...,0
17163274,c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2006,Drug-related problems in the community setting...,1


In [17]:
# Show the first five rows of the shuffled, combined test frame.
test_df.head(5)

Unnamed: 0,code_urpp,casecontrol,cohortpro,cohortretro,metaanalysis,prepost,quasirando,rando,sysreview,anticoag,...,transplant,vaccin,ambulatory,community,hospital,mixed,pmdata,pubyear,text,eligible_impact
24856776,0,,,,,,,,,,...,,,,,,,,2014,European Cystic Fibrosis Society Standards of ...,0
29908686,0,,,,,,,,,,...,,,,,,,,2018,Advanced practice nursing: Nutrition Nurse Spe...,0
32203965,0,,,,,,,,,,...,,,,,,,,2020,Alterations of the HPA Axis Observed in Patien...,0
23140593,0,,,,,,,,,,...,,,,,,,,2012,Main challenges facing the pharmaceutical sect...,0
31157099,0,,,,,,,,,,...,,,,,,,,2019,eHealth and mHealth. Both electronic health (...,0


In [18]:
# Show the last five rows of the shuffled, combined test frame.
test_df.tail(5)

Unnamed: 0,code_urpp,casecontrol,cohortpro,cohortretro,metaanalysis,prepost,quasirando,rando,sysreview,anticoag,...,transplant,vaccin,ambulatory,community,hospital,mixed,pmdata,pubyear,text,eligible_impact
28891393,0,,,,,,,,,,...,,,,,,,,2018,Improving Ambulatory Care Resident Training: P...,0
32695426,0,,,,,,,,,,...,,,,,,,,2020,The impact of a multifaceted intervention to r...,0
30636511,c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2020,Implementation of an Advanced Pharmacy Practic...,1
16775903,0,,,,,,,,,,...,,,,,,,,2006,Healthcare professionals' perceptions of the b...,0
1882873,0,,,,,,,,,,...,,,,,,,,1991,Effect of pharmacist participation on a medica...,0


In [19]:
# Show the first five rows of the pharmacy-only training frame (not shuffled).
pharm_train_df.head(5)

Unnamed: 0,code_urpp,casecontrol,cohortpro,cohortretro,metaanalysis,prepost,quasirando,rando,sysreview,anticoag,...,transplant,vaccin,ambulatory,community,hospital,mixed,pmdata,pubyear,text,eligible_impact
32295736,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0,0,1,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2021,A systematic review of the role of pharmacists...,1
33817821,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2021,Impact of educational intervention by communit...,1
33880786,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0,0,1,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2021,Pharmacist-led interventions to reduce adverse...,1
33965357,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2021,Impact of community-pharmacist-led medication ...,1
33386509,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1,0,0,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2021,The Role of Pharmacies in the HIV Prevention a...,1


In [20]:
# Number of pharmacy training rows per publication year, most frequent first.
pharm_train_df['pubyear'].value_counts()

2014    177
2018    161
2013    151
2019    107
2015    105
2012    102
2016     99
2020     94
2017     89
2011     78
2010     66
2021     61
2009     52
2007     48
2008     45
2004     39
2003     37
2005     35
2006     32
2001     30
2000     26
1999     17
2002     16
1996      9
1993      8
1998      7
1992      5
1997      5
1994      5
1990      4
1995      3
1991      3
1988      1
Name: pubyear, dtype: int64

In [21]:
# Show the first five rows of the pharmacy-only test frame (not shuffled).
pharm_test_df.head(5)

Unnamed: 0,code_urpp,casecontrol,cohortpro,cohortretro,metaanalysis,prepost,quasirando,rando,sysreview,anticoag,...,transplant,vaccin,ambulatory,community,hospital,mixed,pmdata,pubyear,text,eligible_impact
32819880,a,0.0,0.0,0.0,0.0,0,0,0,1,0,...,0,0,0,0,0,0,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2021,Economic impact of clinical pharmaceutical act...,1
33918990,a,0.0,0.0,0.0,0.0,0,0,0,1,0,...,0,0,0,1,0,0,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2021,Impact of Pharmacists' Interventions and Patie...,1
32107837,a,0.0,0.0,0.0,1.0,0,0,0,0,0,...,0,0,0,0,0,1,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2020,Effect of pharmacist-led interventions on medi...,1
32067555,a,0.0,0.0,0.0,0.0,0,0,0,1,0,...,0,0,0,0,0,1,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2021,Pharmacist Interventions in the Management of ...,1
32022107,a,0.0,0.0,0.0,1.0,0,0,0,0,0,...,0,0,0,0,0,1,"<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArticl...",2020,Pharmaceutical care-based interventions in typ...,1


Save

In [22]:
# Pickle the two combined frames and the two pharmacy-only frames, one file per frame.
for frame_stem, frame in (
    ('train_df', train_df),
    ('test_df', test_df),
    ('pharm_train_df', pharm_train_df),
    ('pharm_test_df', pharm_test_df),
):
    frame.to_pickle(filepath + '/data/first_gen/' + frame_stem + '.pkl')