In [2]:
import pandas as pd

In [3]:
%cd ../data/covidence
from pathlib import Path
paths = list(Path('.').glob('*/full.csv'))
reviews = [path.parts[0] for path in paths]

/home/tom/msc/diss/experiments/data/covidence


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [25]:
from pathlib import Path

def describe_data():
    datasets = []
    for path in Path('.').glob('*/full.csv'):
        df = pd.read_csv(path)
        datasets.append({
            'review': path.parts[0],
            'items': len(df),
            'pos_items': df.label.sum(),
            'pos_frac': df.label.sum() / len(df),
        })
    return pd.DataFrame(datasets)

describe_data()

Unnamed: 0,review,items,pos_items,pos_frac
0,review_378562,643,98,0.152411
1,review_240084,599,130,0.217028
2,review_334317,4365,126,0.028866
3,review_287708,1360,256,0.188235
4,review_121733,3707,1373,0.37038
5,review_288055,1381,26,0.018827
6,review_165805,2292,239,0.104276
7,review_117787,1330,142,0.106767
8,review_258698,6976,213,0.030533


In [5]:
def update_review_criteria():
    df = pd.read_csv('reviews.csv', index_col=0)
    for path in Path('.').glob('*/standard_criteria.txt'):
        with path.open() as crit_file:
            criteria = crit_file.read()
        review = int(path.parts[0].split('_')[1])
        df.loc[review, 'criteria'] = criteria.strip()
    df.to_csv('reviews.csv')

update_review_criteria()


In [6]:
import fasttext
import numpy as np

from huggingface_hub import hf_hub_download

# monkey patch to remove annoying warning
fasttext.FastText.eprint = lambda x: None

model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)
#out = model.predict(["this wasn't meant for you anyway", 'you know'], k=5)

def is_english(text, threshold=0.01, k=4):
    preds = model.predict(text, threshold=threshold, k=k)[0]
    return np.array([
        '__label__eng_Latn' in pred for pred in preds
    ])

In [7]:
def simplify_columns(df):
    df = df[['orig_index', 'Title', 'Abstract', 'label', 'review_id']]
    return df.rename(columns={'Title': 'title', 'Abstract': 'abstract'})

In [8]:
def load_papers(review_path):
    included = pd.read_csv(review_path + '/' + 'included.csv')
    included['label'] = 1
    excluded = pd.read_csv(review_path + '/' + 'excluded.csv')
    excluded['label'] = 1
    irrelevant = pd.read_csv(review_path + '/' + 'irrelevant.csv')
    irrelevant['label'] = 0
    for frame in (included, excluded, irrelevant):
        frame['orig_index'] = frame.index
    
    full_df = pd.concat([included, excluded, irrelevant])
    full_df = full_df.reset_index()
    return full_df

In [9]:
def find_duplicates(df):
    title_counts = df.groupby('Title').count()
    dup_titles = title_counts[title_counts.Authors > 1]
    return df[df.Title.isin(dup_titles.index)].sort_values('Title')

In [10]:
def remove_no_abstract(df):
    orig_len = len(df)
    filtered = df[(df.Abstract != '') & (~df.Abstract.isnull())]
    print(f'{orig_len - len(filtered)} rows removed')
    return filtered

In [11]:
def remove_not_english(df):
    orig_len = len(df)
    is_eng = is_english(df.Abstract.to_list())
    return df[is_eng], df[~is_eng]

### Cleaning each review dataset

In [20]:
df = load_papers('review_288055')
df = remove_no_abstract(df)
# gave lots of false positives, just remove one german one
# df, df_non_eng = remove_not_english(df)
df = df.drop(1540)
# one duplicate
# find_duplicates(df)
df = df.drop(1532)
df['review_id'] = 288055
simplify_columns(df).to_csv('review_288055/full.csv')

1057 rows removed


In [141]:
df = load_papers('review_258698')
df = remove_no_abstract(df)

# 466 non-eng!
#df, df_non_eng = remove_not_english(df)
#df_non_eng.to_csv('review_258698/non_eng.csv')

find_duplicates(df)
#dups_to_remove = [3017, 830, 2577, 3233, 4782, 4133]
df = df.drop(dups_to_remove)
df['review_id'] = 258698
simplify_columns(df).to_csv('review_258698/full.csv')

57 rows removed


Unnamed: 0,index,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags,label,orig_index,preproc_tag
3016,2794,Development of an immunochromatographic strip ...,"Ji, Fang; Mokoena, Mduduzi P.; Zhao, Hongyan; ...",A colloidal gold (ICS) test was developed for ...,2017.0,,PLoS ONE,12.0,5,,,https://dx.doi.org/10.1371/journal.pone.0175282,,#2898,Ji 2017,,,0,2794,
3017,2795,Development of an immunochromatographic strip ...,"Ji, F.; Mokoena, M. P.; Zhao, H.; Olaniran, A....",A colloidal gold (ICS) test was developed for ...,2019.0,,Toxicon,158.0,Supplement 1,,,https://dx.doi.org/10.1016/j.toxicon.2018.10.228,,#2899,Ji 2019,,,0,2795,
829,607,Epidemiology of gout and hyperuricemia in New ...,"Bardin, Thomas; Magnat, Elodie; Clerson, Pierr...",Objectives: New Caledonia is a Pacific island ...,2021.0,,Joint Bone Spine,88.0,5,,,10.1016/j.jbspin.2021.105286,,#632,Bardin 2021,,,0,607,
830,608,Epidemiology of gout and hyperuricemia in New ...,"Bardin, Thomas; Magnat, Elodie; Clerson, Pierr...",OBJECTIVES: New Caledonia is a Pacific island ...,2022.0,,Joint Bone Spine,89.0,2,,,https://dx.doi.org/10.1016/j.jbspin.2021.105286,,#633,Bardin 2022,,,0,608,
620,398,Experience of pediatric rapid response team in...,"Anwar ul, Haque; Saleem, Ali Faisal; Zaidi, Sa...",OBJECTIVE: To report our experience before and...,2010.0,,Indian Journal of Pediatrics,77.0,3,,,https://dx.doi.org/10.1007/s12098-010-0032-2,,#418,Anwarul 2010,,,0,398,
2577,2355,Experience of pediatric rapid response team in...,"Haque, A.",Introduction: The concept of rapid response te...,2009.0,,Critical Care Medicine,37.0,12 SUPPL.,,,https://dx.doi.org/10.1097/01.ccm.0000365439.1...,,#2438,Haque 2009,,,0,2355,
3232,3010,Medical and substance use comorbidity in bipol...,"Kemp, D. E.; Gao, K.; Ganocy, S. J.; Caldes, E...",Objective: National Comorbidity Survey data in...,2009.0,,Journal of Affective Disorders,116.0,1-2,,,10.1016/j.jad.2008.11.011,,#3122,Kemp 2009,,,0,3010,
3233,3011,Medical and substance use comorbidity in bipol...,"Kemp, D. E.; Gao, K.; Ganocy, S. J.; Caldes, E...",Objective: National Comorbidity Survey data in...,2010.0,,Psiquiatria Biologica,17.0,4,,,10.1016/j.psiq.2010.12.002,,#3123,Kemp 2010,,,0,3011,
4781,4559,Non-protective immunity against tetanus in pri...,"Orimadegun, Adebola Emmanuel; Orimadegun, Bose...",Introduction: Nigeria remains among the few co...,2008.0,,Pan Afr. med. j,,,,,,,#4731,Orimadegun 2008,,,0,4559,
4782,4560,Non-protective immunity against tetanus in pri...,"Orimadegun, Adebola Emmanuel; Orimadegun, Bose...",INTRODUCTION: Nigeria remains among the few co...,2017.0,,The Pan African medical journal,27.0,Suppl 3,,,https://dx.doi.org/10.11604/pamj.supp.2017.27....,,#4732,Orimadegun 2017,,,0,4560,


In [22]:
df = load_papers('review_287708')
df = remove_no_abstract(df)
# no dups
# find_duplicates(df)
df['review_id'] = 287708
simplify_columns(df).to_csv('review_287708/full.csv')

46 rows removed


In [23]:
df = load_papers('review_117787')
df = remove_no_abstract(df)

# none flagged as non-english
# df, df_non_eng = remove_not_english(df)

# one duplicate
#find_duplicates(df)
df = df.drop(543)

df['review_id'] = 117787
simplify_columns(df).to_csv('review_117787/full.csv')

169 rows removed


In [139]:
df = load_papers('review_378562')
df = remove_no_abstract(df)

# df, df_non_eng = remove_not_english(df)

# one duplicate
# find_duplicates(df)
df = df.drop(540)

df['review_id'] = 378562
simplify_columns(df).to_csv('review_378562/full.csv')

83 rows removed


In [154]:
df = load_papers('review_334317')
df = remove_no_abstract(df)

df, df_non_eng = remove_not_english(df)

find_duplicates(df)
dup_ids = [1439, 3638, 2378, 3698, 2313, 2268, 739, 2856, 3453, 3424, 3866, 3323, 3206, 3671, 1008,
           3926, 3947, 82, 3527, 2683, 2917, 2643, 3680, 3347]
df = df.drop(dup_ids)

df['review_id'] = 334317
simplify_columns(df).to_csv('review_334317/full.csv')

77 rows removed


In [32]:
df = load_papers('review_165805')
df = remove_no_abstract(df)

df, df_non_eng = remove_not_english(df)

dup_ids = [294, 1371, 1043, 537, 300, 2362, 95, 1074, 1923, 2001, 2091, 2616, 2202]
df = df.drop(dup_ids)

df['review_id'] = 165805
simplify_columns(df).to_csv('review_165805/full.csv')

338 rows removed


In [20]:
df = load_papers('review_240084')
df = remove_no_abstract(df)

# only one, which has abstract in 3 langs
#df, df_non_eng = remove_not_english(df)

df['review_id'] = 240084
simplify_columns(df).to_csv('review_240084/full.csv')

19 rows removed


### Create various splits of the datasets

Create cut down datasets for 0-shot evaluation -- we want to have as many examples of positive and negative as possible up to a reasonable threshold, let's say 300.

Note we are not shuffling!

In [21]:
reviews = ['review_240084']

for review in reviews:
    df = pd.read_csv(f'{review}/full.csv', index_col=0)
    pos = df[df.label == 1]
    neg = df[df.label == 0]
    if len(pos) > 300:
        pos = pos.sample(300)
    if len(neg) > 300:
        neg = neg.sample(300)

    df = pd.concat([pos, neg])
    df.to_csv(f'{review}/zero-shot-eval.csv')
    

In [23]:
def make_splits(review_dir):
    df = pd.read_csv(review_dir + '/full.csv', index_col=0)
    
    rng = np.random.default_rng(24)
    df = df.sample(frac=1)
    
    # first take the validation sample
    cutoff = np.ceil(len(df) * 0.15).astype(int)
    val_split = df.iloc[:cutoff]
    train_split_1 = df.iloc[cutoff:]
    
    pos_train_count = train_split_1.label.sum()
    train_split_2 = pd.concat([
        train_split_1[train_split_1.label == 1],
        train_split_1[train_split_1.label == 0].sample(pos_train_count),
    ])
    
    print(f'Validation set: {len(val_split)} items')
    print(f'Train split 1: {len(train_split_1)} items ({pos_train_count} positive)')
    print(f'Train split 2: {len(train_split_2)} items ({train_split_2.label.sum()} positive)')
    
    np.savetxt(review_dir + '/val_split.txt', val_split.index.to_numpy())
    np.savetxt(review_dir + '/natural_train_split.txt', train_split_1.index.to_numpy())
    np.savetxt(review_dir + '/balanced_train_split.txt', train_split_2.index.to_numpy())

In [84]:
def make_cross_val_splits(review_dir):
    df = pd.read_csv(review_dir + '/full.csv', index_col=0)
    
    rng = np.random.default_rng(24)
    df = df.sample(frac=1)

    val_split_size = int(0.2 * len(df))
    for split_num, split_label in enumerate('ABCDE'):

        val_idxs = df.iloc[split_num * val_split_size : (split_num + 1) * val_split_size].index
        val_fold = df.loc[val_idxs]
        unbal_train_fold = df.drop(index=val_idxs)
        num_pos_items = unbal_train_fold.label.sum()
        bal_train_fold = pd.concat([
            unbal_train_fold[unbal_train_fold.label == 1],
            unbal_train_fold[unbal_train_fold.label == 0].sample(num_pos_items)
        ])
        np.savetxt(review_dir + f'/sm-val-fold-{split_label}.txt', val_idxs)
        #np.savetxt(review_dir + f'/train-fold-{split_label}.txt', bal_train_fold.index)

        sm_train_fold = unbal_train_fold.sample(196)
        np.savetxt(review_dir + f'/sm-train-fold-{split_label}.txt', sm_train_fold.index)

In [85]:
make_cross_val_splits('review_117787')
make_cross_val_splits('review_121733')
make_cross_val_splits('review_165805')
make_cross_val_splits('review_258698')
make_cross_val_splits('review_287708')
make_cross_val_splits('review_334317')
make_cross_val_splits('review_378562')

In [58]:
def load_fold(name):
    df = pd.read_csv('review_121733/full.csv', index_col=0)
    idxs = np.genfromtxt('review_121733/' + name)
    return df.loc[idxs]

In [94]:
review = 121733
fold = 'A'

for review in (117787, 121733, 165805, 258698, 287708, 334317):
    for fold in 'ABCDE':
        df = pd.read_csv(f'review_{review}/full.csv', index_col=0)
        idx = np.genfromtxt(f'review_{review}/sm-train-fold-{fold}.txt')
        free_df = df.drop(index=idx)
        few_shot = pd.concat([
            free_df[free_df.label == 1].sample(25),
            free_df[free_df.label == 0].sample(25),
        ])
        unseen_val = free_df.drop(index=few_shot.index)
        np.savetxt(f'review_{review}/few-shot-fold-{fold}.txt', few_shot.index)
        np.savetxt(f'review_{review}/unseen-fold-{fold}.txt', unseen_val.index)


In [24]:
#make_splits('review_287708')
#make_splits('review_121733')
#make_splits('review_258698')
#make_splits('review_378562')
#make_splits('review_334317')
#make_splits('review_165805')
make_splits('review_240084')

Validation set: 90 items
Train split 1: 509 items (116 positive)
Train split 2: 232 items (116 positive)


In [7]:
# Make full sets that are not quite full sets for reviews that are really big
import numpy as np

df = pd.read_csv('review_121733/full.csv', index_col=0)    
df = df.sample(740)
np.savetxt('review_121733/trunc_full.txt', df.index.to_numpy())

df = pd.read_csv('review_258698/full.csv', index_col=0)    
df = df.sample(5200)
np.savetxt('review_258698/trunc_full.txt', df.index.to_numpy())

In [None]:
# Make "few-shot" sets for the case when we have a few labelled samples from the target review
import numpy as np

for review in (117787, 121733, 165805, 240084, 258698, 287708, 334317, 378562):
    df = pd.read_csv(f'review_{review}/full.csv', index_col=0)
    few_shot_df = pd.concat([
        df[df.label == 1].sample(25),
        df[df.label == 0].sample(25),
    ]).sample(frac=1)
    few_shot_df.to_csv(f'review_{review}/few_shot.csv')

In [125]:
# Make natural few shot sets
import numpy as np

for review in (117787, 121733, 165805, 240084, 258698, 287708, 334317, 378562):
    df = pd.read_csv(f'review_{review}/full.csv', index_col=0)
    few_shot_df = df.sample(50)
    pos_count = few_shot_df.label.sum()
    # we want at least 5 pos
    if pos_count >= 5:
        few_shot_df.to_csv(f'review_{review}/nat_few_shot.csv')
        continue
        
    pos_needed = 5 - pos_count
    few_shot_df = pd.concat([
        few_shot_df.iloc[:-pos_needed],
        df[df.label == 1].sample(pos_needed),
    ]).sample(frac=1)
    few_shot_df.to_csv(f'review_{review}/nat_few_shot.csv')

In [141]:
# retrospectively make complementary evaluation splits to the few shot splits
# for review in (117787, 121733, 165805, 240084, 258698, 287708, 334317, 378562):
for review in (121733, 258698):
    df = pd.read_csv(f'review_{review}/full.csv', index_col=0)
    few_shot_df = pd.read_csv(f'review_{review}/few_shot.csv', index_col=0)
    remain = df.drop(index=few_shot_df.index)
    if review == 121733:
        remain = remain.sample(740)
    if review == 258698:
        remain = remain.sample(5200)
    np.savetxt(f'review_{review}/few_shot_complement.txt', remain.index)

In [142]:
# retrospectively make complementary datasets to the natural few shots
for review in (117787, 121733, 165805, 240084, 258698, 287708, 334317, 378562):
    df = pd.read_csv(f'review_{review}/full.csv', index_col=0)
    few_shot_df = pd.read_csv(f'review_{review}/nat_few_shot.csv', index_col=0)
    remain = df.drop(index=few_shot_df.index)
    if review == 121733:
        remain = remain.sample(740)
    if review == 258698:
        remain = remain.sample(5200)
    np.savetxt(f'review_{review}/nat_few_shot_complement.txt', remain.index)

In [274]:
# make a smaller multi-review evaluation subset
sets = []
for review in (117787, 121733, 165805, 258698, 287708, 334317, 378562):
    df = pd.read_csv(f'review_{review}/full.csv', index_col=0)
    df['df_index'] = df.index
    idx = np.genfromtxt(f'review_{review}/nat_few_shot_complement.txt')
    eval_set = df.loc[idx]
    subset = eval_set.sample(60)
    sets.append(subset)
    extra_pos = eval_set[eval_set.label == 1].sample(5)
    sets.append(extra_pos)
    
mr_subset = pd.concat(sets).sample(frac=1)
mr_subset.label.sum()

98

In [259]:
# Set of items to use for in-context examples

sets = []
reviews = np.array([117787, 121733, 165805, 258698, 287708, 334317, 378562])
for review in reviews:
    df = pd.read_csv(f'review_{review}/nat_few_shot.csv', index_col=0)
    sets.append(df)

all_fs_train = pd.concat(sets).sample(frac=1)
np.random.shuffle(reviews)
signs = [0, 1, 0, 1]
train_idxs = []
for rev_id, sign in zip(reviews[:4], signs):
    train_idxs.append(all_fs_train[
        (all_fs_train.review_id == rev_id)
        & (all_fs_train.label == sign)
    ].index[0])

train_idxs

examples_set = all_fs_train.loc[train_idxs]
examples_set

Unnamed: 0,orig_index,title,abstract,label,review_id
614,440,Disease burden of community acquired pneumonia...,Background: To obtain the baseline data on the...,0,117787
105,61,Perceptions of suicide and their impact on pol...,"In recent years, there has been an increase in...",1,334317
1706,1706,A stable filamentous coaxial microelectrode fo...,"By using a facile spray drying method, we fabr...",0,121733
118,106,Clinical characteristics and severity of COVID...,Resumen Introducción: Se ha señalado que facto...,1,165805


In [260]:
examples_set.to_csv('multi_review_examples.csv')

In [275]:
mr_subset[['review_id', 'df_index']].to_csv('multi_review_eval_subset.csv', index=False)

In [130]:
# the final test set
df = pd.read_csv('review_240084/full.csv', index_col=0)
few_shot_df = pd.read_csv('review_240084/nat_few_shot.csv', index_col=0)
remain = df.drop(index=few_shot_df.index)
np.savetxt(f'review_240084/nat_few_shot_complement.txt', remain.index)

In [131]:
few_shot_240 = pd.read_csv('review_240084/nat_few_shot.csv', index_col=0)

Unnamed: 0,orig_index,title,abstract,label,review_id
54,46,COVID-19 recovery from the community perspecti...,Background: People younger than 65 years livin...,1,240084
131,123,"Recovery, resilience and growth regimes under ...",This paper attempts to weigh into the debate o...,1,240084
320,184,Investigating the challenges and opportunities...,INTRODUCTION: Hospital admissions from COVID-1...,0,240084
252,116,COVID-19 impact on Surgical Training and Recov...,BACKGROUND: The COVID-19 pandemic has resulted...,0,240084
239,103,An Analytical Perspective on Pandemic Recovery,After implementing restrictions to curb the sp...,0,240084
285,149,A qualitative exploration of the impact of COV...,COVID-19 may have substantial impact on the me...,0,240084
596,460,Sustainable economic development in the Europe...,"The paper has two basic objectives. The first,...",0,240084
190,54,Headwinds across the Balkans: economic growth ...,This paper provides comparison of economic gro...,0,240084
206,70,One Piece of the Jigsaw for the Cancer Recover...,COVID-19 has forced governments to make drasti...,0,240084
473,337,Disaster Planning During SARS-CoV-2/COVID: One...,Disasters cause a major disruption to normal o...,0,240084


In [38]:
# split in half
for review in (117787, 121733, 165805, 240084, 258698, 287708, 288055, 334317, 378562):
    df = pd.read_csv(f'review_{review}/full.csv', index_col=0)

    if review in (121733, 258698):
        trunc_split = np.genfromtxt(f'review_{review}/trunc_full.txt')
        df = df.loc[trunc_split]
    
    df_pos = df[df.label == 1].sample(frac=1)
    df_neg = df[df.label == 0].sample(frac=1)
    
    num_pos = len(df_pos) // 2
    num_neg = len(df_neg) // 2

    df_a = pd.concat([df_pos.iloc[:num_pos], df_neg.iloc[:num_neg]])
    df_b = pd.concat([df_pos.iloc[num_pos:], df_neg.iloc[num_neg:]])

    np.savetxt(f'review_{review}/first_half.txt', df_a.index)
    np.savetxt(f'review_{review}/second_half.txt', df_b.index)

In [33]:
idx_a = np.genfromtxt('review_258698/first_half.txt')
idx_b = np.genfromtxt('review_258698/second_half.txt')

In [35]:
len(idx_a), len(idx_b)

(3487, 3489)

### Review 121733

In [89]:
# excluded based on title and abstract
title_abs_excl = pd.read_csv('review_121733/review_121733_irrelevant-after-Title&Abstract-screen.csv')
# conflicting decisions based on title and abstract
conflicts = pd.read_csv('review_121733/review_121733_Conflicts-at-Title&Abstract-screen.csv')

## sets below made it past title and abstract screen
# excluded based on full text
full_text_excl = pd.read_csv('review_121733/review_121733_excluded-after-fulltext-screen.csv')
# full text review not complete
incomplete = pd.read_csv('review_121733/review_121733_fulltext-screen-incomplete.csv')
# included based on full text
included = pd.read_csv('review_121733/review_121733_included-after-fulltext-screen.csv')



In [90]:
from enum import Enum

class Decision(Enum):
    EXCL = 0
    CONFL = 1
    FT_EXCL = 2
    FT_INCOMP = 3
    FT_INCL = 4

In [91]:
title_abs_excl['decision'] = Decision.EXCL
conflicts['decision'] = Decision.CONFL
full_text_excl['decision'] = Decision.FT_EXCL
incomplete['decision'] = Decision.FT_INCOMP
included['decision'] = Decision.FT_INCL

frames = [title_abs_excl, conflicts, full_text_excl, incomplete, included]

In [92]:
full_df = pd.concat(frames)
full_df['orig_index'] = full_df.index
full_df = full_df.reset_index()
full_df

Unnamed: 0,index,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags,decision,orig_index
0,0,Simultaneous removal of tetracycline and disin...,"Zuo, S.; Zhang, Y.; Ren, G.; Pan, Y.; Zhang, Q...",Pharmaceutical and personal care products as o...,2019.0,,Journal of Hazardous Materials,368,,771-777,2001546732,http://dx.doi.org/10.1016/j.jhazmat.2019.02.005,5202.0,#1,Zuo 2019,,,Decision.EXCL,0
1,1,A practical method for the calculation of liqu...,"Zuend, A.; Seinfeld, J. H.",Liquid mixtures containing a variety of organi...,2013.0,,Fluid Phase Equilib.,337,,201-213,WOS:000315325500027,10.1016/j.fluid.2012.09.034,8644.0,#2,Zuend 2013,,,Decision.EXCL,1
2,2,Toward better microbial safety of wheat sprout...,"Zudyte, B.; Luksiene, Z.",Sprouted seeds are gaining popularity worldwid...,2019.0,,Photochem. Photobiol. Sci.,18,10,2521-2530,WOS:000489635100014,10.1039/c9pp00157c,8643.0,#3,Zudyte 2019,,,Decision.EXCL,2
3,3,Functional collaboration of biofilm-cathode el...,"Zou, H.; Wang, Y.",A distinctive process (BCE-MFC) was developed ...,2019.0,,Environmental science and pollution research i...,26,22,23061-23069,628271164,http://dx.doi.org/10.1007/s11356-019-05617-w,5201.0,#4,Zou 2019,,,Decision.EXCL,3
4,4,Continuous synthesis of graphene sheets by spr...,"Zou, B.; Wang, X. X.; Huang, X. X.; Wang, J. N.",Graphene sheets (GNS) were synthesized continu...,2015.0,,Chemical Communications,51,4,741-744,600732879,http://dx.doi.org/10.1039/c4cc08197h,5200.0,#5,Zou 2015,,,Decision.EXCL,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3842,91,In Vitro Antibacterial Activity of Hydrogen Pe...,"Raval, Y. S.; Flurin, L.; Mohamed, A.; Greenwo...",Hydrogen peroxide (H2O2) and hypochlorous acid...,2021.0,,Antimicrobial Agents and Chemotherapy,65,5,,,10.1128/aac.01966-20,,#3680,Raval 2021,,"Setting - Clinical, human",Decision.FT_INCL,91
3843,92,Inactivation of bacteria causing soft rot dise...,"Song, Hyeyeon; Lee, Jae Yung; Lee, Hae-Won; Ha...","Slightly acidic electrolyzed water (SAEW), an ...",2021.0,,Food Control,128,,,,10.1016/j.foodcont.2021.108217,,#3739,Song 2021,,Setting - Food processing,Decision.FT_INCL,92
3844,93,Application of Neutral Electrolyzed Water on p...,"Torres-Rosales, E.; Rivera-Garcia, A.; Rosario...",Physicochemical and microbiological properties...,2020.0,,Scientific Reports,10,1,,,https://dx.doi.org/10.1038/s41598-020-76931-4,,#3766,Torres-Rosales 2020,,Setting - Food processing,Decision.FT_INCL,93
3845,94,Electrochemically activated solution as bladde...,"Weyler, M.; Jaekel, A.; Kirschner-Hermanns, R....",AIMS: Driven by increasing awareness of antibi...,2021.0,,Neurourology & Urodynamics,40,7,,,https://dx.doi.org/10.1002/nau.24745,,#3799,Weyler 2021,,"Setting - Clinical, human",Decision.FT_INCL,94


#### Check for duplicates

In [121]:
title_counts = full_df.groupby('Title').count()
dup_titles = title_counts[title_counts.Authors > 1]

full_df[full_df.Title.isin(dup_titles.index)].sort_values('Title')

Unnamed: 0,index,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags,decision,orig_index
2601,14,Are reduced concentrations of chlorine-based d...,"Kumar, J.; Cadnum, J.; Wong, Y. K. N.; Mana, T...","Background. Currently, sporicidal disinfectant...",2019.0,,Open Forum Infectious Diseases,6 (Supplement 2),,S439,630694123,http://dx.doi.org/10.1093/ofid/ofz360.1085,4153.0,#2177,Kumar 2019,Exclusion reason: Abstract only;,Setting - Water/Environmental treatment,Decision.FT_EXCL,14
3685,1077,Are reduced concentrations of chlorine-based d...,"Kumar, J. A.; Cadnum, J. L.; Jencson, A. L.; D...",Chlorine-based disinfectants are commonly used...,2020.0,,American Journal of Infection Control,48,4.0,,,https://dx.doi.org/10.1016/j.ajic.2019.08.027,,#3539,Kumar 2020,Gillian Clayton (2022-09-29 22:37:25)(Screen):...,"Setting - Clinical, human",Decision.FT_INCOMP,1077
498,498,Automated endoscope reprocessors,"Petersen, B. T.; Adler, D. G.; Chand, B.; Conw...",National consensus standards provide guidance ...,2009.0,,Gastrointestinal Endoscopy,69,4.0,771-776,354342103,http://dx.doi.org/10.1016/j.gie.2008.11.037,4541.0,#765,Petersen 2009,,,Decision.EXCL,498
1665,1665,Automated endoscope reprocessors,"Desilets, D.; Kaul, V.; Tierney, W. M.; Banerj...",National consensus standards provide guidance ...,2010.0,,Gastrointestinal Endoscopy,72,4.0,675-680,359707018,http://dx.doi.org/10.1016/j.gie.2010.06.019,3682.0,#2625,Desilets 2010,,,Decision.EXCL,1665
3605,997,"Compositions, methods and uses for cleaning, d...","Alimi, Hojabr; Prasad, Sridhar Govinda; De, Su...",The present specification discloses a composit...,2019.0,,,,,-,BCI:BCI201900886519,,,#3283,Alimi 2019,,Setting - Water/Environmental treatment,Decision.FT_INCOMP,997
3606,998,"Compositions, methods and uses for cleaning, d...","Alimi, Hojabr; Prasad, Sridhar Govinda; De, Su...",The present specification discloses a composit...,2020.0,,,,,-,BCI:BCI202000762625,,,#3284,Alimi 2020,Kate Homyer (2022-09-24 23:53:42)(Select): htt...,Setting - Water/Environmental treatment,Decision.FT_INCOMP,998
3132,524,Concerns and strategies for wastewater treatme...,"Kataki, S.; Chatterjee, S.; Vairale, M. G.; Sh...",Along with outbreak of the pandemic COVID-19 c...,2021.0,,"Resources, Conservation and Recycling",164 (no pagination),,,2007752849,http://dx.doi.org/10.1016/j.resconrec.2020.105156,4079.0,#1820,Kataki 2021,,"Setting - Clinical, human",Decision.FT_INCOMP,524
3681,1073,Concerns and strategies for wastewater treatme...,"Kataki, Sampriti; Chatterjee, Soumya; Vairale,...",Along with outbreak of the pandemic COVID-19 c...,2021.0,,Resources Conservation and Recycling,164,,,,10.1016/j.resconrec.2020.105156,,#3519,Kataki 2021,,,Decision.FT_INCOMP,1073
1598,1598,Functionalization of water as a nonthermal app...,"Esua, O. J.; Cheng, J. H.; Sun, D. W.",Meat and seafood products present a viable med...,2020.0,,Critical reviews in food science and nutrition,,,1-19,631362300,http://dx.doi.org/10.1080/10408398.2020.1735297,3749.0,#2520,Esua 2020,,,Decision.EXCL,1598
2556,142,Functionalization of water as a nonthermal app...,"Esua, Okon Johnson; Cheng, Jun-Hu; Sun, Da-Wen",Meat and seafood products present a viable med...,2021.0,,Critical Reviews in Food Science and Nutrition,61,3.0,,,10.1080/10408398.2020.1735297,,#3428,Esua 2021,,,Decision.CONFL,142


In [29]:
# terms that come up in notes for papers that we may need to remove from the dataset
note_search_terms = [
    'english',
    'abstract',
    'full text',
    'full article',
    'no full',
    'accident',
    'mistake',
    'access',
    'uoe',
]

def is_important_note(note):
    if pd.isna(note):
        return False
    note = note.lower()
    for term in note_search_terms:
        if term in note:
            return True
    return False
    

In [68]:
full_df[full_df.Notes.apply(is_important_note)].to_csv('/tmp/decision_notes.csv')

In [63]:
# ... manually remove some lines from spreadsheet ...
to_review = pd.read_csv('review_121733/decision_notes_to_review.csv')
to_review

Unnamed: 0,index,orig_index,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags,decision
0,439,439,Disinfection of crockery and unen by using ele...,"Vorobev, M. I.; Shlykov, V. I.",The structure of an electrolysor for electroly...,1962,,Zhurnal Mikrobiologii Epidemiologii i Immunobi...,,,9-14,281516414,,4980.0,#677,Vorobev 1962,Gillian Clayton (2021-01-18 21:11:12)(Screen):...,,Decision.EXCL
1,443,443,Infection prevention and patient safety improv...,"Von Der Weid, D.",Introduction: The technology developed by Ante...,2015,,Antimicrobial Resistance and Infection Control...,4,SUPPL. 1,,72038880,,4978.0,#681,VonDerWeid 2015,Felicity Mehendale (2020-11-10 08:50:45)(Scree...,,Decision.EXCL
2,460,460,Sporicidal activity in diallyl disulphide oxid...,"Velazquez-Ramirez, C. I.; Scougall-Vilchis, R....",BACKGROUND: Ineffective instrument reprocessin...,2015,,American Journal of Infection Control,43 (6 Supplement 1),,S26-S27,614159542,,4963.0,#702,Velazquez-Ramirez 2015,Gillian Clayton (2020-12-09 03:42:45)(Screen):...,,Decision.EXCL
3,480,480,Antimicrobial efficacy of nine different root ...,"van der Vyver, P. J.; Botha, F. S.; de Wet, F. A.",,2014,,SADJ : journal of the South African Dental Ass...,69,4,"158-160, 162-165",373549858,,4948.0,#737,vanderVyver 2014,Gillian Clayton (2021-01-18 21:12:28)(Screen):...,,Decision.EXCL
4,528,528,Activity and efficacy of a stable chlorinated ...,"Pappalardo, G.; Tanner, F.; Roussianos, D.; Pa...",,1982,,Experientia,38,11,1373-1374,13170764,,4504.0,#820,Pappalardo 1982,Gillian Clayton (2021-01-18 21:09:58)(Screen):...,,Decision.EXCL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,3617,1009,Disinfection effect of neutral electrolyzed wa...,"Zhu, ZhiWei; Li, BaoMing; Li, YongYu; Shang, Y...","For the sake of developing an effective, safe ...",2010,,Transactions of the Chinese Society of Agricul...,26,3,358-362,20103199339,,,#3296,Zhu 2010,Kate Homyer (2022-09-30 22:14:27)(Select): Not...,,Decision.FT_INCOMP
70,3619,1011,A first study on the application of an electro...,"Zytur, Stellan; Bandte, Martina; Rodriguez, Ha...",,2016,,60 German Plant Protection Conference - Short ...,454,,481-,BCI:BCI201700088244,,,#3298,Zytur 2016,Kate Homyer (2022-10-03 23:16:29)(Select): can...,,Decision.FT_INCOMP
71,3663,1055,A comparative experiment on the three kinds of...,"Hara, Norika; Fujita, Shoichi; Satake, Yuko; N...",Xanthomonas citri pv. citri and X. citri pv. a...,2020,,Research Bulletin of the Plant Protection Serv...,,56,,,,,#3465,Hara 2020,Kate Homyer (2022-12-05 22:48:23)(Select): no ...,,Decision.FT_INCOMP
72,3701,1093,A Concept for the Reduction of Mucosal SARS-Co...,"Mueller, C. A.; Winter, M.; Renner, B.","During the next few months or years, vaccinati...",2021,,Drug Research,71,6,,,https://dx.doi.org/10.1055/a-1467-5956,,#3621,Mueller 2021,Kate Homyer (2022-12-06 01:29:45)(Select): Not...,,Decision.FT_INCOMP


In [30]:
# No abstract
len_before = len(full_df)
full_df = full_df[~full_df.Abstract.isnull()]
print(f'{len_before - len(full_df)} rows removed due to no abstract')

126 rows removed due to no abstract


In [31]:
# Remove papers which seem to have been excluded purely due to lack of full text rather
# than based on the content of the title/abstract.
# Most papers marked as missing full text are FT_INCOMP - these are OK to keep.
to_remove = [443, 460, 2035, 2107, 2590, 2596, 2597, 2598, 2600, 2601, 2606]
len_before = len(full_df)
full_df = full_df.drop(to_remove)
print(f'{len_before - len(full_df)} rows removed - exclusion due to missing full text')

11 rows removed - exclusion due to missing full text


In [32]:
# Remove papers removed due to non-English text (also some FT_INCOMPL where the abstract
# was English and approved but full text not English -- these we leave)
to_remove = [439, 664, 2185]
len_before = len(full_df)
full_df = full_df.drop(to_remove)
print(f'{len_before - len(full_df)} rows removed - exclusion due to full text not English')

3 rows removed - exclusion due to full text not English


In [67]:
full_df[full_df.Notes.apply(is_important_note)]

Unnamed: 0,index,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags,decision,orig_index
72,72,Effectiveness of individual or combined saniti...,"Yuk, H. G.; Bartz, J. A.; Schneider, K. R.","Unwaxed, green tomatoes ('Florida 47' cultivar...",2005.0,,Journal of Food Science,70,9,M409-M414,20063008118,http://dx.doi.org/10.1111/j.1365-2621.2005.tb0...,10236.0,#117,Yuk 2005,Felicity Mehendale (2020-11-10 06:49:20)(Scree...,,Decision.EXCL,72
187,187,Two-year follow-up study of the effect of acid...,"Tanaka, H.; Honma, S.; Nishi, M.; Igarashi, T....",Acid fog is a complex mixture of atomospheric ...,1996.0,,Intern. Med.,35,2,100-104,WOS:A1996UA20200005,10.2169/internalmedicine.35.100,8339.0,#293,Tanaka 1996,Gillian Clayton (2021-01-08 03:13:55)(Screen):...,,Decision.EXCL,187
615,615,Mechanisms of ultraviolet disinfection and chl...,"Xu, L. M.; Zhang, C. M.; Xu, P. C.; Wang, X. C...",Traditional culture methods may underestimate ...,2018.0,,J. Environ. Sci.,65,,356-366,WOS:000427600600035,10.1016/j.jes.2017.07.006,8529.0,#969,Xu 2018,Felicity Mehendale (2020-11-29 08:18:28)(Scree...,,Decision.EXCL,615
1201,1201,The dimensional stability of dental impression...,"Martin, N.; Martin, M. V.; Jedynakiewicz, N. M.",OBJECTIVES: This investigation examined the ef...,2007.0,,Dental materials : official publication of the...,23,6,760-768,47231825,http://dx.doi.org/10.1016/j.dental.2007.01.004,4323.0,#1873,Martin 2007,Kayla Ostrishko (2020-11-14 11:54:19)(Screen):...,,Decision.EXCL,1201
2515,101,Light up ClO(-) in live cells using an aza-cou...,"Fan, J.; Mu, H.; Zhu, H.; Wang, J.; Peng, X.",Hypochlorous acid (HClO)/hypochlorite (ClO(-))...,2015.0,,The Analyst,140,13,4594-4598,608359278,http://dx.doi.org/10.1039/c5an00777a,3753.0,#2508,Fan 2015,Kate Homyer (2022-05-25 23:26:27)(Screen): Sou...,,Decision.CONFL,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3635,1027,Comparison of disinfectants for drinking water...,"Choi, Y.; Byun, S. H.; Jang, H. J.; Kim, S. E....",The feasibility of on-site generated chlorine ...,2022.0,,Environmental Engineering Research,27,1,,,10.4491/eer.2020.543,,#3381,Choi 2022,Gillian Clayton (2022-09-29 23:14:08)(Screen):...,,Decision.FT_INCOMP,1027
3650,1042,Electrolyzed Water Generated On-Site as a Prom...,"Farah, R. I.; Al-Haj Ali, S. N.","Electrolyzed water is a safe, broad-spectrum b...",2021.0,,Frontiers in Public Health,9,,,,https://dx.doi.org/10.3389/fpubh.2021.629142,,#3434,Farah 2021,Gillian Clayton (2022-09-29 23:48:18)(Screen):...,,Decision.FT_INCOMP,1042
3663,1055,A comparative experiment on the three kinds of...,"Hara, Norika; Fujita, Shoichi; Satake, Yuko; N...",Xanthomonas citri pv. citri and X. citri pv. a...,2020.0,,Research Bulletin of the Plant Protection Serv...,,56,,,,,#3465,Hara 2020,Kate Homyer (2022-12-05 22:48:23)(Select): no ...,,Decision.FT_INCOMP,1055
3701,1093,A Concept for the Reduction of Mucosal SARS-Co...,"Mueller, C. A.; Winter, M.; Renner, B.","During the next few months or years, vaccinati...",2021.0,,Drug Research,71,6,,,https://dx.doi.org/10.1055/a-1467-5956,,#3621,Mueller 2021,Kate Homyer (2022-12-06 01:29:45)(Select): Not...,,Decision.FT_INCOMP,1093


In [34]:
# label as positive everything where at least one reviewer thought it should
# get beyond abstract screening
full_df['label'] = full_df.decision.apply(lambda v: 1 if v.value > 0 else 0)

Unnamed: 0,index,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags,decision,orig_index,label
0,0,Simultaneous removal of tetracycline and disin...,"Zuo, S.; Zhang, Y.; Ren, G.; Pan, Y.; Zhang, Q...",Pharmaceutical and personal care products as o...,2019.0,,Journal of Hazardous Materials,368,,771-777,2001546732,http://dx.doi.org/10.1016/j.jhazmat.2019.02.005,5202.0,#1,Zuo 2019,,,Decision.EXCL,0,0
1,1,A practical method for the calculation of liqu...,"Zuend, A.; Seinfeld, J. H.",Liquid mixtures containing a variety of organi...,2013.0,,Fluid Phase Equilib.,337,,201-213,WOS:000315325500027,10.1016/j.fluid.2012.09.034,8644.0,#2,Zuend 2013,,,Decision.EXCL,1,0
2,2,Toward better microbial safety of wheat sprout...,"Zudyte, B.; Luksiene, Z.",Sprouted seeds are gaining popularity worldwid...,2019.0,,Photochem. Photobiol. Sci.,18,10,2521-2530,WOS:000489635100014,10.1039/c9pp00157c,8643.0,#3,Zudyte 2019,,,Decision.EXCL,2,0
3,3,Functional collaboration of biofilm-cathode el...,"Zou, H.; Wang, Y.",A distinctive process (BCE-MFC) was developed ...,2019.0,,Environmental science and pollution research i...,26,22,23061-23069,628271164,http://dx.doi.org/10.1007/s11356-019-05617-w,5201.0,#4,Zou 2019,,,Decision.EXCL,3,0
4,4,Continuous synthesis of graphene sheets by spr...,"Zou, B.; Wang, X. X.; Huang, X. X.; Wang, J. N.",Graphene sheets (GNS) were synthesized continu...,2015.0,,Chemical Communications,51,4,741-744,600732879,http://dx.doi.org/10.1039/c4cc08197h,5200.0,#5,Zou 2015,,,Decision.EXCL,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3842,91,In Vitro Antibacterial Activity of Hydrogen Pe...,"Raval, Y. S.; Flurin, L.; Mohamed, A.; Greenwo...",Hydrogen peroxide (H2O2) and hypochlorous acid...,2021.0,,Antimicrobial Agents and Chemotherapy,65,5,,,10.1128/aac.01966-20,,#3680,Raval 2021,,"Setting - Clinical, human",Decision.FT_INCL,91,1
3843,92,Inactivation of bacteria causing soft rot dise...,"Song, Hyeyeon; Lee, Jae Yung; Lee, Hae-Won; Ha...","Slightly acidic electrolyzed water (SAEW), an ...",2021.0,,Food Control,128,,,,10.1016/j.foodcont.2021.108217,,#3739,Song 2021,,Setting - Food processing,Decision.FT_INCL,92,1
3844,93,Application of Neutral Electrolyzed Water on p...,"Torres-Rosales, E.; Rivera-Garcia, A.; Rosario...",Physicochemical and microbiological properties...,2020.0,,Scientific Reports,10,1,,,https://dx.doi.org/10.1038/s41598-020-76931-4,,#3766,Torres-Rosales 2020,,Setting - Food processing,Decision.FT_INCL,93,1
3845,94,Electrochemically activated solution as bladde...,"Weyler, M.; Jaekel, A.; Kirschner-Hermanns, R....",AIMS: Driven by increasing awareness of antibi...,2021.0,,Neurourology & Urodynamics,40,7,,,https://dx.doi.org/10.1002/nau.24745,,#3799,Weyler 2021,,"Setting - Clinical, human",Decision.FT_INCL,94,1


In [35]:
# so we can distinguish this when we get other reviews
full_df['review_id'] = 121733

In [36]:
len(full_df), full_df.label.sum()

(3707, 1373)

In [37]:
simplify_columns(full_df).to_csv('review_121733/full.csv')

In [107]:
# create test/val/train splits
import numpy as np

simple_df = full_df[['Title', 'Abstract', 'label', 'review_id']].rename(columns={
    'Title': 'title',
    'Abstract': 'abstract',
})

rng = np.random.default_rng(24)
shuffled_df = simple_df.sample(frac=1)

splits = (0.10, 0.10, 0.8)
thresholds = [np.ceil(sum(splits[:i+1]) * len(shuffled_df)).astype(int) for i in range(len(splits))]
test_df = shuffled_df[:thresholds[0]]
val_df = shuffled_df[thresholds[0]:thresholds[1]]
train_df = shuffled_df[thresholds[1]:thresholds[2]]

len(test_df), len(val_df), len(train_df)

(371, 371, 2965)

### Review 258698

In this review all excluded papers have a reason for exclusion in the 'Notes' section. However, it seems to be something that's been selected out of a narrow range of generic options and not always accurate, e.g. #412 in excluded says 'wrong type of evidence' but also has a manual note saying it's abstract only and the same research as #411, which is included

Only small number of rows that say excluded due to no full paper and the like; have excluded by adding a preproc_tag column with value 'remove'.

There's only about 200 papers in this set - feels too small to do train/dev/test splits.

In [2]:
import pandas as pd

In [24]:
included = pd.read_csv('review_258698/review_258698_included_csv_20240427023944.csv')
excluded = pd.read_csv('review_258698/review_258698_excluded_csv_20240427023950.csv')

included['label'] = 1
excluded['label'] = 0

full_df = pd.concat([excluded, included])
full_df['orig_index'] = full_df.index
full_df = full_df.reset_index()
full_df = full_df[full_df.preproc_tag != 'remove']

full_df = full_df[~full_df.Abstract.isnull()]
full_df

Unnamed: 0,index,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags,preproc_tag,label,orig_index
0,0,Rapid Ethical Assessment on Informed Consent C...,"Abay, Serebe; Addissie, Adamu; Davey, Gail; Fa...",BACKGROUND: Informed consent is a key componen...,2016,,PLoS ONE,11.0,6,,,https://dx.doi.org/10.1371/journal.pone.0157056,,#43,Abay 2016,Exclusion reason: Wrong test format;,"Interesting-Non-empirical (review article, com...",,0,0
1,1,Management of type 1 diabetes in low- and midd...,"Abdraimova, Aida; Besancon, Stephane; Portocar...",AIMS: To describe and compare the health syste...,2022,,Diabetic medicine : a journal of the British D...,39.0,8,,,https://dx.doi.org/10.1111/dme.14891,,#64,Abdraimova 2022,Exclusion reason: Wrong study design;,,,0,1
2,2,Pregnant women's experiences with an integrate...,"Abejirinde, I. O. O.; Douwes, R.; Bardaji, A.;...",Background: Quality antenatal care (ANC) is re...,2018,,BMC Pregnancy and Childbirth,18.0,209,,,http://dx.doi.org/10.1186/s12884-018-1853-7,,#71,Abejirinde 2018,Exclusion reason: Wrong study design;,,,0,2
3,3,Establishment of COVID-19 testing laboratory i...,"Abera, Adugna; Belay, Habtamu; Zewude, Aboma; ...",The Coronavirus pandemic is recording unpreced...,2020,,Global Health Action,13.0,1,,,10.1080/16549716.2020.1841963,,#73,Abera 2020,Exclusion reason: Wrong setting;,,,0,3
4,4,Malaria diagnosis and treatment behaviors amon...,"Aborigo, R. A.; Atuguba, F.; Chatio, S.; Adoct...","Globally, malaria control programmes are threa...",2011,,American Journal of Tropical Medicine and Hygiene,85.0,6 SUPPL. 1,,,,,#81,Aborigo 2011,Exclusion reason: Wrong type of evidence;,,,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,120,Caregiver experience and perceived acceptabili...,"Yee, WinLei; Than, KyuKyu; Mohamed, Y.; Htay, ...",Background: The majority of HIV infection amon...,2020,,PLoS ONE,15,10,,,http://dx.doi.org/10.1371/journal.pone.0241245,,#6809,Yee 2020,,,,1,120
218,121,Integrated point-of-care testing (POCT) for HI...,"Young, N.; Achieng, F.; Desai, M.; Phillips-Ho...","Background: HIV, syphilis, malaria and anaemia...",2019,,BMC Health Services Research,19,74,,,http://dx.doi.org/10.1186/s12913-018-3844-9,,#6846,Young 2019,,,,1,121
219,122,Decentralization of health care to HIV-AIDS fo...,"Zambenedetti, G.; da Silva, R. A. N.","In this article,we seek to identify and discus...",2016,,Physis,26,3,,,10.1590/S0103-73312016000300005,,#6902,Zambenedetti 2016,,,,1,122
220,123,Dengue rapid diagnostic tests: Health professi...,"Zongo, S.; Carabali, M.; Munoz, M.; Ridde, V.",Objectives: Dengue fever remains unrecognized ...,2018,,SAGE Open Medicine,6,,,,https://dx.doi.org/10.1177/2050312118794589,,#7044,Zongo 2018,,,,1,123


In [10]:
full_df.Notes.unique()

array(['Exclusion reason: Wrong test format; ',
       'Exclusion reason: Wrong study design; ',
       'Exclusion reason: Wrong setting; ',
       'Exclusion reason: Wrong type of evidence; ',
       'Exclusion reason: Wrong phenomena of interest; ',
       'Exclusion reason: Wrong perspective; ',
       'Exclusion reason: Wrong environment; ',
       'Exclusion reason: Wrong type of evidence; Janet Perkins (2022-10-28 17:54:14)(Select): Conference proceedings; ',
       'Exclusion reason: Wrong environment; Jasmin Rostron (2022-10-28 21:24:00)(Select): interesting; ',
       nan], dtype=object)

In [12]:
title_counts = full_df.groupby('Title').count()
dup_titles = title_counts[title_counts.Authors > 1]
dup_titles

Unnamed: 0_level_0,index,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags,preproc_tag,label,orig_index
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1


In [25]:
full_df['review_id'] = 258698

# create test/val/train splits
import numpy as np

simple_df = full_df[['Title', 'Abstract', 'label', 'review_id']].rename(columns={
    'Title': 'title',
    'Abstract': 'abstract',
})

rng = np.random.default_rng(24)
shuffled_df = simple_df.sample(frac=1)

splits = (0.10, 0.10, 0.8)
thresholds = [np.ceil(sum(splits[:i+1]) * len(shuffled_df)).astype(int) for i in range(len(splits))]
test_df = shuffled_df[:thresholds[0]]
val_df = shuffled_df[thresholds[0]:thresholds[1]]
train_df = shuffled_df[thresholds[1]:thresholds[2]]

len(test_df), len(val_df), len(train_df)

(21, 21, 168)

In [26]:
simple_df.to_csv('review_258698/full.csv')

In [38]:
eval = pd.read_csv('review_334317/zero-shot-eval.csv', index_col=0)
full = pd.read_csv('review_334317/full.csv', index_col=0)
free_idxs = set(full.index) - set(eval.index)
egs_df = full.loc[list(free_idxs)]
pos = egs_df[egs_df.label == 1].sample(5)
neg = egs_df[egs_df.label == 0].sample(5)

ValueError: a must be greater than 0 unless no samples are taken

### Multi-review datasets

In [96]:
def make_multi_review_df(reviews, min_pos_sample_size=98, max_pos_sample_size=None, balance=True):
    frames = []
    for path in Path('.').glob('*/full.csv'):        
        df = pd.read_csv(path, index_col=0)
        rev_id = df.iloc[0].review_id
        if not rev_id in reviews:
            continue
        num_pos = df.label.sum()
        if min_pos_sample_size > num_pos:
            raise Exception(f'Not enough positive examples in {path}')
        if max_pos_sample_size and max_pos_sample_size < num_pos:
            sample_num = max_pos_sample_size
        elif balance:
            sample_num = min_pos_sample_size
        else:
            sample_num = num_pos
        df = pd.concat([
            df[df.label == 1].sample(sample_num),
            df[df.label == 0].sample(sample_num),
        ])
        df['df_index'] = df.index
        df = df.sample(frac=1, ignore_index=True)
        frames.append(df)

    if balance:
        dataset = pd.concat(frames).sort_index()
    else:
        dataset = pd.concat(frames).sample(frac=1, ignore_index=True)
    return dataset

In [54]:
import re
import numpy as np

def make_multi_review_val_df(reviews, review_sample_size=97):
    frames = []
    for val_split_path in Path('.').glob('*/val_split.txt'):
        match = re.search(r'review_(\d{6})', str(val_split_path))
        rev_id = int(match.groups()[0])
        if not rev_id in reviews:
            continue
            
        df_path = val_split_path.parent / 'full.csv'
        df = pd.read_csv(df_path, index_col=0)
        df['df_index'] = df.index
        val_split = np.genfromtxt(val_split_path)
        df = df.loc[val_split].sample(review_sample_size, ignore_index=True)
        frames.append(df)
        
    dataset = pd.concat(frames).sort_index()
    return dataset

In [102]:
df = make_multi_review_df([117787, 121733, 165805, 258698, 287708, 334317, 378562])
path = f'multi-review-all.csv'
df[['review_id', 'df_index']].to_csv(path, index=False)

In [19]:
reviews = {117787, 121733, 165805, 258698, 287708, 334317, 378562}
for excl_review in reviews:
    incl_reviews = reviews - {excl_review}
    df = make_multi_review_df(incl_reviews)
    path = f'multi-review-excl-{excl_review}.csv'
    df[['review_id', 'df_index']].to_csv(path, index=False)

In [117]:
unbal = make_multi_review_df([121733, 165805, 258698, 287708, 334317, 378562], max_pos_sample_size=500, balance=False)
unbal[['review_id', 'df_index']].to_csv('multi-review-unbal-excl-117787.csv', index=False)

In [118]:
df = pd.read_csv('multi-review-unbal-excl-117787.csv')

In [27]:
# adhoc set to compare performance on 378562
incl_reviews = [258698, 165805, 287708]
df = make_multi_review_df([258698, 165805, 287708], min_pos_sample_size=196)
set_id = '-'.join([str(rev) for rev in incl_reviews])
path = f'three-review-Z-{set_id}.csv'
df[['review_id', 'df_index']].to_csv(path, index=False)

In [55]:
reviews = {117787, 121733, 165805, 258698, 287708, 334317, 378562}
top_four = [121733, 165805, 258698, 287708]
letters = 'ABCDEFG'
for i, eval_review in enumerate(reviews):
    incl_reviews = [rev for rev in top_four if rev != eval_review][:3]
    df = make_multi_review_df(incl_reviews, min_pos_sample_size=196)
    set_id = '-'.join([str(rev) for rev in incl_reviews])
    path = f'three-review-{letters[i]}-{set_id}.csv'
    df[['review_id', 'df_index']].to_csv(path, index=False)
    print('Saved', path)

    val_reviews = reviews - set(incl_reviews)
    val_df = make_multi_review_val_df(val_reviews)
    set_id = '-'.join([str(rev) for rev in val_reviews])
    path = f'val-split-{letters[i]}-{set_id}.csv'
    val_df[['review_id', 'df_index']].to_csv(path, index=False)
    print('Saved', path)


Saved three-review-A-121733-165805-258698.csv
Saved val-split-A-378562-117787-287708-334317.csv
Saved three-review-B-165805-258698-287708.csv
Saved val-split-B-334317-378562-117787-121733.csv
Saved three-review-C-121733-165805-287708.csv
Saved val-split-C-378562-117787-334317-258698.csv
Saved three-review-D-121733-165805-258698.csv
Saved val-split-D-378562-117787-287708-334317.csv
Saved three-review-E-121733-165805-258698.csv
Saved val-split-E-378562-117787-287708-334317.csv
Saved three-review-F-121733-165805-258698.csv
Saved val-split-F-378562-117787-287708-334317.csv
Saved three-review-G-121733-258698-287708.csv
Saved val-split-G-378562-117787-334317-165805.csv


In [57]:
for i, eval_review in enumerate(reviews):
    incl_reviews = [rev for rev in top_four if rev != eval_review][:3]
    print('Review', eval_review)
    print('Set', letters[i])
    print('-'.join([str(rev) for rev in incl_reviews]))
    print()
    i += 1

Review 378562
Set A
121733-165805-258698

Review 121733
Set B
165805-258698-287708

Review 258698
Set C
121733-165805-287708

Review 117787
Set D
121733-165805-258698

Review 287708
Set E
121733-165805-258698

Review 334317
Set F
121733-165805-258698

Review 165805
Set G
121733-258698-287708



In [45]:
from itertools import combinations

reviews = [117787, 121733, 165805, 258698, 287708, 334317, 378562]

for incl_reviews in combinations(reviews, 4):
    set_id = '-'.join([str(rev) for rev in incl_reviews])
    df = make_multi_review_val_df(incl_reviews)
    path = f'val_split-{set_id}.csv'
    df[['review_id', 'df_index']].to_csv(path, index=False)
    print('Saved', path)


Saved val_split-117787-121733-165805-258698.csv
Saved val_split-117787-121733-165805-287708.csv
Saved val_split-117787-121733-165805-334317.csv
Saved val_split-117787-121733-165805-378562.csv
Saved val_split-117787-121733-258698-287708.csv
Saved val_split-117787-121733-258698-334317.csv
Saved val_split-117787-121733-258698-378562.csv
Saved val_split-117787-121733-287708-334317.csv
Saved val_split-117787-121733-287708-378562.csv
Saved val_split-117787-121733-334317-378562.csv
Saved val_split-117787-165805-258698-287708.csv
Saved val_split-117787-165805-258698-334317.csv
Saved val_split-117787-165805-258698-378562.csv
Saved val_split-117787-165805-287708-334317.csv
Saved val_split-117787-165805-287708-378562.csv
Saved val_split-117787-165805-334317-378562.csv
Saved val_split-117787-258698-287708-334317.csv
Saved val_split-117787-258698-287708-378562.csv
Saved val_split-117787-258698-334317-378562.csv
Saved val_split-117787-287708-334317-378562.csv
Saved val_split-121733-165805-258698-287

In [14]:
full_df = pd.concat([
    pd.read_csv(path, index_col=0)
    for path in Path('.').glob('*/full.csv')
])

In [15]:
full_df = full_df.set_index([full_df.index, full_df.review_id]) 

In [51]:
split_df = pd.read_csv('val-split-A-378562-117787-287708-334317.csv')
indexer = [(row.df_index, row.review_id) for _, row in split_df.iterrows()]
full_df.loc[indexer]

KeyError: '[(0, 334317), (8, 334317), (10, 117787), (16, 117787), (17, 117787), (25, 287708), (26, 287708), (26, 334317), (30, 334317), (35, 334317), (39, 117787), (39, 287708), (43, 334317), (44, 378562), (45, 378562), (46, 378562), (47, 378562), (48, 378562), (49, 378562), (57, 117787), (66, 378562), (67, 378562), (73, 378562), (77, 117787), (78, 378562), (81, 378562), (82, 334317), (88, 378562), (92, 117787), (93, 378562), (95, 378562), (96, 378562)] not in index'

In [115]:
split_df = pd.read_csv('multi-review-excl-117787.csv')
indexer = [(row.df_index, row.review_id) for _, row in split_df.iterrows()]
full_df.loc[indexer]

Unnamed: 0_level_0,Unnamed: 1_level_0,orig_index,title,abstract,label,review_id
Unnamed: 0_level_1,review_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
371,378562,256,Hidden in Plain Sight: “Neutral” Enclosures fo...,"Politicians and mainstream media in the EU, UK...",0,378562
3165,258698,2943,The successful containment of COVID-19 outbrea...,Objectives: A three-member central Rapid Respo...,0,258698
1110,334317,964,How Can I Feel Safe at Home? Adolescents' Expe...,Despite the implementation of various national...,0,334317
286,165805,274,Relationship Between Bronchial Asthma and COVI...,<h2>Condition:</h2>Asthma;COVID-19<br><br><h2>...,1,165805
330,121733,330,Amide Link Scission in the Polyamide Active La...,The volume-averaged amide link scission in the...,0,121733
...,...,...,...,...,...,...
1556,121733,1556,Electrical methods of controlling bacterial ad...,This review will summarize the significant bod...,0,121733
3629,334317,3483,Characteristics of intentional acute drug into...,Background/Objectives: The purpose of this stu...,0,334317
57,378562,24,Evidence review for multicomponent interventio...,The UK has a routine vaccination schedule cove...,1,378562
166,165805,154,Underlying Medical Conditions Associated With ...,Importance: Information on underlying conditio...,1,165805


### Augmented datasets

In [92]:
full_df = pd.read_csv('review_378562/full.csv', index_col=0)
train_split = np.genfromtxt('review_378562/natural_train_split.txt')
df = full_df.loc[train_split]
reviews = pd.read_csv('reviews.csv', index_col='index')
reviews

# targeting 378562 as train and 334317 as eval
contrast_reviews = [121733, 117787, 165805, 258698, 287708] 
df_jumbled = df[df.label == 1].copy()
df_jumbled['review_id'] = np.random.choice(contrast_reviews, size=len(df_jumbled))
df_jumbled['label'] = 0

total_df = pd.concat([df, df_jumbled], ignore_index=True)
total_df = total_df.sample(frac=1)

total_df.to_csv('aug_cc_nat_378562.csv')

In [93]:
pd.read_csv('aug_cc_nat_378562.csv', index_col=0)

Unnamed: 0,orig_index,title,abstract,label,review_id
244,517,Spectrality and Thanatic Ethics of Care in Atl...,Although the gendered response to moral proble...,0,378562
394,199,"EVERYBODY HATES A TOURIST: WORLD-TRAVELING, EP...","Prior to the pandemic of 2020, global tourism ...",0,378562
456,174,The effect of mitigation strategies on univers...,"Rising levels of anxiety, stress and depressio...",0,378562
113,490,Rewriting the Transnational Dimension of Itali...,This article investigates how COVID-19 has rew...,0,378562
500,109,Counter-urbanisation in pre-pandemic times: di...,The COVID-19 pandemic has stimulated a resurge...,0,378562
...,...,...,...,...,...
285,278,IBIA 2021: A Global Conference and Meeting of ...,The proceedings contain 258 papers. The topics...,0,378562
541,457,Public health community engagement with Asian ...,OBJECTIVES: COVID-19 has posed significant cha...,0,378562
277,566,Unloved but Indispensable Logistics: How to Im...,[...]people should appreciate the role and the...,0,378562
419,314,Inspecting Prisons during a pandemic,"An interview with Peter Clarke, HM Chief Inspe...",0,378562


In [15]:
full_df = pd.read_csv('review_287708/full.csv', index_col=0)
train_split = np.genfromtxt('review_287708/balanced_train_split.txt')
df = full_df.loc[train_split]
reviews = pd.read_csv('reviews.csv', index_col='index')
reviews

# targeting 287708 as train and 288055 as eval
contrast_reviews = [121733, 117787, 165805, 258698, 334317] 
df_jumbled = df[df.label == 1].copy()
df_jumbled['review_id'] = np.random.choice(contrast_reviews, size=len(df_jumbled))
df_jumbled['label'] = 0

total_df = pd.concat([df, df_jumbled])
#total_df = total_df.sample(frac=1)

#total_df.to_csv('aug_cc_287708.csv')

In [16]:
all_reviews = {378562, 334317, 117787, 258698, 165805, 287708, 121733}

for review in all_reviews:
    rng = np.random.default_rng()
    train_split = np.genfromtxt(f'review_{review}/balanced_train_split.txt')
    rng.shuffle(train_split)
    df = pd.read_csv(f'review_{review}/full.csv', index_col=0).loc[train_split]
    new_rows = []
    for _, row in df.iterrows():
        new_rows.append(row)
        if row.label == 0:
            continue
        aug_row = row.copy()
        aug_row.review_id = rng.choice(contrast_reviews)
        aug_row.label = 0
        new_rows.append(aug_row)

    df = pd.DataFrame(new_rows)
    df = df.reset_index()
    df.rename(columns={'index': 'split_index'}, inplace=True)
    df = df.sample(frac=1)
    if len(df) > 1500:
        df = df[:1500]
    df.to_csv(f'review_{review}/aug_cc.csv')
