In [None]:
# One-time environment setup: uncomment and run these once if the packages are missing.
#!pip install snorkel
#!pip install dask distributed --upgrade
#!python -m spacy download en_core_web_md

In [1]:
from tqdm import tqdm
import pandas as pd 
import os
from snorkel.labeling import PandasLFApplier,LFAnalysis,LabelingFunction
from snorkel.labeling.model.label_model import LabelModel
from snorkel.labeling.apply.dask import PandasParallelLFApplier
import re
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
import time
import datetime
from datetime import datetime, date, time
from nltk import ngrams

In [15]:
# ---- path configuration ----
task = 'task1'  # which task to run: 'task1' or 'task2'
root_path = './'
# root_path = '/repo1/code/autoreview/'
data_path = f'{root_path}data/{task}/'        # folder holding the retrieved article abstracts
keyword_path = f'{data_path}keywords/'        # folder holding the keyword lists
save_path = f'{root_path}results/{task}/'     # folder the results are written to
literature_file_name = 'metadata_hypercoagulable.tsv'
pseudo_label_file_name = 'pseudo_label.pkl'

In [3]:
def build_sentence_df():
    """
    Read the literature file (data_path+literature_file_name, tab-separated) and
    build an article-level frame plus a sentence-level frame.
    Refined from `build_raw_data`

    output:
        abstracts: pd.DataFrame, [ncord_uid, abstract], one row per article.
                   Returned as an independent copy so callers may add columns to it.
        sentences: pd.DataFrame, [ncord_uid, sent], one row per sentence produced by
                   `sent_tokenize`; rows are grouped by sentence position (all first
                   sentences, then all second sentences, ...).
    """
    articles=pd.read_csv(data_path+literature_file_name, sep='\t')
    # 'Unnamed: 0' is the index column written by a previous DataFrame.to_csv;
    # tolerate files that were saved without it.
    articles=articles.drop(columns='Unnamed: 0', errors='ignore')

    # Copy so that later column assignments do not act on a slice of `articles`.
    abstracts=articles[['ncord_uid', 'abstract']].copy()

    # Missing abstracts cannot be tokenized; they simply contribute no sentences.
    sentence_lists=abstracts['abstract'].dropna().apply(sent_tokenize)

    # One column per sentence position (ragged rows are padded with missing values).
    wide=pd.DataFrame(sentence_lists.tolist(), index=sentence_lists.index)

    # Only `ncord_uid` is merged back in, so melt() turns nothing but the sentence
    # columns into `sent` (other metadata columns must not leak into it).
    sentences=wide.merge(abstracts[['ncord_uid']], left_index=True, right_index=True)\
                    .melt(id_vars=['ncord_uid'], value_name='sent')\
                    .drop('variable', axis=1).dropna()

    return abstracts, sentences


In [4]:
def load_keyword(keyword_file_name):
    """
    Read a comma-separated keyword list from keyword_path+keyword_file_name.

    input:
        keyword_file_name: str, file name, e.g., `keylist.txt`
    output:
        list of string, e.g., ['keyword1', 'keyword2', 'keyword3'];
        surrounding whitespace/newlines are stripped and empty entries are dropped
        
    """
    with open(keyword_path+keyword_file_name, "r") as f:
        raw_entries=f.read().split(',')
    # Entries such as ' trial' or 'rct\n' (space after a comma, trailing newline at
    # end of file) could never equal a whitespace-split token, so normalise them here.
    stripped=(entry.strip() for entry in raw_entries)
    return [entry for entry in stripped if entry]

In [22]:
def loop_labeling(col, keywordslist,viruslist,triallist,mustlist,maxngramnu):
    """
    Build the snorkel labeling functions that vote on the text stored in column `col`.

    input:
        col: str, name of the text column every labeling function reads
        keywordslist: list of str, the concept we want
        viruslist: list of str, the type of virus we want to look up
        triallist: list of str, the trial concept we want to look up
        mustlist: list of str, the concept a high ranking text should have
        maxngramnu: int, maximum size (in tokens) of the word window
    output:
        list of snorkel.LabelingFunction, in this order:
        keyword_keyword, keyword_virus, keyword_b_keyword, keyword_b_virus,
        keyword_trial, not_b_notkeyword, not_b_notvirus, not_b_notvirus1, number

    Every function returns its `label` (1 or 0 here) or -1 to abstain.
    Matching is done on whole whitespace-separated tokens, so a keyword that
    contains whitespace can never match. The keyword functions lower-case the text
    (keywords are compared as given); the `number` function compares the text as-is.
    A cell that is not a string (e.g. NaN) makes every function abstain.
    """
    ABSTAIN = -1

    def tokens_of(x, lower=True):
        # Whitespace tokens of x[col]; non-string cells yield no tokens.
        text = x[col]
        if not isinstance(text, str):
            return []
        if lower:
            text = text.lower()
        return text.split()

    # lookup keywords: vote `label` when any token is one of `keywords`
    # (a window of any size from 1 to maxngram contains a keyword exactly when
    # some single token is a keyword, so one pass over the tokens is enough)
    def keyword_lookup(x,keywords,maxngram,label):
        if maxngram < 1:
            return ABSTAIN
        if any(token in keywords for token in tokens_of(x)):
            return label
        return ABSTAIN

    def make_keyword_lf(keywords,maxngram,name,label=None):
        return LabelingFunction(
            name=f"keyword_{name}",
            f=keyword_lookup,
            resources=dict(keywords=frozenset(keywords),maxngram=maxngram,label=label),)

    # lookup both groups of keywords inside one window of at most `maxngram`
    # consecutive tokens: two positions share such a window exactly when they are
    # fewer than `maxngram` tokens apart
    def keyword_lookup_b(x,keywords1,keywords2,maxngram,label):
        tokens = tokens_of(x)
        first_hits = [i for i, token in enumerate(tokens) if token in keywords1]
        second_hits = [i for i, token in enumerate(tokens) if token in keywords2]
        if any(abs(i - j) < maxngram for i in first_hits for j in second_hits):
            return label
        return ABSTAIN

    def make_keyword_b_lf(keywords1,keywords2,maxngram,name,label=None):
        return LabelingFunction(
            name=f"keyword_b_{name}",
            f=keyword_lookup_b,
            resources=dict(keywords1=frozenset(keywords1),keywords2=frozenset(keywords2),
                           maxngram=maxngram,label=label),)

    # lookup a group of keywords (keywords1) being present while another group
    # (keywords2) is absent
    def not_lookup_b(x,keywords1,keywords2,maxngram,label):
        if maxngram < 1:
            return ABSTAIN
        tokens = tokens_of(x)
        has_wanted = any(token in keywords1 for token in tokens)
        has_excluded = any(token in keywords2 for token in tokens)
        if has_wanted and not has_excluded:
            return label
        return ABSTAIN

    def make_not_b_lf(keywords1,keywords2,name,maxngram,label=None):
        return LabelingFunction(
            name=f"not_b_{name}",
            f=not_lookup_b,
            resources=dict(keywords1=frozenset(keywords1),keywords2=frozenset(keywords2),
                           maxngram=maxngram,label=label),)

    # lookup a number in the text: vote `label` when some all-digit token appears
    # after some keyword token (text is NOT lower-cased here)
    def number_lookup(x,keywords,label):
        tokens = tokens_of(x, lower=False)
        keyword_hits = [i for i, token in enumerate(tokens) if token in keywords]
        number_hits = [i for i, token in enumerate(tokens) if token.isdigit()]
        # earliest keyword before latest number <=> some keyword precedes some number
        if keyword_hits and number_hits and keyword_hits[0] < number_hits[-1]:
            return label
        return ABSTAIN

    def make_number_lf(keywords,label=None):
        return LabelingFunction(
            name="number",
            f=number_lookup,
            resources=dict(keywords=frozenset(keywords),label=label),)

    return [
        make_keyword_lf(keywords=keywordslist,maxngram=maxngramnu,name='keyword',label=1),
        make_keyword_lf(keywords=viruslist,maxngram=maxngramnu,name='virus',label=1),
        make_keyword_b_lf(keywords1=keywordslist,keywords2=mustlist,maxngram=maxngramnu,name='keyword',label=1),
        make_keyword_b_lf(keywords1=viruslist,keywords2=mustlist,maxngram=maxngramnu,name='virus',label=1),
        make_keyword_lf(keywords=triallist,name='trial',maxngram=maxngramnu,label=1),
        make_not_b_lf(keywords1=mustlist,keywords2=keywordslist,maxngram=maxngramnu,name='notkeyword',label=0),
        make_not_b_lf(keywords1=keywordslist,keywords2=viruslist,maxngram=maxngramnu,name='notvirus',label=0),
        # the original referenced the undefined name `iruslist` here (NameError)
        make_not_b_lf(keywords1=triallist,keywords2=viruslist,maxngram=maxngramnu,name='notvirus1',label=0),
        make_number_lf(keywords=triallist,label=1),
    ]

In [6]:
def predict_prob(df, lfs, cardinality=2, seed=None):
    """
    Predict probability (label) by applying label functions lfs
    Refined from `snorkel_process` 
    
    input:
        df: pd.DataFrame, rows the labeling functions are applied to (left unmodified)
        lfs: list of snorkel.LabelingFunction
        cardinality: int, number of classes handed to the LabelModel
        seed: int or None, forwarded to LabelModel.fit when given so that a run can
              be reproduced; None keeps the library default
    output:
        pd.DataFrame, a copy of `df` with two added columns:
            label: prediction of the label model
            prob: predicted probability of class 1
    """
    
    applier=PandasLFApplier(lfs=lfs)
    applied=applier.apply(df=df)
    print(LFAnalysis(L=applied, lfs=lfs).lf_summary())
    
    label_model = LabelModel(cardinality=cardinality,verbose=True)
    fit_options={} if seed is None else {'seed': seed}
    label_model.fit(applied, **fit_options)

    # One inference pass yields both the hard labels and the class probabilities.
    labels, probs=label_model.predict(applied, return_probs=True)

    # Work on a copy: the caller's frame may be a slice of a larger frame, and
    # assigning columns to it in place would mutate (or warn about) that slice.
    labeled=df.copy()
    labeled['label']=labels
    labeled['prob']=probs[:,1]
    
    return labeled

In [7]:
# Read the literature file and build two frames:
#   abstracts - the ncord_uid / abstract columns, one row per article
#   sentences - one row per sentence of every abstract
abstracts, sentences = build_sentence_df()

Load keywords

In [8]:
# Read the three comma-separated keyword files found under `keyword_path`.
keywordslist, viruslist, triallist = (
    load_keyword(keyword_file)
    for keyword_file in ('keywords.txt', 'viruslist.txt', 'triallist.txt')
)
# Concept handed to loop_labeling as `mustlist`.
mustlist = ['hypercoagulable']

Generate labeling functions

In [13]:
# Labeling functions read the 'abstract' column and use a word window of at most 3 tokens.
allweaklabf = loop_labeling(
    col='abstract',
    keywordslist=keywordslist,
    viruslist=viruslist,
    triallist=triallist,
    mustlist=mustlist,
    maxngramnu=3,
)

Apply labeling functions 

In [14]:
# Apply every labeling function to the article-level frame, fit the label model,
# and keep the returned frame, which carries the added `label` and `prob` columns.
abstracts_prob=predict_prob(abstracts,allweaklabf)


  0%|          | 0/7862 [00:00<?, ?it/s][A
  0%|          | 7/7862 [00:00<02:20, 55.91it/s][A
  0%|          | 14/7862 [00:00<02:13, 58.67it/s][A
  0%|          | 20/7862 [00:00<02:19, 56.38it/s][A
  0%|          | 27/7862 [00:00<02:14, 58.45it/s][A
  0%|          | 33/7862 [00:00<02:19, 56.17it/s][A
  0%|          | 38/7862 [00:00<02:25, 53.70it/s][A
  1%|          | 43/7862 [00:00<02:33, 51.09it/s][A
  1%|          | 48/7862 [00:00<02:42, 48.12it/s][A
  1%|          | 53/7862 [00:01<02:52, 45.25it/s][A
  1%|          | 58/7862 [00:01<03:39, 35.51it/s][A
  1%|          | 63/7862 [00:01<03:30, 37.02it/s][A
  1%|          | 69/7862 [00:01<03:14, 40.16it/s][A
  1%|          | 75/7862 [00:01<03:00, 43.19it/s][A
  1%|          | 82/7862 [00:01<02:46, 46.81it/s][A
  1%|          | 87/7862 [00:01<02:53, 44.78it/s][A
  1%|          | 93/7862 [00:01<02:43, 47.38it/s][A
  1%|          | 98/7862 [00:02<03:06, 41.69it/s][A
  1%|▏         | 105/7862 [00:02<02:48, 46.08it/s][A
 

 12%|█▏        | 913/7862 [00:17<02:15, 51.32it/s][A
 12%|█▏        | 919/7862 [00:18<02:26, 47.48it/s][A
 12%|█▏        | 924/7862 [00:18<02:26, 47.42it/s][A
 12%|█▏        | 929/7862 [00:18<02:25, 47.63it/s][A
 12%|█▏        | 934/7862 [00:18<02:24, 47.87it/s][A
 12%|█▏        | 939/7862 [00:18<02:23, 48.21it/s][A
 12%|█▏        | 944/7862 [00:18<02:30, 45.89it/s][A
 12%|█▏        | 950/7862 [00:18<02:23, 48.26it/s][A
 12%|█▏        | 956/7862 [00:18<02:15, 50.85it/s][A
 12%|█▏        | 963/7862 [00:18<02:09, 53.41it/s][A
 12%|█▏        | 969/7862 [00:19<02:13, 51.54it/s][A
 12%|█▏        | 975/7862 [00:19<02:15, 50.66it/s][A
 12%|█▏        | 981/7862 [00:19<02:21, 48.50it/s][A
 13%|█▎        | 987/7862 [00:19<02:19, 49.42it/s][A
 13%|█▎        | 992/7862 [00:19<02:19, 49.40it/s][A
 13%|█▎        | 998/7862 [00:19<02:16, 50.22it/s][A
 13%|█▎        | 1006/7862 [00:19<02:03, 55.66it/s][A
 13%|█▎        | 1012/7862 [00:19<02:05, 54.64it/s][A
 13%|█▎        | 1019/7862

 23%|██▎       | 1798/7862 [00:37<01:59, 50.57it/s][A
 23%|██▎       | 1804/7862 [00:37<02:00, 50.44it/s][A
 23%|██▎       | 1810/7862 [00:38<01:58, 50.94it/s][A
 23%|██▎       | 1816/7862 [00:38<02:05, 48.33it/s][A
 23%|██▎       | 1821/7862 [00:38<02:12, 45.63it/s][A
 23%|██▎       | 1826/7862 [00:38<02:09, 46.52it/s][A
 23%|██▎       | 1834/7862 [00:38<01:54, 52.42it/s][A
 23%|██▎       | 1840/7862 [00:38<01:57, 51.11it/s][A
 23%|██▎       | 1846/7862 [00:38<01:58, 50.70it/s][A
 24%|██▎       | 1853/7862 [00:38<01:50, 54.59it/s][A
 24%|██▎       | 1860/7862 [00:39<01:47, 55.80it/s][A
 24%|██▎       | 1866/7862 [00:39<01:51, 53.68it/s][A
 24%|██▍       | 1872/7862 [00:39<01:51, 53.89it/s][A
 24%|██▍       | 1878/7862 [00:39<01:50, 54.00it/s][A
 24%|██▍       | 1884/7862 [00:39<01:56, 51.38it/s][A
 24%|██▍       | 1890/7862 [00:39<01:57, 50.99it/s][A
 24%|██▍       | 1897/7862 [00:39<01:49, 54.43it/s][A
 24%|██▍       | 1903/7862 [00:39<01:49, 54.66it/s][A
 24%|██▍  

 35%|███▍      | 2739/7862 [00:55<01:36, 52.90it/s][A
 35%|███▍      | 2745/7862 [00:55<01:41, 50.19it/s][A
 35%|███▌      | 2752/7862 [00:55<01:33, 54.83it/s][A
 35%|███▌      | 2758/7862 [00:55<01:37, 52.53it/s][A
 35%|███▌      | 2764/7862 [00:55<01:38, 51.96it/s][A
 35%|███▌      | 2771/7862 [00:56<01:36, 52.63it/s][A
 35%|███▌      | 2777/7862 [00:56<01:39, 51.30it/s][A
 35%|███▌      | 2783/7862 [00:56<01:39, 51.10it/s][A
 35%|███▌      | 2789/7862 [00:56<01:41, 49.77it/s][A
 36%|███▌      | 2795/7862 [00:56<01:42, 49.23it/s][A
 36%|███▌      | 2802/7862 [00:56<01:34, 53.70it/s][A
 36%|███▌      | 2808/7862 [00:56<01:36, 52.57it/s][A
 36%|███▌      | 2814/7862 [00:56<01:41, 49.91it/s][A
 36%|███▌      | 2820/7862 [00:56<01:39, 50.77it/s][A
 36%|███▌      | 2827/7862 [00:57<01:32, 54.48it/s][A
 36%|███▌      | 2833/7862 [00:57<01:37, 51.32it/s][A
 36%|███▌      | 2839/7862 [00:57<01:33, 53.55it/s][A
 36%|███▌      | 2848/7862 [00:57<01:23, 59.97it/s][A
 36%|███▋ 

 47%|████▋     | 3669/7862 [01:13<01:14, 56.11it/s][A
 47%|████▋     | 3675/7862 [01:13<01:17, 54.00it/s][A
 47%|████▋     | 3681/7862 [01:13<01:16, 54.54it/s][A
 47%|████▋     | 3687/7862 [01:13<01:17, 54.08it/s][A
 47%|████▋     | 3693/7862 [01:13<01:16, 54.20it/s][A
 47%|████▋     | 3699/7862 [01:13<01:15, 55.48it/s][A
 47%|████▋     | 3705/7862 [01:13<01:18, 52.75it/s][A
 47%|████▋     | 3712/7862 [01:13<01:15, 54.62it/s][A
 47%|████▋     | 3719/7862 [01:14<01:12, 57.37it/s][A
 47%|████▋     | 3725/7862 [01:14<01:18, 52.97it/s][A
 47%|████▋     | 3731/7862 [01:14<01:19, 52.07it/s][A
 48%|████▊     | 3737/7862 [01:14<01:21, 50.64it/s][A
 48%|████▊     | 3743/7862 [01:14<01:19, 52.06it/s][A
 48%|████▊     | 3749/7862 [01:14<01:23, 49.15it/s][A
 48%|████▊     | 3754/7862 [01:14<01:24, 48.68it/s][A
 48%|████▊     | 3760/7862 [01:14<01:21, 50.13it/s][A
 48%|████▊     | 3766/7862 [01:15<01:23, 49.09it/s][A
 48%|████▊     | 3772/7862 [01:15<01:19, 51.37it/s][A
 48%|████▊

 58%|█████▊    | 4554/7862 [01:33<03:30, 15.75it/s][A
 58%|█████▊    | 4559/7862 [01:33<02:47, 19.72it/s][A
 58%|█████▊    | 4564/7862 [01:33<02:19, 23.57it/s][A
 58%|█████▊    | 4570/7862 [01:33<01:57, 28.02it/s][A
 58%|█████▊    | 4575/7862 [01:33<01:43, 31.73it/s][A
 58%|█████▊    | 4581/7862 [01:33<01:30, 36.43it/s][A
 58%|█████▊    | 4586/7862 [01:33<01:22, 39.54it/s][A
 58%|█████▊    | 4591/7862 [01:33<01:18, 41.48it/s][A
 58%|█████▊    | 4597/7862 [01:33<01:12, 44.81it/s][A
 59%|█████▊    | 4603/7862 [01:34<01:08, 47.79it/s][A
 59%|█████▊    | 4609/7862 [01:34<01:12, 45.08it/s][A
 59%|█████▊    | 4615/7862 [01:34<01:09, 46.72it/s][A
 59%|█████▉    | 4621/7862 [01:34<01:06, 48.93it/s][A
 59%|█████▉    | 4628/7862 [01:34<01:02, 52.08it/s][A
 59%|█████▉    | 4634/7862 [01:34<01:00, 53.56it/s][A
 59%|█████▉    | 4640/7862 [01:34<00:58, 54.65it/s][A
 59%|█████▉    | 4647/7862 [01:34<00:57, 56.31it/s][A
 59%|█████▉    | 4653/7862 [01:35<00:58, 55.16it/s][A
 59%|█████

 69%|██████▉   | 5430/7862 [01:52<00:48, 50.09it/s][A
 69%|██████▉   | 5436/7862 [01:53<00:49, 49.18it/s][A
 69%|██████▉   | 5441/7862 [01:53<00:50, 47.69it/s][A
 69%|██████▉   | 5447/7862 [01:53<00:49, 48.90it/s][A
 69%|██████▉   | 5452/7862 [01:53<00:51, 47.14it/s][A
 69%|██████▉   | 5459/7862 [01:53<00:47, 51.09it/s][A
 70%|██████▉   | 5465/7862 [01:53<00:48, 49.41it/s][A
 70%|██████▉   | 5471/7862 [01:53<00:46, 51.50it/s][A
 70%|██████▉   | 5478/7862 [01:53<00:45, 52.81it/s][A
 70%|██████▉   | 5484/7862 [01:53<00:47, 50.28it/s][A
 70%|██████▉   | 5490/7862 [01:54<00:47, 49.69it/s][A
 70%|██████▉   | 5496/7862 [01:54<00:47, 49.38it/s][A
 70%|██████▉   | 5501/7862 [01:54<00:49, 47.78it/s][A
 70%|███████   | 5507/7862 [01:54<00:46, 50.56it/s][A
 70%|███████   | 5513/7862 [01:54<00:44, 53.04it/s][A
 70%|███████   | 5519/7862 [01:54<00:46, 50.35it/s][A
 70%|███████   | 5526/7862 [01:54<00:43, 53.11it/s][A
 70%|███████   | 5532/7862 [01:54<00:46, 50.30it/s][A
 70%|█████

 81%|████████  | 6335/7862 [02:11<00:29, 52.63it/s][A
 81%|████████  | 6341/7862 [02:11<00:29, 50.93it/s][A
 81%|████████  | 6347/7862 [02:12<00:30, 49.72it/s][A
 81%|████████  | 6353/7862 [02:12<00:29, 51.94it/s][A
 81%|████████  | 6359/7862 [02:12<00:28, 52.47it/s][A
 81%|████████  | 6365/7862 [02:12<00:28, 52.09it/s][A
 81%|████████  | 6371/7862 [02:12<00:28, 52.85it/s][A
 81%|████████  | 6377/7862 [02:12<00:29, 50.04it/s][A
 81%|████████  | 6385/7862 [02:12<00:27, 54.33it/s][A
 81%|████████▏ | 6391/7862 [02:12<00:28, 52.48it/s][A
 81%|████████▏ | 6397/7862 [02:12<00:27, 54.17it/s][A
 81%|████████▏ | 6403/7862 [02:13<00:26, 54.11it/s][A
 82%|████████▏ | 6409/7862 [02:13<00:27, 52.26it/s][A
 82%|████████▏ | 6415/7862 [02:13<00:28, 50.46it/s][A
 82%|████████▏ | 6421/7862 [02:13<00:29, 48.29it/s][A
 82%|████████▏ | 6427/7862 [02:13<00:28, 50.33it/s][A
 82%|████████▏ | 6434/7862 [02:13<00:27, 51.85it/s][A
 82%|████████▏ | 6440/7862 [02:13<00:27, 51.23it/s][A
 82%|█████

 92%|█████████▏| 7231/7862 [02:30<00:12, 50.90it/s][A
 92%|█████████▏| 7237/7862 [02:30<00:12, 48.24it/s][A
 92%|█████████▏| 7244/7862 [02:30<00:12, 50.83it/s][A
 92%|█████████▏| 7250/7862 [02:30<00:11, 52.40it/s][A
 92%|█████████▏| 7256/7862 [02:30<00:11, 51.57it/s][A
 92%|█████████▏| 7262/7862 [02:31<00:11, 53.29it/s][A
 92%|█████████▏| 7268/7862 [02:31<00:11, 52.55it/s][A
 93%|█████████▎| 7274/7862 [02:31<00:11, 51.42it/s][A
 93%|█████████▎| 7280/7862 [02:31<00:11, 50.36it/s][A
 93%|█████████▎| 7286/7862 [02:31<00:11, 48.60it/s][A
 93%|█████████▎| 7292/7862 [02:31<00:11, 49.77it/s][A
 93%|█████████▎| 7298/7862 [02:31<00:10, 51.55it/s][A
 93%|█████████▎| 7304/7862 [02:31<00:11, 47.44it/s][A
 93%|█████████▎| 7309/7862 [02:32<00:11, 47.25it/s][A
 93%|█████████▎| 7316/7862 [02:32<00:10, 49.96it/s][A
 93%|█████████▎| 7322/7862 [02:32<00:11, 48.62it/s][A
 93%|█████████▎| 7328/7862 [02:32<00:10, 50.62it/s][A
 93%|█████████▎| 7334/7862 [02:32<00:10, 50.86it/s][A
 93%|█████

                   j Polarity  Coverage  Overlaps  Conflicts
keyword_keyword    0      [1]  0.138387  0.138387   0.031417
keyword_virus      1      [1]  0.082040  0.066268   0.009540
keyword_b_keyword  2       []  0.000000  0.000000   0.000000
keyword_b_virus    3      [1]  0.000636  0.000636   0.000636
keyword_trial      4      [1]  0.717756  0.462605   0.001018
not_b_notkeyword   5      [0]  0.001145  0.001145   0.001145
not_b_nottrial     6      [0]  0.031417  0.031417   0.031417
number             7      [1]  0.401933  0.401933   0.000254


Sort the abstracts by predicted probability (highest first) and save them as a pickle file

In [16]:
# Rank the abstracts from highest to lowest predicted probability.
abstracts_prob=abstracts_prob.sort_values(by='prob',ascending=False)
# to_pickle does not create missing folders, so make sure the results folder exists.
os.makedirs(save_path, exist_ok=True)
abstracts_prob.to_pickle(save_path+pseudo_label_file_name)