### Parse WoS .txt, and save to .json
- parse wos .txt
- parse in batches
- dump to .json

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import re
import json

In [2]:
# Parse WoS .txt files to dicts
def parse_wos_txt(file_path):
    # Read WoS .txt file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split the content into individual records
    records = content.strip().split("\nER\n")
    records = [record.strip() + "\nER" for record in records if record.strip()]

    # Extract the fields
    papers = []
    for record in tqdm(records, desc='Progress: '):
        paper = {}
        lines = record.split('\n')
        for line in lines:
            if line.startswith('PT '):
                paper['PT'] = line[3:]

            elif line.startswith('AU '):
                aus = []
                aus.append(line[3:])
                # Read all subsequent AU lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        aus.append(next_line.strip())
                    else:
                        break
                paper['AU'] = '; '.join(aus)  

            elif line.startswith('AF '):
                afs = []
                afs.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        afs.append(next_line.strip())
                    else:
                        break
                paper['AF'] = '; '.join(afs)     

            elif line.startswith('TI '):
                tis = []
                tis.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        tis.append(next_line.strip())
                    else:
                        break
                paper['TI'] = ' '.join(tis)    

            elif line.startswith('SO '):
                sos = []
                sos.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        sos.append(next_line.strip())
                    else:
                        break
                paper['SO'] = ' '.join(sos)  

            elif line.startswith('LA '):
                paper['LA'] = line[3:]   

            elif line.startswith('DT '):
                paper['DT'] = line[3:]     

            elif line.startswith('DE '):
                des = []
                des.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        des.append(next_line.strip())
                    else:
                        break
                paper['DE'] = ' '.join(des) 

            elif line.startswith('ID '):
                ids = []
                ids.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        ids.append(next_line.strip())
                    else:
                        break
                paper['ID'] = ' '.join(ids)

            elif line.startswith('AB '):
                abs = []
                abs.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        abs.append(next_line.strip())
                    else:
                        break
                paper['AB'] = ' '.join(abs)                                                

            elif line.startswith('C1 '):
                c1s = []
                c1s.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        c1s.append(next_line.strip())
                    else:
                        break
                paper['C1'] = ';'.join(c1s)    

            elif line.startswith('C3 '):
                c3s = []
                c3s.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        c3s.append(next_line.strip())
                    else:
                        break
                paper['C3'] = ' '.join(c3s)        

            elif line.startswith('RP '):
                rps = []
                rps.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        rps.append(next_line.strip())
                    else:
                        break
                paper['RP'] = ' '.join(rps)       

            elif line.startswith('EM '):
                ems = []
                ems.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        ems.append(next_line.strip())
                    else:
                        break
                paper['EM'] = ' '.join(ems)    

            elif line.startswith('FU '):
                fus = []
                fus.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        fus.append(next_line.strip())
                    else:
                        break
                paper['FU'] = ' '.join(fus)        

            elif line.startswith('FX '):
                fxs = []
                fxs.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        fxs.append(next_line.strip())
                    else:
                        break
                paper['FX'] = ' '.join(fxs)         

            elif line.startswith('CR '):
                crs = []
                crs.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        crs.append(next_line.strip())
                    else:
                        break
                paper['CR'] = ';'.join(crs)   

            elif line.startswith('NR '):
                paper['NR'] = line[3:]

            elif line.startswith('TC '):
                paper['TC'] = line[3:]

            elif line.startswith('Z9 '):
                paper['Z9'] = line[3:]   

            elif line.startswith('U1 '):
                paper['U1'] = line[3:]

            elif line.startswith('U2 '):
                paper['U2'] = line[3:]   

            elif line.startswith('PU '):
                paper['PU'] = line[3:]

            elif line.startswith('PI '):
                paper['PI'] = line[3:]              

            elif line.startswith('PA '):
                paper['PA'] = line[3:] 

            elif line.startswith('SN '):
                paper['SN'] = line[3:]    

            elif line.startswith('EI '):
                paper['EI'] = line[3:]        

            elif line.startswith('J9 '):
                paper['J9'] = line[3:]      

            elif line.startswith('JI '):
                paper['JI'] = line[3:]      

            elif line.startswith('PD '):
                paper['PD'] = line[3:]      

            elif line.startswith('PY '):
                paper['PY'] = line[3:]   

            elif line.startswith('VL '):
                paper['VL'] = line[3:] 

            elif line.startswith('IS '):
                paper['IS'] = line[3:]    

            elif line.startswith('BP '):
                paper['BP'] = line[3:]   

            elif line.startswith('EP '):
                paper['EP'] = line[3:]   

            elif line.startswith('AR '):
                paper['AR'] = line[3:]  

            elif line.startswith('DI '):
                paper['DI'] = line[3:]  

            elif line.startswith('EA '):
                paper['EA'] = line[3:] 

            elif line.startswith('PG '):
                paper['PG'] = line[3:] 

            elif line.startswith('WC '):
                wcs = []
                wcs.append(line[3:])
                # Read all subsequent AF lines until the next field starts
                for next_line in lines[lines.index(line) + 1:]:
                    if next_line.startswith((' ', '\t')):
                        wcs.append(next_line.strip())
                    else:
                        break
                paper['WC'] = ' '.join(wcs)  

            elif line.startswith('WE '):
                paper['WE'] = line[3:]          

            elif line.startswith('SC '):
                paper['SC'] = line[3:]  

            elif line.startswith('GA '):
                paper['GA'] = line[3:]  

            elif line.startswith('UT '):
                paper['UT'] = line[3:]  

            elif line.startswith('PM '):
                paper['PM'] = line[3:]  

            elif line.startswith('OA '):
                paper['OA'] = line[3:]    

            elif line.startswith('DA '):
                paper['DA'] = line[3:]  

        papers.append(paper)
    
    return papers

In [3]:
# Parsing wos .txt files in batches
def parse_wos_txt_batches(os_path): 
    # (os_path is exactly the path to the folder containing all WoS .txt files)
    # read the os path
    fn_exs = os.listdir(os_path)

    # Spliter filename and extension
    fns = []
    for fn_ex in fn_exs:
        fn, ex = os.path.splitext(fn_ex)
        if ex == '.txt':
            fns.append(fn)

    # Get all the papers for each author as dict
    papers = []
    for fn in fns:
        fn_txt_path = f'{os_path}\{fn}.txt'
        papers += parse_wos_txt(fn_txt_path)

    return papers

In [4]:
# Try the test data
os_path = 'original_data\Citation Laureates_test'
papers = parse_wos_txt_batches(os_path)

Progress: 100%|██████████| 588/588 [00:00<00:00, 1577.20it/s]
Progress: 100%|██████████| 432/432 [00:00<00:00, 2215.52it/s]
Progress: 100%|██████████| 408/408 [00:00<00:00, 2199.26it/s]
Progress: 100%|██████████| 174/174 [00:00<00:00, 2220.23it/s]
Progress: 100%|██████████| 259/259 [00:00<00:00, 1686.21it/s]


In [5]:
len(papers)

1861

In [6]:
papers[0]

{'PT': 'J',
 'AU': "O'Connell, RP; Liaw, K; Wellhausen, N; Chuckran, CA; Bhojnagarwala, PS; Bordoloi, D; Park, D; Shupin, N; Kulp, D; June, CH; Weiner, D",
 'AF': "O'Connell, Ryan P.; Liaw, Kevin; Wellhausen, Nils; Chuckran, Christopher A.; Bhojnagarwala, Pratik S.; Bordoloi, Devivasha; Park, Daniel; Shupin, Nicholas; Kulp, Daniel; June, Carl H.; Weiner, David",
 'TI': 'Format-tuning of in vivo-launched bispecific T cell engager enhances efficacy against renal cell carcinoma',
 'SO': 'JOURNAL FOR IMMUNOTHERAPY OF CANCER',
 'LA': 'English',
 'DT': 'Article',
 'DE': 'Renal Cell Carcinoma; Antibody; Bispecific T cell engager - BiTE; Kidney Cancer',
 'ID': 'CARBONIC-ANHYDRASE-IX; ANTIBODY; EXPRESSION; DELIVERY',
 'AB': "Background Advanced clear cell renal cell carcinoma (ccRCC) is a prevalent kidney cancer for which long-term survival rates are abysmal, though immunotherapies are showing potential. Not yet clinically vetted are bispecific T cell engagers (BTEs) that activate T cell-mediat

In [24]:
# Save dict to json
with open('wos_data.json', 'w') as f:
    json.dump(papers, f)

In [None]:
# Read dict from json
with open('wos_data.json', 'r') as f:
    wos_data = json.load(f)