<a href="https://colab.research.google.com/github/tsantosh7/Unsupervised-Supervised-Protein-Genes-Diseases-Organisms-Extraction/blob/master/Notebooks/post_EuropePMC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests

def post_query_to_europepmc():
    """Run a sample Europe PMC ``searchPOST`` query and print the hit titles.

    Posts a fixed two-identifier query to the Europe PMC REST search
    endpoint and prints the title of every article in the JSON response.
    Network, HTTP-status and JSON-decoding failures are reported on stdout
    instead of being raised.

    Returns:
        None. All output goes to stdout.
    """
    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST"
    query_params = {
        "query": "ext_id:'21342591' OR ext_id:'15684057'",
        "resultType": "lite",
        "pageSize": 10,
        "format": "json",
    }

    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.post(url, data=query_params, timeout=30)
        response.raise_for_status()  # Turn 4xx/5xx answers into an exception
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decoding error is not a RequestException subclass.
        print(f"Error occurred: {e}")
        return

    # A response without hits or without the expected keys must not crash
    # the loop, so fall back to an empty result list.
    results = data.get("resultList", {}).get("result", [])
    for article in results:
        print(article.get("title", "<no title>"))

# Run the sample query only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    post_query_to_europepmc()

In [None]:
# Europe PMC REST search endpoint that takes the query as POST form data.
url = "https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST"

# Form fields for the search: two ext_id terms and one PMC term OR-ed
# together, up to 100 hits per page, JSON output.
# The optional "resultType": "core" field is left disabled.
query_params = dict(
    query='ext_id:21342591 OR ext_id:15684057 OR PMC:10034906',
    pageSize=100,
    format="json",
)

In [None]:
# Send the search as POST form data. The timeout stops the cell from hanging
# on an unresponsive server, and raise_for_status() surfaces HTTP errors here
# rather than as a confusing failure when the body is decoded later.
response = requests.post(url, data=query_params, timeout=30)
response.raise_for_status()

In [None]:
# Decode the HTTP response body as JSON. As the final expression of the cell,
# its value is what the notebook displays as the cell output.
response.json()

{'version': '6.8',
 'hitCount': 3,
 'request': {'queryString': 'ext_id:21342591 OR ext_id:15684057 OR PMC:10034906',
  'resultType': 'lite',
  'cursorMark': '*',
  'pageSize': 100,
  'sort': '',
  'synonym': False},
 'resultList': {'result': [{'id': '36968214',
    'source': 'MED',
    'pmid': '36968214',
    'pmcid': 'PMC10034906',
    'fullTextIdList': {'fullTextId': ['PMC10034906']},
    'doi': '10.1007/s10664-023-10289-9',
    'title': 'Visualising data science workflows to support third-party notebook comprehension: an empirical study.',
    'authorString': 'Ramasamy D, Sarasua C, Bacchelli A, Bernstein A.',
    'journalTitle': 'Empir Softw Eng',
    'issue': '3',
    'journalVolume': '28',
    'pubYear': '2023',
    'journalIssn': '1382-3256; 1573-7616; ',
    'pageInfo': '58',
    'pubType': 'research-article; journal article',
    'isOpenAccess': 'Y',
    'inEPMC': 'Y',
    'inPMC': 'N',
    'hasPDF': 'Y',
    'hasBook': 'N',
    'hasSuppl': 'N',
    'citedByCount': 0,
    'has

In [None]:
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
import argparse
import requests
import json

In [None]:
# Code to extract meta-data from Europe PMC APIs
# (c) EMBL-EBI, June 2023
#
# Started: 9 September 2020
# Updated: 27 July  2023
# _author_ = 'Santosh Tirunagari'
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
import argparse
import requests
import json

# API URL and parameters


def _read_article_ids(each_file_path):
    """Return the non-empty 'pmid' values found in a JSON-lines file, as strings."""
    article_ids = []
    with open(each_file_path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not JSON records.
                continue
            pmid = json.loads(line).get('pmid')
            if pmid:
                # Normalise to str so numeric ids work with startswith() below.
                article_ids.append(str(pmid))
    return article_ids


def _build_europepmc_query(id_chunk):
    """Return an OR-joined Europe PMC query string for the ids in *id_chunk*."""
    terms = []
    for each_id in id_chunk:
        if each_id.startswith('PMC'):
            # Ids carrying the 'PMC' prefix are queried through the PMC field
            # with the prefix removed.
            terms.append('PMC:' + each_id[3:])
        else:
            terms.append('ext_id:' + each_id)
    return ' OR '.join(terms)


def _extract_metadata(result):
    """Return the metadata fields kept for one search hit ('' when absent)."""
    fields = ('pmid', 'pmcid', 'doi', 'title', 'journalTitle')
    return {field: result.get(field, '') for field in fields}


def process_each_file_in_job_per_article(each_file_path, result_path):
    """Fetch Europe PMC metadata for every article id listed in a JSONL file.

    Reads *each_file_path* line by line, collects the non-empty ``pmid``
    value of each JSON record, queries the Europe PMC ``searchPOST``
    endpoint in batches of 100 ids, and writes one JSON object per returned
    hit (``pmid``, ``pmcid``, ``doi``, ``title``, ``journalTitle``) to
    ``<result_path>/meta_<input file name>``.

    Args:
        each_file_path: Path of the input JSON-lines file.
        result_path: Existing folder that receives the output file.

    Raises:
        FileNotFoundError: If *result_path* is not an existing folder, or
            the input file cannot be found.
        requests.exceptions.RequestException: If a request fails, times out
            or returns an HTTP error status.
    """
    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/searchPOST"
    # Batch size and the requested page size are kept equal, as in the
    # original single literal, so both come from one variable.
    chunk_size = 100

    # Fail before any network traffic rather than after all requests are done.
    if not os.path.isdir(result_path):
        raise FileNotFoundError(f"Output folder does not exist: {result_path}")

    file_name = 'meta_' + os.path.basename(each_file_path)

    all_list = _read_article_ids(each_file_path)
    chunked_lists = [all_list[i:i + chunk_size]
                     for i in range(0, len(all_list), chunk_size)]

    # Collected metadata records, in the order the service returns them.
    extracted_data_list = []

    for chunk in tqdm(chunked_lists):
        query_params = {
            "query": _build_europepmc_query(chunk),
            "pageSize": chunk_size,
            "format": "json",
        }
        # The timeout prevents one stalled request from blocking the job;
        # raise_for_status() stops on HTTP errors instead of parsing an
        # error page as if it were a result.
        response = requests.post(url, data=query_params, timeout=60)
        response.raise_for_status()
        meta_result_json = response.json()

        # A response with no hits may lack the result list entirely.
        results = meta_result_json.get('resultList', {}).get('result', [])
        extracted_data_list.extend(_extract_metadata(result) for result in results)

    # Write the extracted data as a JSONL file
    with open(os.path.join(result_path, file_name), 'w') as file:
        for extracted_data in extracted_data_list:
            file.write(json.dumps(extracted_data) + '\n')

if __name__ == "__main__":
    # Command-line entry point: takes one input JSON-lines file and one
    # output folder, then runs the metadata extraction for that file.
    # The former --lookup and --document options are disabled and not parsed.
    parser = argparse.ArgumentParser(
        description='This script will process patch jsonlines to extract meta data from Europe PMC APIs'
    )
    parser.add_argument(
        "-f", "--file",
        nargs=1,
        required=True,
        metavar="PATH",
        help="OTAR New Pipeline GP DS CD extractor to Jsonl format",
    )
    parser.add_argument(
        "-o", "--out",
        nargs=1,
        required=True,
        metavar="PATH",
        help="output folder",
    )

    args = parser.parse_args()

    # nargs=1 stores each option as a one-element list; unpack the value.
    (input_path,) = args.file
    (output_dir,) = args.out

    process_each_file_in_job_per_article(input_path, output_dir)
    print(f"{input_path} : text extraction finished!")

