## Installations and Imports

In [1]:
! pip install requests -q
! pip install html5lib -q
! pip install bs4 -q
! pip install pdfminer.six -q
! pip install tiktoken -q
! pip install fake-useragent -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.8/125.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [59]:
import requests
from bs4 import BeautifulSoup
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import pandas as pd
import matplotlib.pyplot as plt
import chardet
from io import BytesIO
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.high_level import extract_text
import tiktoken
from fake_useragent import UserAgent
import copy
from typing import List, Dict
import random
from google.colab import files
from threading import Semaphore
import re

## Getting the List of Medical Journals

In [52]:
# Endpoint and query string for the TurkMedline journal list. The search keyword
# is left empty (presumably so that every journal is returned -- TODO confirm).
base_url = "https://turkmedline.net/dergi-listesi"
params = dict(searchkw="", action="search")

In [8]:
# Download the journal index page. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops an HTTP error page from being
# parsed as if it were the journal list (which would silently yield no links).
res = requests.get(base_url, params=params, timeout=10)
res.raise_for_status()
soup = BeautifulSoup(res.content, "xml")

In [None]:
# Select the elements whose class attribute carries the journal-link classes.
classes = soup.find_all(attrs={"class": "text-decoration-none indeks-dergi-link"})

In [43]:
# Build the per-journal path suffix for every matched link element.
links = []
for cls in classes:
  link = cls.get("href")
  if link is None:
    # The element matched the class filter but has no href; nothing to follow.
    continue
  # Drop the "dergi-listesi" segment: base_url already ends with it and the two
  # strings are concatenated when the journal pages are fetched.
  links.append(link.replace("dergi-listesi", ""))

In [70]:
def get_dergipark_sets(base_url, links):
  """Resolve journal pages to their DergiPark URLs.

  For every entry in ``links`` the page at ``base_url + link`` is downloaded and
  the ``href`` of its first ``<a>`` element whose href contains "dergipark" is
  collected. Pages that cannot be fetched or parsed, and pages without such an
  anchor, are skipped.

  Args:
    base_url: URL prefix prepended to each link.
    links: Iterable of path suffixes, one per journal page.

  Returns:
    List of DergiPark hrefs, in the order of the pages that yielded one.
  """
  sets = []
  for link in links:
    full_url = base_url + link
    try:
      # Bounded wait so one unresponsive page cannot stall the whole crawl.
      res = requests.get(full_url, timeout=10)
      soup = BeautifulSoup(res.content, "html")
      anchor = soup.find("a", href=re.compile("dergipark"))
    except Exception as e:
      # Best-effort crawl: report and move on. Catching Exception (not a bare
      # except) keeps Ctrl-C / kernel interrupts working.
      print(f"Skipping {full_url}: {e}")
      continue
    if anchor is None:
      # Journal page has no DergiPark link.
      continue
    sets.append(anchor.get("href"))
  return sets

# Visit every journal page and collect its DergiPark URL (one HTTP request per link).
sets = get_dergipark_sets(base_url, links)

In [73]:
def get_set_names(sets):
  """Extract the DergiPark set name from each journal URL.

  The set name is the last non-empty path segment of the URL. Surrounding
  whitespace and trailing slashes are removed first, so
  ``".../pub/xyz/"`` yields ``"xyz"`` rather than an empty string.

  Args:
    sets: Iterable of DergiPark journal URLs (strings).

  Returns:
    List of set names, one per input URL, in input order.
  """
  return [s.strip().rstrip("/").rsplit("/", 1)[-1] for s in sets]

# Reduce each DergiPark URL to its final path segment, which serves as the set name.
set_names = get_set_names(sets)

In [79]:
# Serialize the set names to a JSON string and write it to med_set_names.json.
json_obj = json.dumps(set_names)
with open("med_set_names.json", mode="w") as outfile:
  outfile.write(json_obj)

## Fetch Article Links

In [87]:
# Accumulates one dict per qualifying article across all fetch_record() calls.
tur_records = []


def _parse_year(date_tag):
    """Return the leading year of a dc:date element as an int, or 0 if unavailable."""
    if date_tag is None:
        return 0
    year_text = date_tag.text.strip().split("-")[0]
    try:
        return int(year_text)
    except ValueError:
        # Empty or free-form dates cannot be compared with the cut-off year.
        return 0


def fetch_record(set_name, max_retries=5, retry_delay=5):
    """Harvest Turkish articles from 2020 onwards for one DergiPark OAI-PMH set.

    Pages through the ``ListRecords`` response for ``set_name``, following
    resumption tokens until none is returned. A record is appended to the
    module-level ``tur_records`` list when its ``dc:language`` is "tur", its
    ``dc:date`` year is at least 2020, and it has both a ``dc:relation`` and a
    ``dc:title`` element.

    Args:
        set_name: OAI set identifier, sent as the ``set`` query parameter.
        max_retries: Number of consecutive failed requests that are retried
            before the set is abandoned.
        retry_delay: Seconds to sleep before each retry.

    Returns:
        None. Results are appended to ``tur_records``.
    """
    base_url = "https://dergipark.org.tr/api/public/oai/"
    params = {
        "verb": "ListRecords",
        "metadataPrefix": "oai_dc",
        "set": set_name
    }
    failures = 0  # consecutive failed requests for the current page

    with requests.Session() as session:
        session.headers.update({'User-agent': 'your bot 0.1'})
        while True:
            try:
                # The timeout only bounds how long a stalled request may block;
                # it does not throttle the request rate.
                res = session.get(base_url, params=params, timeout=10)
                res.raise_for_status()
            except requests.exceptions.RequestException as e:
                failures += 1
                print(f"Error fetching set {set_name}: {e}")
                if failures > max_retries:
                    # A permanently failing set must not spin forever.
                    print(f"Giving up on set {set_name} after {max_retries} retries")
                    return
                time.sleep(retry_delay)  # back off, then retry the same page
                continue
            failures = 0

            soup = BeautifulSoup(res.content, "xml")
            for record in soup.find_all("record"):
                lang = record.find("dc:language")
                if lang is None or lang.text.strip() != "tur":
                    continue
                relation = record.find("dc:relation")
                title = record.find("dc:title")
                year = _parse_year(record.find("dc:date"))
                if year < 2020 or relation is None or title is None:
                    continue
                identifier = record.find("dc:identifier")
                tur_records.append({
                    "set_name": set_name,
                    "year": year,
                    "title": title.text.strip(),
                    "dergipark_url": identifier.text.strip() if identifier is not None else None,
                    "pdf_url": relation.text.strip(),
                })

            resumption_token = soup.find("resumptionToken")
            token = resumption_token.text.strip() if resumption_token is not None else ""
            if not token:
                break
            # Follow-up pages are requested with the resumption token alone.
            params = {
                "verb": "ListRecords",
                "resumptionToken": token
            }

    try:
        position = set_names.index(set_name)
    except (NameError, ValueError):
        # set_names is a notebook global; it may be undefined or lack this set.
        position = "?"
    print(f"Finished fetching journal #{position}: {set_name}")

In [None]:
# Harvest the journals one after another; fetch_record appends its results to tur_records.
for st in set_names:
  fetch_record(st)

In [96]:
# Turn the harvested record dicts into a table (default integer index) and
# persist it as a pickle file.
df = pd.DataFrame.from_records(tur_records)
df.to_pickle("tur_records_links.pkl")

## Scraping the Text from Records