In [None]:
! pip install requests -q
! pip install html5lib -q
! pip install bs4 -q
! pip install tiktoken -q

In [None]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import pandas as pd
import tiktoken
from google.colab import files

Some info about the website
- has a total of 378 pages (as of 22 January 2025)
- each page has exactly 12 posts, so the expected total is ≈4500 (after filtering)
- each article has its own category, shown on its own page, so category-level pagination won't be a problem (though you can browse by category on the website); we just scrape the category name and the text content at the same time

# Getting Post URLs

In [None]:
# Site root; the hrefs collected from the listing are appended to this.
base_url = "https://www.medicalpark.com.tr"
# Paginated health-guide listing that the post links are scraped from.
guide_url = f"{base_url}/saglik-rehberi"

In [None]:
# Exploratory fetch and parse of the site root.
# NOTE(review): the module-level `res` / `soup` created here are not read by any
# later cell visible in this notebook; the scraping functions build their own.
res = requests.get(base_url)
soup = BeautifulSoup(res.content, "html.parser")

In [None]:
# Shared accumulator: every worker thread appends its {"topic", "href"} records here.
links = []

def get_urls(i, base_url):
  """Collect the post links of one listing page into the global `links`.

  Args:
    i: Zero-based page index; the site is queried with `pg = i + 1`.
    base_url: URL of the paginated listing.

  Raises:
    requests.RequestException: on timeout, connection failure or a
      non-2xx response (via `raise_for_status`).

  Side effects:
    Appends one {"topic": ..., "href": ...} dict per post tile to `links`
    and prints a progress line for each.
  """
  param = {
      "pg": i+1
  }
  # A timeout keeps one stalled connection from blocking a worker forever.
  res = requests.get(base_url, params=param, timeout=30)
  # Fail loudly on HTTP errors instead of parsing an error page as if it were a listing.
  res.raise_for_status()
  soup = BeautifulSoup(res.content, "html.parser")
  divs = soup.find_all("div", {"class": "col-md-3 no-padding"})
  for div in divs:
    a = div.find("a")
    # Skip tiles without a usable link rather than aborting the rest of the page.
    if a is None or not a.get("href"):
      continue
    topic = a.text.strip()
    href = a["href"]
    rec = {
        "topic": topic,
        "href": href
    }
    links.append(rec)
    print(f"link got successfully for {topic}")

In [None]:
def get_all_urls(num_workers, len_page, base_url):
  """Run `get_urls` for page indices 0 .. len_page-1 on a thread pool.

  Args:
    num_workers: Size of the thread pool.
    len_page: Number of listing pages to request.
    base_url: Listing URL handed to every `get_urls` call.

  An exception raised by a page job is printed and does not stop the
  remaining jobs.
  """
  with ThreadPoolExecutor(max_workers=num_workers) as pool:
    pending = []
    for page_index in range(len_page):
      pending.append(pool.submit(get_urls, page_index, base_url))
    # Handle jobs in completion order so failures are reported as they happen.
    for finished in as_completed(pending):
      try:
        finished.result()
      except Exception as err:
        print(f"Thread Error: {err}")

In [None]:
# 10 worker threads over 378 listing pages (the page count noted in the intro above).
# NOTE: get_urls appends to the global `links`, so re-running this cell without
# first re-running the `links = []` cell adds every record a second time.
get_all_urls(10, 378, guide_url)

In [None]:
len(links)    # all the records are fetched

4536

# Filtering Data


**By Link** : <br/>
It was seen that some links contain location names, which indicates that the corresponding post may be an advertisement for a hospital at that location. To keep the data unbiased, those links are dropped.

In [None]:
locations = ["Adana", "Ankara", "Antalya", "Bursa", "Canakkale", "Kocaeli", "Mersin", "Ordu", "Samsun", "Tokat", "İstanbul"]

# str.lower() maps "İ" to "i" + a combining dot (U+0307), so "İstanbul".lower()
# can never match a plain "istanbul" inside an href. Transliterate the Turkish
# letters to ASCII before lowercasing; the plain lowercase form is kept as well,
# so nothing the previous filter dropped is let through.
# NOTE(review): assumes hrefs spell city names as lowercase ASCII slugs — confirm.
_TR_TO_ASCII = str.maketrans("İıŞşĞğÜüÖöÇç", "IiSsGgUuOoCc")
location_slugs = {
    form
    for loc in locations
    for form in (loc.lower(), loc.translate(_TR_TO_ASCII).lower())
}
filtered_links = [link for link in links if not any(slug in link["href"] for slug in location_slugs)]

In [None]:
len(filtered_links)     # 24 records were dropped

4512

**By Topic** : <br/>
Further inspection showed that some article titles contain the word "Paket" (package) or "Hizmet" (service), or one of the location names above, which strongly implies an advertisement.

In [None]:
# Title keywords treated as advertisement markers: two promo words plus every
# location name (more words may be added later). Matching is case-sensitive.
ad_words = ["Paket", "Hizmet", *locations]
# Keep a link only when none of the keywords occurs in its title.
filtered_links = list(
    filter(
        lambda link: all(word not in link["topic"] for word in ad_words),
        filtered_links,
    )
)

In [None]:
len(filtered_links)     # 41 records were dropped

4471

*Conclusion* : 65 records were dropped in total

# Getting Text Data

After trying a sample URL, the relevant texts were found at positions [2 : -3] of the page's `h2`/`p` elements (the last three elements are excluded). <br/>
All pages follow the same template, so this indexing applies to all records.

In [None]:
# Shared accumulator: every worker thread appends its scraped record here.
raw_data = []

def get_raw_html(base_url, record):
  """Fetch one article page and append its raw content to the global `raw_data`.

  Args:
    base_url: Site root that the record's href is appended to.
    record: Dict with the keys "topic" and "href".

  Raises:
    requests.RequestException: on timeout, connection failure or a
      non-2xx response (via `raise_for_status`).
    ValueError: if the page has no category link or no "postContent" div.
      The message names the URL so the failing page can be identified.

  Side effects:
    Appends {"category", "topic", "raw_text"} to `raw_data`, where
    "raw_text" is the list of h2/p elements found inside the content div,
    and prints a progress line.
  """
  topic = record["topic"]
  param = record["href"]
  url = base_url + param
  # A timeout keeps one stalled connection from blocking a worker forever.
  res = requests.get(url, timeout=30)
  # Fail loudly on HTTP errors instead of parsing an error page as an article.
  res.raise_for_status()
  soup = BeautifulSoup(res.content, "html.parser")
  a = soup.find("a", {"id": "ContentPlaceHolder1_navHlthDepartment"})
  div = soup.find("div", {"class": "postContent"})
  # Pages that do not follow the template used to fail with an opaque
  # "'NoneType' object has no attribute ..." error; report which page and why.
  missing = []
  if a is None:
    missing.append("category link")
  if div is None:
    missing.append("postContent div")
  if missing:
    raise ValueError(f"{url} ({topic}): page has no {' / '.join(missing)}")
  text_elements = div.find_all(["h2", "p"])
  category = a.text
  rec = {
      "category": category,
      "topic": topic,
      "raw_text": text_elements
  }
  raw_data.append(rec)
  print(f"successfuly extracted raw data for {topic}")

In [None]:
def get_all_raw_html(max_workers, base_url, records):
  """Run `get_raw_html` for every record on a thread pool.

  Args:
    max_workers: Size of the thread pool.
    base_url: Site root handed to every `get_raw_html` call.
    records: Iterable of link records, one job per record.

  An exception raised by a job is printed and does not stop the
  remaining jobs.
  """
  with ThreadPoolExecutor(max_workers=max_workers) as pool:
    pending = []
    for rec in records:
      pending.append(pool.submit(get_raw_html, base_url, rec))
    # Handle jobs in completion order so failures are reported as they happen.
    for finished in as_completed(pending):
      try:
        finished.result()
      except Exception as err:
        print(f"Thread Error: {err}")

In [None]:
# Fetch every remaining article with 10 worker threads.
# NOTE: get_raw_html appends to the global `raw_data`, so re-running this cell
# without first re-running the `raw_data = []` cell duplicates every record.
get_all_raw_html(10, base_url, filtered_links)

In [None]:
len(raw_data)     # lost 1 record -> Thread Error: 'NoneType' object has no attribute 'find_all'

4470

In [None]:
# Column lists aligned with `raw_data` order. Comprehensions avoid leaving the
# loop variables (`data`, `cat`, `top`) behind in the notebook namespace.
categories = [rec["category"] for rec in raw_data]
topics = [rec["topic"] for rec in raw_data]

In [None]:
def get_text(raw_data):
  """Build one plain-text string per scraped record.

  For each record, elements [2:-3] of its "raw_text" list are taken, each
  element's `.text` is stripped, empty pieces are dropped, and the rest are
  joined with a single space so headings and paragraphs no longer run into
  each other. Each result keeps the single leading space the function has
  always produced.

  Args:
    raw_data: Iterable of dicts whose "raw_text" value is a sliceable
      sequence of objects exposing a `.text` string.

  Returns:
    A list of strings, one per record, in input order. A record with five
    or fewer elements yields " ".
  """
  texts = []
  for d in raw_data:
    # Slicing already produces a new list, so the scraped elements stay untouched.
    body = d["raw_text"][2:-3]
    pieces = [element.text.strip() for element in body]
    texts.append(" " + " ".join(piece for piece in pieces if piece))
  return texts

In [None]:
def num_tokens_from_string(string: str, encoder_name: str) -> int:
    """Return how many tokens `string` encodes to under a tiktoken encoding.

    The encoding is looked up by its encoding name (`tiktoken.get_encoding`),
    not by a model name.
    """
    tokenizer = tiktoken.get_encoding(encoder_name)
    return len(tokenizer.encode(string))

In [None]:
def get_num_tokens(encoder_name, texts):
  """Return the token count of every text, in input order.

  Args:
    encoder_name: Encoding name passed through to `num_tokens_from_string`.
    texts: Iterable of strings to measure.
  """
  return [num_tokens_from_string(text, encoder_name) for text in texts]

In [None]:
texts = get_text(raw_data)

In [None]:
len(texts)    # no record loss

4470

In [None]:
encoder = "o200k_base"
num_tokens = get_num_tokens(encoder, texts)

In [None]:
data = {
    "department": categories,
    "topic": topics,
    "text": texts,
    "num_tokens": num_tokens
}

In [None]:
df = pd.DataFrame(data=data, index=None)

In [None]:
df.head()

Unnamed: 0,department,topic,text,num_tokens
0,Genel Cerrahi,Ülserovejetan Kitle Belirtileri Nelerdir? Ülse...,Vücutta oluşan anormal kitleler insanlar için...,3540
1,Gastroenteroloji,Rektumdan (Makattan) Kan Gelmesi​ Neden Olur?,Birbirinden farklı pek çok rahatsızlık rektum...,4617
2,Dermatoloji (Cildiye),Yatak Böceği Isırığına Ne İyi Gelir?,"Yatak böceği ısırığı, gece hareketli olan yat...",5005
3,Gastroenteroloji,Segmenter Kolon Rezeksiyonu Nedir?,"Segmenter kolon rezeksiyonu; kanser, divertik...",4486
4,Dermatoloji (Cildiye),Vatoz Sokması Anında Ne Yapılmalıdır? Vatoz So...,Denizlerin derinliklerinde yüzmek keyif veric...,3244


In [None]:
df["num_tokens"].sum()    # total number of tokens

19031323

In [None]:
df["department"].value_counts()

Unnamed: 0_level_0,count
department,Unnamed: 1_level_1
Beslenme ve Diyet,948
Kadın Hastalıkları ve Doğum,339
İç Hastalıkları (Dahiliye),318
Dermatoloji (Cildiye),311
Ortopedi ve Travmatoloji,234
...,...
Çocuk Kardiyolojisi,1
Mikrobiyoloji,1
Çocuk Ürolojisi,1
Çocuk Onkolojisi,1


In [None]:
# Persist the scraped table to the working directory.
# to_csv is called with its defaults, so the integer row index is written as an
# unnamed first column of the file.
# JSON export kept for reference; CSV was preferred for its smaller size.
# df.to_json("MedData_P2.json")
df.to_csv("MedData_P2.csv")

# .csv file takes less storage