In [None]:
!pip install selenium
!apt-get update
!apt install -y chromium-chromedriver
!pip install webdriver-manager

In [None]:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd
import re

In [None]:
def chrome(url):
  """Load ``url`` in a headless Chrome session and return the parsed page.

  A new browser is started on every call and is always shut down before the
  function returns (also when loading fails), so repeated calls do not pile
  up Chrome/chromedriver processes.

  Args:
    url: Address of the page to open.

  Returns:
    A BeautifulSoup tree built from the page source with ``html.parser``.
  """
  options = webdriver.ChromeOptions()
  options.add_argument('--headless')
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-dev-shm-usage')
  browser = webdriver.Chrome(options=options)
  try:
    browser.get(str(url))
    # Short pause kept from the original before reading the source;
    # presumably lets client-side rendering settle — TODO confirm 0.1 s is enough.
    time.sleep(0.1)
    page_source = browser.page_source
  finally:
    # Without quit() each call leaves a browser process running.
    browser.quit()

  return BeautifulSoup(page_source, 'html.parser')

In [None]:
def get_speciality_links():
  """Collect the speciality tabs of the Moscow doctors page.

  Tabs that lack a link, an ``href`` or a readable doctor counter are skipped
  instead of aborting the whole scrape.

  Returns:
    A list of ``[count, url]`` pairs, where ``count`` is the integer shown in
    the tab's counter and ``url`` is the absolute address of the speciality
    listing.
  """
  soup = chrome('https://prodoctorov.ru/moskva/vrach')

  count_class = "p-doctors-list-page__tab-item-count p-doctors-list-page__tab-item-count_bg_none p-doctors-list-page__tab-item-count_subtitle-secondary"
  specialties = []

  for item in soup.find_all('li', class_='p-doctors-list-page__tab-item'):
    link = item.find('a')
    count_tag = item.find('span', class_=count_class)
    if link is None or count_tag is None:
      continue

    href = link.get('href')
    # Keep digits only: the counter uses non-breaking spaces (and possibly
    # other separators) between thousands groups.
    digits = re.sub(r'\D', '', count_tag.text)
    if not href or not digits:
      continue

    specialties.append([int(digits), 'https://prodoctorov.ru' + href])

  return specialties

In [None]:
def get_links_of_doctors_from_site(specialties):
  """Walk the paginated listing of each speciality and gather doctor profile URLs.

  Args:
    specialties: Iterable of ``[count, url]`` pairs (the shape produced by
      ``get_speciality_links``); ``count`` is the number of doctors expected
      for the speciality and ``url`` the address of its listing.

  Returns:
    A flat list of absolute doctor profile URLs, in the order the pages were
    visited. Links may exceed ``count`` by the overshoot of the last page.
  """
  base_url = 'https://prodoctorov.ru'
  links_from_all_pages = []

  for speciality in specialties:
    expected = int(speciality[0])
    listing_url = speciality[1]
    num = 0
    page = 0
    while num < expected:
      page += 1
      soup = chrome(listing_url + f'?page={page}')

      doctor_cards = soup.find_all('a', class_='b-doctor-card__name-link text-wrap')
      hrefs = [card.get('href') for card in doctor_cards]
      # Cards without an href cannot be turned into a URL.
      full_doctor_links = [base_url + href for href in hrefs if href]

      if not full_doctor_links:
        # The listing ran out (or the page failed to render) before the
        # advertised count was reached; stop instead of paging forever.
        break

      links_from_all_pages += full_doctor_links
      num += len(full_doctor_links)

  return links_from_all_pages

In [None]:
def get_clinics(clinics):
  """Turn clinic card elements into plain dictionaries.

  Args:
    clinics: Iterable of parsed clinic card elements exposing ``find``.

  Returns:
    One dict per card with the keys ``name``, ``address`` and ``metro``;
    a field whose element is not found is set to ``'No value'``.
  """
  def stripped_text(tag):
    # Text of the element, or the placeholder when the lookup found nothing.
    return tag.get_text(strip=True) if tag else 'No value'

  return [
      {
          'name': stripped_text(
              card.find('a', class_="text-subtitle-1 primary--text text-decoration-none")),
          'address': stripped_text(
              card.find('div', class_="d-flex align-center text-body-1 primary--text py-2 cursor-pointer mt-4")),
          'metro': stripped_text(
              card.find('span', {'data-qa': 'metro_name'})),
      }
      for card in clinics
  ]

In [None]:
def get_reviews(reviews):
  """Turn review card elements into plain dictionaries.

  Args:
    reviews: Iterable of parsed review elements exposing ``find``.

  Returns:
    One dict per review with the keys ``rate`` (float), ``date`` (value of
    the date element's ``content`` attribute), ``comment`` and ``clinic``;
    a field whose element is not found is set to ``'No value'``.
  """
  def stripped_text(tag):
    return tag.get_text(strip=True)

  collected = []
  for card in reviews:
    entry = {}

    rate_tag = card.find('span', class_="text-subtitle-2 text--text ml-1")
    if rate_tag:
      entry['rate'] = float(stripped_text(rate_tag))
    else:
      entry['rate'] = 'No value'

    date_tag = card.find('div', class_="text-body-2 text-secondary--text mb-5")
    if date_tag:
      # The date is taken from the attribute, not from the visible text.
      entry['date'] = date_tag.get('content')
    else:
      entry['date'] = 'No value'

    comment_tag = card.find('div', class_="b-review-card__comment text-body-1 text--text mt-2")
    if comment_tag:
      entry['comment'] = stripped_text(comment_tag)
    else:
      entry['comment'] = 'No value'

    clinic_tag = card.find('div', class_="b-review-card__address")
    if clinic_tag:
      entry['clinic'] = stripped_text(clinic_tag)
    else:
      entry['clinic'] = 'No value'

    collected.append(entry)
  return collected


In [None]:
def _safe_float(text):
    """Convert ``text`` to float; return 'No value' when it is not a number."""
    try:
        return float(text)
    except ValueError:
        return 'No value'


def get_doctors_info(link):
    """Scrape one doctor's profile page into a dictionary.

    Every field that cannot be found or parsed is set to ``'No value'`` so
    that a single incomplete page does not abort a long scraping loop.

    Args:
        link: Absolute URL of the doctor's profile page.

    Returns:
        A dict with the keys ``name``, ``link``, ``speciality`` (lower-cased,
        comma-joined), ``clinics`` (list from ``get_clinics``), ``price``
        (float), ``experience`` (int), ``rating`` (float), ``review_count``
        (float), ``reviews`` (list from ``get_reviews``), ``is_kids`` and
        ``is_adults`` (bool). Both flags are ``False`` when the page has no
        speciality block.
    """
    soup = chrome(link)
    doctor_info = {}

    name = soup.find('span', class_="d-block text-h5 text--text mb-2")
    doctor_info['name'] = name.get_text(strip=True) if name else 'No value'

    doctor_info['link'] = link

    speciality = soup.find('div', class_='b-doctor-intro__specs')
    if speciality:
        raw_parts = speciality.get_text(separator=',').split(',')
        spec = ','.join(part.strip() for part in raw_parts if part.strip()).lower()
        doctor_info['speciality'] = spec
    else:
        # The text must only be read after the None check.
        spec = None
        doctor_info['speciality'] = 'No value'

    clinics = soup.find_all('div', class_="doctor-page-list-lpu pa-6")
    doctor_info['clinics'] = get_clinics(clinics) if clinics else 'No value'

    price = soup.find('div', class_='text-h6')
    price_match = None
    if price:
        # Join thousands groups ("1 500" / "1\xa0500") before taking the first
        # number; otherwise only the leading group ("1") would be captured.
        price_text = re.sub(r'(?<=\d)\s(?=\d{3}(?!\d))', '', price.get_text(strip=True))
        price_match = re.search(r'\d+', price_text)
    doctor_info['price'] = float(price_match.group()) if price_match else 'No value'

    experience = soup.find('div', class_='text-subtitle-1')
    exp_match = re.search(r'\d+', experience.get_text(strip=True)) if experience else None
    doctor_info['experience'] = int(exp_match.group()) if exp_match else 'No value'

    rating = soup.find('div', class_='text-h5')
    doctor_info['rating'] = _safe_float(rating.text.strip()) if rating else 'No value'

    review_count = soup.find('span', class_='ml-2')
    # The count is the first whitespace-separated token of the element text.
    count_tokens = review_count.text.split() if review_count else []
    doctor_info['review_count'] = _safe_float(count_tokens[0]) if count_tokens else 'No value'

    reviews = soup.find_all('div', {"itemprop":"review"})
    doctor_info['reviews'] = get_reviews(reviews) if reviews else 'No value'

    if spec is None:
        doctor_info['is_kids'] = False
        doctor_info['is_adults'] = False
    else:
        # Paediatric if any speciality mentions 'детский'; adult if at least
        # one speciality does not.
        doctor_info['is_kids'] = 'детский' in spec
        doctor_info['is_adults'] = any('детский' not in part for part in spec.split(','))

    return doctor_info




In [None]:
# Gather the profile links of adult and paediatric cardiologists; each entry is
# [expected number of doctors, listing URL]. The links are then stored as one
# comma-separated line so later cells can reload them without re-scraping.
kardiolog = get_links_of_doctors_from_site(
    [
        [3343, 'https://prodoctorov.ru/moskva/kardiolog/'],
        [352, 'https://prodoctorov.ru/moskva/detskiy-kardiolog/'],
    ]
)
kardiologs = ','.join(kardiolog)
with open('kardiologs.txt', 'w') as f:
  f.write(kardiologs)

In [None]:
# Reload the saved links. The path is relative, like the one the links were
# written to, and the file handle is closed as soon as the line is read.
with open('kardiologs.txt') as f:
  # Drop empty entries so an empty file gives [] instead of [''].
  kardiologs = [url for url in f.readline().strip().split(',') if url]
len(kardiologs)

In [None]:
# First batch: profiles 0-999, saved to their own CSV.
data_kardiologs_1 = []

n = 0
for doc in kardiologs[:1000]:
  n+=1
  # Append to this batch's own list; the original appended to
  # data_kardiologs_3, which is undefined on a fresh kernel and left this
  # batch's CSV empty otherwise.
  data_kardiologs_1.append(get_doctors_info(doc))
  print(f'{n}/1000')

df = pd.DataFrame.from_records(data_kardiologs_1)
df.to_csv('data_kardiologs_0_1000.csv', index=False)

In [None]:
# Second batch: profiles 1000-2499, saved to their own CSV.
data_kardiologs_2 = []

n = 0  # stays 0 when the slice is empty
for n, doc in enumerate(kardiologs[1000:2500], start=1):
  data_kardiologs_2.append(get_doctors_info(doc))
  print(f'{n}/1500')

df = pd.DataFrame.from_records(data_kardiologs_2)
df.to_csv('data_kardiologs_1000_2500.csv', index=False)

In [None]:
# Third batch: every profile from index 2500 onwards, saved to its own CSV.
data_kardiologs_3 = []

n = 0  # stays 0 when the slice is empty
for n, doc in enumerate(kardiologs[2500:], start=1):
  data_kardiologs_3.append(get_doctors_info(doc))
  print(f'{n}/1195')

df = pd.DataFrame.from_records(data_kardiologs_3)
df.to_csv('data_kardiologs_2500_3695.csv', index=False)